Lab 5

This lab report presents a Python implementation for training a recurrent neural network (RNN) on the IMDB dataset for sentiment analysis. It covers data preprocessing, vocabulary creation, batching, padding, and model training and testing; performance is evaluated by test-set accuracy.


Artificial Intelligence Lab Work (5)

Report Answer Sheet

Student ID: 22520205


Name: Cao Thành Đạt (Cao Thanh Dat / カオ・タイン・ダット)

Problem 1.
(Program)
!pip install torchtext==0.17.0
!pip install portalocker

import torch
import torch.nn.functional as F
import torchtext

# Load the IMDB movie-review dataset and a simple word-level tokenizer.
train_iter, test_iter = torchtext.datasets.IMDB(split=('train', 'test'))
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
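# Note: 'basic_english' lower-cases the text and splits off punctuation, so a
# raw review becomes a plain list of word tokens (expected output shown as a
# comment):
print(tokenizer("This movie was great!"))  # expected: ['this', 'movie', 'was', 'great', '!']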

MODELNAME = "imdb-rnn.model"
EPOCH = 10
BATCHSIZE = 64
LR = 1e-5
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

# Tokenize every review and sort by length so that reviews of similar length
# end up in the same batch, which keeps the amount of padding small.
train_data = [(label, tokenizer(line)) for label, line in train_iter]
train_data.sort(key=lambda x: len(x[1]))
test_data = [(label, tokenizer(line)) for label, line in test_iter]
test_data.sort(key=lambda x: len(x[1]))

for i in range(10):
    print(train_data[i])
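# Note (assuming torchtext 0.17 behaviour): each element of train_data is a
# (label, token_list) pair whose IMDB label is an integer, 1 for a negative
# and 2 for a positive review; word2id() further below shifts this to 0/1.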

def make_vocab(train_data, min_freq):
    # Count token frequencies over the training data.
    vocab = {}
    for label, tokenlist in train_data:
        for token in tokenlist:
            if token not in vocab:
                vocab[token] = 0
            vocab[token] += 1
    # Reserve the first four ids for the special tokens.
    vocablist = [('<unk>', 0), ('<pad>', 0), ('<cls>', 0), ('<eos>', 0)]
    vocabidx = {}
    for token, freq in vocab.items():
        if freq >= min_freq:
            # Use len(vocablist) so that ids 0-3 stay reserved for the special
            # tokens and regular tokens do not collide with them.
            idx = len(vocablist)
            vocablist.append((token, freq))
            vocabidx[token] = idx
    vocabidx['<unk>'] = 0
    vocabidx['<pad>'] = 1
    vocabidx['<cls>'] = 2
    vocabidx['<eos>'] = 3
    return vocablist, vocabidx

vocablist, vocabidx = make_vocab(train_data, 10)
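# Sanity check (sketch): vocablist holds (token, frequency) pairs and vocabidx
# maps every kept token to its embedding id, with ids 0-3 reserved for the
# special tokens.
print(len(vocablist), vocabidx['<unk>'], vocabidx['<pad>'], vocabidx['<cls>'], vocabidx['<eos>'])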

def preprocess(data, vocabidx):
    # Wrap every review with <cls> ... <eos> and replace out-of-vocabulary
    # tokens with <unk>.
    rr = []
    for label, tokenlist in data:
        tkl = ['<cls>']
        for token in tokenlist:
            tkl.append(token if token in vocabidx else '<unk>')
        tkl.append('<eos>')
        rr.append((label, tkl))
    return rr

train_data = preprocess(train_data, vocabidx)
test_data = preprocess(test_data, vocabidx)

for i in range(10):
    print(train_data[i])
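# Illustrative example (the actual tokens depend on the data): a preprocessed
# sample now has the form
#   (1, ['<cls>', 'i', 'rented', 'this', 'movie', ..., '<eos>'])
# with every out-of-vocabulary word replaced by '<unk>'.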

def make_batch(data, batchsize):
    # Group consecutive samples into batches of (token lists, labels).
    bb = []
    blabel = []
    btokenlist = []
    for label, tokenlist in data:
        blabel.append(label)
        btokenlist.append(tokenlist)
        if len(blabel) >= batchsize:
            bb.append((btokenlist, blabel))
            blabel = []
            btokenlist = []
    if len(blabel) > 0:
        bb.append((btokenlist, blabel))
    return bb

train_data = make_batch(train_data, BATCHSIZE)
test_data = make_batch(test_data, BATCHSIZE)

for i in range(10):
    print(train_data[i])
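# Note: because the reviews were sorted by length before batching, each batch
# groups reviews of similar length. Quick check of the batch layout (sketch):
print(len(train_data), len(train_data[0][0]), len(train_data[0][1]))  # batches, reviews per batch, labels per batch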

def padding(bb):
    # Pad every review in a batch with <pad> up to the longest review in that
    # batch.
    for tokenlists, labels in bb:
        maxlen = max([len(x) for x in tokenlists])
        for tkl in tokenlists:
            for i in range(maxlen - len(tkl)):
                tkl.append('<pad>')
    return bb

train_data = padding(train_data)
test_data = padding(test_data)

for i in range(10):
    print(train_data[i])
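# Sanity check (sketch): after padding, every review inside one batch has the
# same length, so a batch can later be converted into a rectangular tensor.
assert len(set(len(t) for t in train_data[0][0])) == 1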

def word2id(bb, vocabidx):
    # Convert tokens to vocabulary ids and shift labels from {1, 2} to {0, 1}.
    rr = []
    for tokenlists, labels in bb:
        id_labels = [label - 1 for label in labels]
        id_tokenlists = []
        for tokenlist in tokenlists:
            id_tokenlists.append([vocabidx[token] for token in tokenlist])
        rr.append([id_tokenlists, id_labels])
    return rr

train_data = word2id(train_data, vocabidx)
test_data = word2id(test_data, vocabidx)

for i in range(10):
    print(train_data[i])
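# Sanity check (sketch): each batch is now [id_tokenlists, id_labels], where
# id_tokenlists is a rectangular list of ints and id_labels holds 0/1 targets,
# so both convert directly with torch.tensor().
print(torch.tensor(train_data[0][0]).shape, torch.tensor(train_data[0][1]).shape)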

class MyRNN(torch.nn.Module):
    def __init__(self):
        super(MyRNN, self).__init__()
        vocabsize = len(vocablist)
        # 300-dimensional embeddings; the <pad> embedding stays at zero.
        self.emb = torch.nn.Embedding(vocabsize, 300, padding_idx=vocabidx['<pad>'])
        self.l1 = torch.nn.Linear(300, 300)  # hidden-to-hidden (recurrent) layer
        self.l2 = torch.nn.Linear(300, 2)    # output layer, two sentiment classes

    def forward(self, x):
        # x has shape (seq_len, batch); e has shape (seq_len, batch, 300).
        e = self.emb(x)
        h = torch.zeros(e[0].size(), dtype=torch.float32).to(DEVICE)
        for i in range(x.size()[0]):
            h = F.relu(e[i] + self.l1(h))
        return self.l2(h)
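# Note: the loop in forward() implements a simple Elman-style recurrence,
#   h_t = ReLU(E[x_t] + W h_{t-1} + b),
# where E is the embedding matrix and (W, b) are the parameters of self.l1;
# only the final hidden state is passed to self.l2 to produce the class scores.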

def train():
    model = MyRNN().to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    for epoch in range(EPOCH):
        loss = 0
        for tokenlists, labels in train_data:
            # (batch, seq_len) -> (seq_len, batch) so forward() iterates over
            # time steps.
            tokenlists = torch.tensor(tokenlists, dtype=torch.int64).transpose(0, 1).to(DEVICE)
            labels = torch.tensor(labels, dtype=torch.int64).to(DEVICE)
            optimizer.zero_grad()
            y = model(tokenlists)
            batchloss = F.cross_entropy(y, labels)
            batchloss.backward()
            optimizer.step()
            loss = loss + batchloss.item()
        print("epoch:", epoch, "loss:", loss)
    torch.save(model.state_dict(), MODELNAME)
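# Note: the printed loss is the sum of the per-batch cross-entropy values over
# one epoch, and the trained weights are saved to MODELNAME so that test() can
# reload them.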

def test():
    total = 0
    correct = 0
    model = MyRNN().to(DEVICE)
    model.load_state_dict(torch.load(MODELNAME))
    model.eval()
    for tokenlists, labels in test_data:
        total += len(labels)
        tokenlists = torch.tensor(tokenlists, dtype=torch.int64).transpose(0, 1).to(DEVICE)
        labels = torch.tensor(labels, dtype=torch.int64).to(DEVICE)
        y = model(tokenlists)
        pred_labels = y.max(dim=1)[1]  # class with the highest score
        correct += (pred_labels == labels).sum()
    print("correct:", correct.item())
    print("total:", total)
    print("accuracy:", (correct.item() / float(total)))

train()

test()
(Execution results)
