Artificial Intelligence Lab Work (5)
レポート解答用紙 (Report Answer Sheet)
学生証番号 (Student ID): 22520205
名前(Name): Cao Thành Đạt (Cao Thanh Dat/カオ・タイン・ダット)
問題 1.
(プログラム)
!pip install torchtext==0.17.0
!pip install portalocker
import torch
import [Link] as F
import torchtext
train_iter, test_iter = [Link](split=('train', 'test'))
tokenizer = [Link].get_tokenizer('basic_english')
MODELNAME = "[Link]"
EPOCH = 10
BATCHSIZE = 64
LR = 1e-5
DEVICE = "cuda" if [Link].is_available() else "cpu"
print(DEVICE)
# Tokenize every sample once, then sort by token count so that consecutive
# samples (which end up in the same batch) have similar lengths.
train_data = [(label, tokenizer(line)) for label, line in train_iter]
train_data.sort(key=lambda x: len(x[1]))
test_data = [(label, tokenizer(line)) for label, line in test_iter]
test_data.sort(key=lambda x: len(x[1]))

# Preview the first few samples; slicing is safe even with fewer than 10.
for sample in train_data[:10]:
    print(sample)
def make_vocab(train_data, min_freq):
    """Build the vocabulary from tokenized training samples.

    Args:
        train_data: iterable of ``(label, tokenlist)`` pairs.
        min_freq: minimum number of occurrences a token needs to be kept.

    Returns:
        ``(vocablist, vocabidx)`` where ``vocablist`` is a list of
        ``(token, freq)`` pairs starting with the four special tokens, and
        ``vocabidx`` maps each token to its position in ``vocablist``.
    """
    vocab = {}
    for _label, tokenlist in train_data:
        for token in tokenlist:
            vocab[token] = vocab.get(token, 0) + 1

    vocablist = [('<unk>', 0), ('<pad>', 0), ('<cls>', 0), ('<eos>', 0)]
    vocabidx = {}
    for token, freq in vocab.items():
        if freq >= min_freq:
            # Index by position in vocablist (which already holds the four
            # special tokens) so regular tokens never reuse ids 0-3.
            idx = len(vocablist)
            vocablist.append((token, freq))
            vocabidx[token] = idx

    vocabidx['<unk>'] = 0
    vocabidx['<pad>'] = 1
    vocabidx['<cls>'] = 2
    vocabidx['<eos>'] = 3
    return vocablist, vocabidx
# Build the vocabulary from the training split; tokens seen fewer than
# 10 times are left out of it.
vocablist, vocabidx = make_vocab(train_data, min_freq=10)
def preprocess(data, vocabidx):
    """Wrap each token list in ``<cls>``/``<eos>`` and mask unknown tokens.

    Args:
        data: iterable of ``(label, tokenlist)`` pairs.
        vocabidx: mapping of known tokens; tokens not in it become ``<unk>``.

    Returns:
        A new list of ``(label, tokenlist)`` pairs; the input lists are not
        modified.
    """
    rr = []
    for label, tokenlist in data:
        tkl = ['<cls>']
        for token in tokenlist:
            tkl.append(token if token in vocabidx else '<unk>')
        tkl.append('<eos>')
        rr.append((label, tkl))
    return rr
train_data = preprocess(train_data, vocabidx)
test_data = preprocess(test_data, vocabidx)

# Preview the first few samples; slicing is safe even with fewer than 10.
for sample in train_data[:10]:
    print(sample)
def make_batch(data, batchsize):
    """Group consecutive samples into batches.

    Args:
        data: iterable of ``(label, tokenlist)`` pairs.
        batchsize: number of samples per batch.

    Returns:
        A list of ``(tokenlists, labels)`` tuples. Every batch holds
        ``batchsize`` samples except possibly the last, which holds the
        remainder.
    """
    bb = []
    blabel = []
    btokenlist = []
    for label, tokenlist in data:
        blabel.append(label)
        btokenlist.append(tokenlist)
        if len(blabel) >= batchsize:
            bb.append((btokenlist, blabel))
            blabel = []
            btokenlist = []
    # Flush the final, partially filled batch.
    if blabel:
        bb.append((btokenlist, blabel))
    return bb
train_data = make_batch(train_data, BATCHSIZE)
test_data = make_batch(test_data, BATCHSIZE)

# Preview the first few batches; slicing is safe even with fewer than 10.
for batch in train_data[:10]:
    print(batch)
def padding(bb):
    """Pad every token list in a batch to that batch's longest length.

    The token lists are extended in place with ``'<pad>'``.

    Args:
        bb: list of ``(tokenlists, labels)`` batches.

    Returns:
        The same ``bb`` object, after in-place padding.
    """
    for tokenlists, _labels in bb:
        # default=0 keeps an empty batch from raising ValueError in max().
        maxlen = max((len(tkl) for tkl in tokenlists), default=0)
        for tkl in tokenlists:
            tkl.extend(['<pad>'] * (maxlen - len(tkl)))
    return bb
train_data = padding(train_data)
test_data = padding(test_data)

# Preview the first few batches; slicing is safe even with fewer than 10.
for batch in train_data[:10]:
    print(batch)
def word2id(bb, vocabidx):
    """Convert batches of tokens and labels to integer ids.

    Args:
        bb: list of ``(tokenlists, labels)`` batches.
        vocabidx: mapping from token to integer id; every token in ``bb``
            must be a key.

    Returns:
        A list of ``[id_tokenlists, id_labels]`` pairs. Each label has 1
        subtracted from it (presumably mapping 1/2 dataset labels to 0/1
        class ids; confirm against the dataset).

    Raises:
        KeyError: if a token is missing from ``vocabidx``.
    """
    rr = []
    for tokenlists, labels in bb:
        id_labels = [label - 1 for label in labels]
        id_tokenlists = [
            [vocabidx[token] for token in tokenlist]
            for tokenlist in tokenlists
        ]
        rr.append([id_tokenlists, id_labels])
    return rr
train_data = word2id(train_data, vocabidx)
test_data = word2id(test_data, vocabidx)

# Preview the first few batches; slicing is safe even with fewer than 10.
for batch in train_data[:10]:
    print(batch)
class MyRNN(torch.nn.Module):
    """Simple recurrent classifier over token-id sequences.

    Each step adds the current token embedding to a linear transform of the
    previous hidden state; the final hidden state is projected to 2 scores.
    """

    def __init__(self):
        super(MyRNN, self).__init__()
        vocabsize = len(vocablist)
        # NOTE(review): the attribute name was lost in extraction; `emb` is
        # assumed. It only needs to match between training and loading.
        self.emb = torch.nn.Embedding(
            vocabsize, 300, padding_idx=vocabidx['<pad>'])
        self.l1 = torch.nn.Linear(300, 300)
        self.l2 = torch.nn.Linear(300, 2)

    def forward(self, x):
        """Return class scores for a batch of sequences.

        Args:
            x: integer tensor indexed as ``x[step]`` giving one token id per
                sample (the callers pass a transposed batch).

        Returns:
            Tensor with one row of 2 scores per sample.
        """
        e = self.emb(x)
        # Allocate the initial hidden state directly on the input's device.
        h = torch.zeros(e[0].size(), dtype=torch.float32, device=e.device)
        for i in range(x.size()[0]):
            # NOTE(review): the activation name was lost in extraction;
            # F.relu is assumed. Confirm against the original notebook.
            h = F.relu(e[i] + self.l1(h))
        return self.l2(h)
def train():
    """Train a fresh MyRNN on ``train_data`` and save its weights.

    Runs ``EPOCH`` passes over the pre-batched training data, prints the
    summed batch loss after each pass, and writes the final ``state_dict``
    to ``MODELNAME``.
    """
    model = MyRNN().to(DEVICE)
    # NOTE(review): the optimizer class was lost in extraction; Adam is
    # assumed. Confirm against the original notebook.
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    for epoch in range(EPOCH):
        loss = 0
        for tokenlists, labels in train_data:
            # Batches are stored sample-major; transpose so the first axis
            # is the sequence step that MyRNN.forward iterates over.
            tokenlists = torch.tensor(
                tokenlists, dtype=torch.int64).transpose(0, 1).to(DEVICE)
            labels = torch.tensor(labels, dtype=torch.int64).to(DEVICE)
            optimizer.zero_grad()
            y = model(tokenlists)
            batchloss = F.cross_entropy(y, labels)
            batchloss.backward()
            optimizer.step()
            loss = loss + batchloss.item()
        print("epoch: ", epoch, "loss: ", loss)
    torch.save(model.state_dict(), MODELNAME)
def test():
    """Load the saved weights and print accuracy on ``test_data``.

    Prints the number of correct predictions, the number of samples, and
    their ratio.
    """
    total = 0
    correct = 0
    model = MyRNN().to(DEVICE)
    # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
    model.load_state_dict(torch.load(MODELNAME, map_location=DEVICE))
    model.eval()
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for tokenlists, labels in test_data:
            total += len(labels)
            tokenlists = torch.tensor(
                tokenlists, dtype=torch.int64).transpose(0, 1).to(DEVICE)
            labels = torch.tensor(labels, dtype=torch.int64).to(DEVICE)
            y = model(tokenlists)
            pred_labels = y.max(dim=1)[1]
            # Accumulate as a Python int so an empty test set still prints.
            correct += (pred_labels == labels).sum().item()
    print("correct:", correct)
    print("total:", total)
    print("accuracy: ", (correct / float(total)) if total else 0.0)
# Train the model and save its weights, then reload them and print the
# accuracy on the test batches.
train()
test()
(実行結果)