How to fix the embedding vector size in torchtext
What I want to do
The size of TEXT.vocab depends on the vocabulary of the training data, so when I build a new set of embedding vectors for the inference data, the dimensions of the input layer no longer match the trained model.
I would therefore like to fix the dimensions based on the input vector file (model.vec).
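To make the mismatch concrete, here is a minimal sketch of what happens on the inference side (predict_ja.tsv and the surrounding names are hypothetical; everything else matches the training code further down). Rebuilding the vocabulary from different data changes len(TEXT.vocab), and only the second dimension of the embedding matrix is fixed by model.vec:

# minimal sketch of the size mismatch (predict_ja.tsv is hypothetical)
from torchtext import data
from torchtext.vocab import Vectors

TEXT = data.Field(sequential=True, lower=True, include_lengths=True, batch_first=True)
LABEL = data.Field(sequential=False, use_vocab=False)
predict = data.TabularDataset(path='./predict_ja.tsv', format='tsv',
                              fields=[('Text', TEXT), ('Label', LABEL)])
TEXT.build_vocab(predict, vectors=Vectors(name='model.vec'), min_freq=1)

print(len(TEXT.vocab))            # depends on the data the vocab was built from
print(TEXT.vocab.vectors.size())  # (len(TEXT.vocab), 300) -- only the 300 comes from model.vec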
Environment
Google Colaboratory, Python 3, GPU runtime
pytorch 1.1.0
torchtext 0.4.0
Input data
train_ja.tsv (val_ja.tsv and test_ja.tsv have the same format)
あなたをが好きです。 1
私はマイクが嫌いです。 0
私はマキが好きです。 1
ボブが嫌いです。 0
model.vec … pre-trained FastText vectors
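For reference, after loading model.vec with torchtext's Vectors class, the per-word dimension can be read off directly (a small check, assuming the torchtext 0.4.0 API):

from torchtext.vocab import Vectors

vecs = Vectors(name='model.vec')  # loads and caches the FastText vectors
print(vecs.dim)                   # 300: per-word embedding dimension read from model.vec
print(vecs.vectors.size())        # (number of words in model.vec, 300)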
Source code
model.py
# coding:utf-8
import torchtext
from torchtext import data
from torchtext import datasets
from torchtext.vocab import FastText
from torchtext.vocab import Vectors
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
class EncoderRNN(nn.Module):
    def __init__(self, emb_dim, h_dim, v_size, gpu=True, v_vec=None, batch_first=True):
        super(EncoderRNN, self).__init__()
        self.gpu = gpu
        self.h_dim = h_dim
        self.embed = nn.Embedding(v_size, emb_dim)
        if v_vec is not None:
            self.embed.weight.data.copy_(v_vec)
        self.lstm = nn.LSTM(emb_dim, h_dim, batch_first=batch_first,
                            bidirectional=True)

    def init_hidden(self, b_size):
        h0 = Variable(torch.zeros(1*2, b_size, self.h_dim))
        c0 = Variable(torch.zeros(1*2, b_size, self.h_dim))
        if self.gpu:
            h0 = h0.cuda()
            c0 = c0.cuda()
        return (h0, c0)

    def forward(self, sentence, lengths=None):
        self.hidden = self.init_hidden(sentence.size(0))
        emb = self.embed(sentence)
        packed_emb = emb
        if lengths is not None:
            lengths = lengths.view(-1).tolist()
            packed_emb = nn.utils.rnn.pack_padded_sequence(emb, lengths, batch_first=True)
        out, hidden = self.lstm(packed_emb, self.hidden)
        if lengths is not None:
            out = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)[0]
        # sum the forward and backward directions
        out = out[:, :, :self.h_dim] + out[:, :, self.h_dim:]
        return out
class Attn(nn.Module):
    def __init__(self, h_dim):
        super(Attn, self).__init__()
        self.h_dim = h_dim
        self.main = nn.Sequential(
            nn.Linear(h_dim, 24),
            nn.ReLU(True),
            nn.Linear(24, 1)
        )

    def forward(self, encoder_outputs):
        b_size = encoder_outputs.size(0)
        output_cont = encoder_outputs.contiguous()
        output_view = output_cont.view(-1, self.h_dim)
        attn_ene = self.main(output_view.to("cpu"))  # (b, s, h) -> (b * s, 1)
        return F.softmax(attn_ene.view(b_size, -1), dim=1).unsqueeze(2)  # (b * s, 1) -> (b, s, 1)
class AttnClassifier(nn.Module):
    def __init__(self, h_dim, c_num):
        super(AttnClassifier, self).__init__()
        self.attn = Attn(h_dim)
        self.main = nn.Linear(h_dim, c_num)

    def forward(self, encoder_outputs):
        attns = self.attn(encoder_outputs)  # (b, s, 1)
        feats = (encoder_outputs.to("cuda:0") * attns.to("cuda:0")).sum(dim=1)  # (b, s, h) -> (b, h)
        return F.log_softmax(self.main(feats.to("cpu")), dim=1), attns
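For reference, a shape check of these modules with dummy data would look roughly like this (not part of the scripts; the sizes are made up, and it needs the GPU runtime because the modules move tensors to cuda:0 internally):

# quick shape check with dummy data (requires a GPU, made-up sizes)
import torch
from model import EncoderRNN, AttnClassifier

enc = EncoderRNN(emb_dim=300, h_dim=3, v_size=10, gpu=True).cuda()
clf = AttnClassifier(h_dim=3, c_num=2)
x = torch.randint(0, 10, (2, 7)).cuda()  # (batch, seq_len) of word indices
enc_out = enc(x)                         # (2, 7, 3): forward/backward LSTM outputs summed
log_probs, attns = clf(enc_out)          # (2, 2) log-probabilities, (2, 7, 1) attention weights
print(enc_out.size(), log_probs.size(), attns.size())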
train.py
# coding:utf-8
import janome
from janome.tokenizer import Tokenizer
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torchtext import data, datasets
from torchtext.vocab import Vectors
from bs4 import BeautifulSoup
import pandas as pd
import io
from model import EncoderRNN, AttnClassifier
emb_dim = 300  # word-embedding dimension
h_dim = 3  # LSTM hidden-layer dimension
class_num = 2  # number of target classes
lr = 0.001  # learning rate
epochs = 30  # number of epochs
#device = torch.device('cuda:0')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#print(device)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
j_t = Tokenizer()
def tokenizer(text):
    #return text.split(' ')
    return [tok for tok in j_t.tokenize(text, wakati=True)]

def clean_tokenizer(text):
    soup = BeautifulSoup(text, "lxml")
    clean_text = soup.get_text()
    return [tok for tok in j_t.tokenize(clean_text, wakati=True)]

class JaFastText(Vectors):
    def __init__(self, name=None, **kwargs):
        super(JaFastText, self).__init__(name, url=None, **kwargs)
def train_model(epoch, train_iter, optimizer, log_interval=1, batch_size=2):
    encoder.train()
    classifier.train()
    correct = 0
    for idx, batch in enumerate(train_iter):
        (x, x_l), y = batch.Text, batch.Label
        optimizer.zero_grad()
        encoder_outputs = encoder(x)
        output, attn = classifier(encoder_outputs)
        loss = F.nll_loss(output.to(device), y.to(device))
        loss.backward()
        optimizer.step()
        pred = output.data.max(1, keepdim=True)[1]
        correct += pred.eq(y.data.view_as(pred).to("cpu")).cpu().sum()
        if idx % log_interval == 0:
            print('train epoch: {} [{}/{}], acc:{}, loss:{}'.format(
                epoch, (idx+1)*len(x), len(train_iter)*batch_size,
                correct/float(log_interval * len(x)),
                loss.item()))
            correct = 0
def test_model(epoch, test_iter):
    encoder.eval()
    classifier.eval()
    correct = 0
    for idx, batch in enumerate(test_iter):
        (x, x_l), y = batch.Text, batch.Label
        encoder_outputs = encoder(x)
        output, attn = classifier(encoder_outputs)
        pred = output.data.max(1, keepdim=True)[1]
        correct += pred.eq(y.data.view_as(pred).to("cpu")).cpu().sum()
    print('test epoch:{}, acc:{}'.format(epoch, correct/float(len(test_iter.dataset))))
# init model
def weights_init(m):
    classname = m.__class__.__name__
    if hasattr(m, 'weight') and (classname.find('Embedding') == -1):
        nn.init.xavier_uniform_(m.weight.data, gain=nn.init.calculate_gain('relu'))
if __name__ == '__main__':
    TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, include_lengths=True, batch_first=True)
    LABEL = data.Field(sequential=False, use_vocab=False)
    train, val, test = data.TabularDataset.splits(
        path='./', train='train_ja.tsv',
        validation='val_ja.tsv', test='test_ja.tsv', format='tsv',
        fields=[('Text', TEXT), ('Label', LABEL)])
    print('len(train)', len(train))
    print('vars(train[0])', vars(train[0]))

    fasttext = JaFastText(name='model.vec')
    TEXT.build_vocab(train, vectors=fasttext, min_freq=1)
    # inspect the vocabulary built from the training data (no effect outside a notebook)
    TEXT.vocab.freqs
    TEXT.vocab.stoi
    TEXT.vocab.itos
    TEXT.vocab.vectors.size()
    torch.save(TEXT, "TEXT.pkl")

    train_iter, val_iter, test_iter = data.Iterator.splits(
        (train, val, test), batch_sizes=(2, 2, 1), device=device, repeat=False, sort=False)
    batch = next(iter(train_iter))
    print(batch.Text)
    print(batch.Label)
    # make model
    encoder = EncoderRNN(emb_dim, h_dim, len(TEXT.vocab), gpu=True, v_vec=TEXT.vocab.vectors)
    encoder.cuda()
    classifier = AttnClassifier(h_dim, class_num)

    for m in encoder.modules():
        print(m.__class__.__name__)
        weights_init(m)
    for m in classifier.modules():
        print(m.__class__.__name__)
        weights_init(m)

    # optim
    from itertools import chain
    optimizer = optim.Adam(chain(encoder.parameters(), classifier.parameters()), lr=lr)

    # train model
    for epoch in range(epochs):
        train_model(epoch + 1, train_iter, optimizer)
        test_model(epoch + 1, val_iter)
        #torch.save(encoder.state_dict(), "model/encoder_epoch"+ str(epoch + 1) +".pkl")
        #torch.save(classifier.state_dict(), "model/classifier_epoch"+ str(epoch + 1) +".pkl")

    # save model
    torch.save(encoder.state_dict(), "encoder.pkl")
    torch.save(classifier.state_dict(), "classifier.pkl")
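For completeness, the inference-side loading I have in mind looks roughly like this (an untested sketch; predict.py is hypothetical). The load_state_dict call on the encoder is where the mismatch shows up whenever len(TEXT.vocab) differs from the value used at training time:

# rough sketch of the inference side (hypothetical predict.py)
import torch
from model import EncoderRNN, AttnClassifier

emb_dim = 300
h_dim = 3
class_num = 2

# NOTE: TEXT.pkl stores only a reference to the tokenize function,
# so the same tokenizer must be importable when loading it
TEXT = torch.load("TEXT.pkl")  # Field whose vocab was built from train_ja.tsv

encoder = EncoderRNN(emb_dim, h_dim, len(TEXT.vocab), gpu=True, v_vec=TEXT.vocab.vectors)
encoder.cuda()
classifier = AttnClassifier(h_dim, class_num)

# len(TEXT.vocab) must equal the vocab size used for training,
# otherwise loading the embedding weight fails here
encoder.load_state_dict(torch.load("encoder.pkl"))
classifier.load_state_dict(torch.load("classifier.pkl"))
encoder.eval()
classifier.eval()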