What I want to do

The size of TEXT.vocab ends up depending on the vocabulary of the training data, so when I build new embedding vectors for the data I want to run inference on, the input dimension of the embedding layer no longer matches. I would like to fix that dimension based on the pretrained vector file (model.vec) instead.
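Roughly, what I have in mind is the sketch below: build the vocabulary (and therefore the embedding input size) from the words in model.vec rather than from the training data. This is only an idea, assuming the torchtext 0.4 Vectors/Vocab API, and I have not confirmed that it works (TEXT is the Field defined in train.py below):

from collections import Counter
from torchtext.vocab import Vectors, Vocab

vectors = Vectors(name='model.vec')      # load the pretrained FastText vectors
counter = Counter(vectors.stoi.keys())   # one count per word found in model.vec
# The vocab size is now fixed by model.vec (plus the special tokens),
# independent of which words appear in train_ja.tsv.
TEXT.vocab = Vocab(counter, specials=['<unk>', '<pad>'], vectors=vectors)
print(len(TEXT.vocab), TEXT.vocab.vectors.size())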

Environment

Google Colaboratory (Python 3, GPU runtime)
PyTorch 1.1.0
torchtext 0.4.0

Input data

train_ja.tsv (val_ja.tsv and test_ja.tsv use the same format: one sentence and its binary label per line, tab-separated)

あなたをが好きです。  1
私はマイクが嫌いです。 0
私はマキが好きです。  1
ボブが嫌いです。    0

model.vec … pretrained FastText word vectors
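To check what model.vec actually provides, I load it with torchtext's Vectors class (a sketch, assuming the torchtext 0.4 API):

from torchtext.vocab import Vectors

vectors = Vectors(name='model.vec')
print(len(vectors.itos), vectors.dim)   # number of words in model.vec and the embedding dimension (expected 300)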

Source code

model.py

# coding:utf-8
import torchtext
from torchtext import data
from torchtext import datasets
from torchtext.vocab import FastText
from torchtext.vocab import Vectors

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

class EncoderRNN(nn.Module):
    def __init__(self, emb_dim, h_dim, v_size, gpu=True, v_vec=None, batch_first=True):
        super(EncoderRNN, self).__init__()
        self.gpu = gpu
        self.h_dim = h_dim
        # v_size (= len(TEXT.vocab) at training time) fixes the embedding input size
        self.embed = nn.Embedding(v_size, emb_dim)
        if v_vec is not None:
            self.embed.weight.data.copy_(v_vec)
        self.lstm = nn.LSTM(emb_dim, h_dim, batch_first=batch_first,
                            bidirectional=True)

    def init_hidden(self, b_size):
        h0 = Variable(torch.zeros(1*2, b_size, self.h_dim))
        c0 = Variable(torch.zeros(1*2, b_size, self.h_dim))
        if self.gpu:
            h0 = h0.cuda()
            c0 = c0.cuda()
        return (h0, c0)

    def forward(self, sentence, lengths=None):
        self.hidden = self.init_hidden(sentence.size(0))
        emb = self.embed(sentence)
        packed_emb = emb

        if lengths is not None:
            lengths = lengths.view(-1).tolist()
            packed_emb = nn.utils.rnn.pack_padded_sequence(emb, lengths, batch_first=True)
        out, hidden = self.lstm(packed_emb, self.hidden)
        if lengths is not None:
            out = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)[0]
        # sum the forward and backward outputs of the bidirectional LSTM
        out = out[:, :, :self.h_dim] + out[:, :, self.h_dim:]
        return out

class Attn(nn.Module):
    def __init__(self, h_dim):
        super(Attn, self).__init__()
        self.h_dim = h_dim
        self.main = nn.Sequential(
            nn.Linear(h_dim, 24),
            nn.ReLU(True),
            nn.Linear(24,1)
        )

    def forward(self, encoder_outputs):
        b_size = encoder_outputs.size(0)
        output_cont = encoder_outputs.contiguous()
        output_view = output_cont.view(-1, self.h_dim)
        attn_ene = self.main(output_view.to("cpu")) # (b, s, h) -> (b * s, 1)
        return F.softmax(attn_ene.view(b_size, -1), dim=1).unsqueeze(2) # (b*s, 1) -> (b, s, 1)

class AttnClassifier(nn.Module):
    def __init__(self, h_dim, c_num):
        super(AttnClassifier, self).__init__()
        self.attn = Attn(h_dim)
        self.main = nn.Linear(h_dim, c_num)


    def forward(self, encoder_outputs):
        attns = self.attn(encoder_outputs) #(b, s, 1)
        feats = (encoder_outputs.to("cuda:0") * attns.to("cuda:0")).sum(dim=1) # (b, s, h) -> (b, h)
        return F.log_softmax(self.main(feats.to("cpu")),dim=1), attns
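For reference, a quick shape check of the classes above with dummy tensors (my own snippet, assuming a CUDA runtime because of the device handling hard-coded in Attn/AttnClassifier):

# vocabulary of 10 words, batch of 2 sentences of length 5
encoder = EncoderRNN(emb_dim=300, h_dim=3, v_size=10, gpu=True).cuda()
classifier = AttnClassifier(h_dim=3, c_num=2)   # stays on the CPU, as in train.py
x = torch.randint(0, 10, (2, 5)).cuda()
enc_out = encoder(x)                     # (batch, seq_len, h_dim) = (2, 5, 3)
log_probs, attns = classifier(enc_out)   # (2, 2) log-probabilities, (2, 5, 1) attention weights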

train.py

# coding:utf-8
import janome
from janome.tokenizer import Tokenizer

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

from torchtext import data, datasets
from torchtext.vocab import Vectors

from bs4 import BeautifulSoup
import pandas as pd
import io

from model import EncoderRNN, AttnClassifier

emb_dim = 300   # word embedding dimension
h_dim = 3       # LSTM hidden-state dimension
class_num = 2   # number of target classes
lr = 0.001      # learning rate
epochs = 30     # number of epochs

#device = torch.device('cuda:0')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#print(device)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
j_t = Tokenizer()

def tokenizer(text): 
    #return text.split(' ')
    return [tok for tok in j_t.tokenize(text, wakati=True)]

def clean_tokenizer(text):
    soup = BeautifulSoup(text,"lxml")
    clean_text = soup.get_text()
    return [tok for tok in j_t.tokenize(clean_text, wakati=True)]

class JaFastText(Vectors):
    # Thin wrapper around torchtext.vocab.Vectors for loading a local
    # model.vec without attempting any download (url=None).
    def __init__(self, name=None, **kwargs):
        super(JaFastText, self).__init__(name, url=None, **kwargs)


def train_model(epoch, train_iter, optimizer, log_interval=1, batch_size=2):
    encoder.train()
    classifier.train()
    correct = 0
    for idx, batch in enumerate(train_iter):
        (x, x_l), y = batch.Text, batch.Label
        optimizer.zero_grad()
        encoder_outputs = encoder(x)
        output, attn = classifier(encoder_outputs)
        loss = F.nll_loss(output.to(device), y.to(device))
        loss.backward()
        optimizer.step()
        pred = output.data.max(1, keepdim=True)[1]
        correct += pred.eq(y.data.view_as(pred).to("cpu")).cpu().sum()
        if idx % log_interval == 0:
            print('train epoch: {} [{}/{}], acc:{}, loss:{}'.format(
            epoch, (idx+1)*len(x), len(train_iter)*batch_size,
            correct/float(log_interval * len(x)),
            loss.item()))
            correct = 0


def test_model(epoch, test_iter):
    encoder.eval()
    classifier.eval()
    correct = 0
    for idx, batch in enumerate(test_iter):
        (x, x_l), y = batch.Text, batch.Label
        encoder_outputs = encoder(x)
        output, attn = classifier(encoder_outputs)
        pred = output.data.max(1, keepdim=True)[1]
        correct += pred.eq(y.data.view_as(pred).to("cpu")).cpu().sum()
    print('test epoch:{}, acc:{}'.format(epoch, correct/float(len(test_iter.dataset))))

# init model
def weights_init(m):
    classname = m.__class__.__name__
    if hasattr(m, 'weight') and (classname.find('Embedding') == -1):
        nn.init.xavier_uniform_(m.weight.data, gain=nn.init.calculate_gain('relu'))

if __name__ == '__main__':
    TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True, include_lengths=True, batch_first=True)
    LABEL = data.Field(sequential=False, use_vocab=False)
    train, val, test = data.TabularDataset.splits(
            path='./', train='train_ja.tsv',
            validation='val_ja.tsv', test='test_ja.tsv', format='tsv',
            fields=[('Text', TEXT), ('Label', LABEL)])

    print('len(train)', len(train))
    print('vars(train[0])', vars(train[0]))

    fasttext = JaFastText(name='model.vec')
    TEXT.build_vocab(train, vectors=fasttext, min_freq=1)
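    # NOTE: len(TEXT.vocab) is determined here by the words in train_ja.tsv
    # (plus <unk>/<pad>), so the embedding input size changes with the training
    # data -- this is the dependency I would like to remove by sizing the
    # vocabulary from model.vec instead.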
    # vocabulary inspection (no visible effect when run as a script)
    TEXT.vocab.freqs
    TEXT.vocab.stoi
    TEXT.vocab.itos
    TEXT.vocab.vectors.size()
    # save the Field (including its vocab) so inference can reuse the same stoi/itos
    torch.save(TEXT, "TEXT.pkl")

    train_iter, val_iter, test_iter = data.Iterator.splits(
        (train, val, test), batch_sizes=(2, 2, 1), device=device, repeat=False, sort=False)
    batch = next(iter(train_iter))
    print(batch.Text)
    print(batch.Label)

    # make model
    encoder = EncoderRNN(emb_dim, h_dim, len(TEXT.vocab), gpu=True, v_vec=TEXT.vocab.vectors)
    encoder.cuda()
    classifier = AttnClassifier(h_dim, class_num)

    for m in encoder.modules():
        print(m.__class__.__name__)
        weights_init(m)

    for m in classifier.modules():
        print(m.__class__.__name__)
        weights_init(m)

    # optim
    from itertools import chain
    optimizer = optim.Adam(chain(encoder.parameters(),classifier.parameters()), lr=lr)

    # train model
    for epoch in range(epochs):
        train_model(epoch + 1, train_iter, optimizer)
        test_model(epoch + 1, val_iter)
        #torch.save(encoder.state_dict(), "model/encoder_epoch"+ str(epoch + 1) +".pkl")
        #torch.save(classifier.state_dict(), "model/classifier_epoch"+ str(epoch + 1) +".pkl")

    # save model
    torch.save(encoder.state_dict(),"encoder.pkl")
    torch.save(classifier.state_dict(),"classifier.pkl")
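
For completeness, the inference side I have in mind looks roughly like the sketch below (predict.py is a hypothetical file name and the code is untested): it reloads TEXT.pkl so that len(TEXT.vocab), and therefore the embedding input size, is exactly what it was at training time. This works around the mismatch, but I would still prefer the size to be fixed by model.vec itself.

# predict.py (sketch, untested) -- assumes the definitions from model.py and
# the constants/tokenizer from train.py are importable or already in scope.
import torch

TEXT = torch.load("TEXT.pkl")   # the Field saved above, including its vocab

encoder = EncoderRNN(emb_dim, h_dim, len(TEXT.vocab), gpu=True, v_vec=TEXT.vocab.vectors)
encoder.load_state_dict(torch.load("encoder.pkl"))
encoder.cuda()
encoder.eval()

classifier = AttnClassifier(h_dim, class_num)
classifier.load_state_dict(torch.load("classifier.pkl"))
classifier.eval()

# numericalize a new sentence with the vocabulary from training time
tokens = tokenizer("ボブが好きです。")
x, lengths = TEXT.process([tokens], device=device)   # include_lengths=True -> (tensor, lengths)
log_probs, attns = classifier(encoder(x))
pred = log_probs.argmax(dim=1)
print(pred)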