
PyTorch seq2seq machine translation model with attention


Study notes

Corpus link: https://pan.baidu.com/s/1wpP4t_GSyPAD6HTsIoGPZg
Extraction code: jqq8

The data format is shown in the figure: each line contains an English sentence and its Chinese translation, separated by a tab.

Imports:

import os
import sys
import math
from collections import Counter
import numpy as np
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

import nltk
nltk.download('punkt')

# all models and tensors below are moved to this device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

1. Data preprocessing

1.1 Reading the Chinese and English data

English sentences are tokenized with nltk's word_tokenize and lower-cased.

Chinese sentences are split into individual characters as the basic unit.

def load_data(in_file):
    cn = []
    en = []
    num_examples = 0
    with open(in_file, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip().split('\t')

            en.append(['BOS'] + nltk.word_tokenize(line[0].lower()) + ['EOS'])
            cn.append(['BOS'] + [c for c in line[1]] + ['EOS'])

    return en, cn

train_file = 'nmt/en-cn/train.txt'
dev_file = 'nmt/en-cn/dev.txt'
train_en, train_cn = load_data(train_file)
dev_en, dev_cn = load_data(dev_file)

Inspect the returned data:

print(dev_en[:2])
print(dev_cn[:2])

[['BOS', 'she', 'put', 'the', 'magazine', 'on', 'the', 'table', '.', 'EOS'], ['BOS', 'hey', ',', 'what', 'are', 'you', 'doing', 'here', '?', 'EOS']]
[['BOS', '她', '把', '雜', '誌', '放', '在', '桌', '上', '。', 'EOS'], ['BOS', '嘿', ',', '你', '在', '這', '做', '什', '麼', '?', 'EOS']]

1.2 Building the vocabulary

UNK_IDX = 0
PAD_IDX = 1

def build_dict(sentences, max_words=50000):
    word_count = Counter()
    for sentence in sentences:
        for word in sentence:
            word_count[word] += 1

    ls = word_count.most_common(max_words)   # the max_words most frequent words (descending)
    total_words = len(ls) + 2

    word_dict = {w[0]: index + 2 for index, w in enumerate(ls)}  # {word: index}; w[0] is the word, w[1] its count
    word_dict['UNK'] = UNK_IDX
    word_dict['PAD'] = PAD_IDX

    return word_dict, total_words           # total number of words, at most 50002

en_dict, en_total_words = build_dict(train_en)
cn_dict, cn_total_words = build_dict(train_cn)
inv_en_dict = {v: k for k, v in en_dict.items()}  # English: {index: word}
inv_cn_dict = {v: k for k, v in cn_dict.items()}  # Chinese: {index: character}
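A quick sanity check of the dictionaries (a minimal sketch using the names defined above; the printed sizes depend on the corpus):

print(en_dict['UNK'], en_dict['PAD'])      # 0 1
print(en_total_words, cn_total_words)      # vocabulary sizes, at most 50002 each
print(inv_en_dict[2], inv_cn_dict[2])      # the most frequent English word and Chinese character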

1.3 Converting words to indices

sort_by_len=True: the sentences are sorted by length so that the sentences within one batch have similar lengths.

def encode(en_sentences, cn_sentences, en_dict, cn_dict, sort_by_len=True):
    length = len(en_sentences)
    out_en_sentences = [[en_dict.get(w, 0) for w in sent] for sent in en_sentences]
    out_cn_sentences = [[cn_dict.get(w, 0) for w in sent] for sent in cn_sentences]

    # sort sentences by length
    def len_argsort(seq):
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))

    # sort the Chinese and English sentences in the same order
    if sort_by_len:
        sorted_index = len_argsort(out_en_sentences)
        out_en_sentences = [out_en_sentences[i] for i in sorted_index]
        out_cn_sentences = [out_cn_sentences[i] for i in sorted_index]

    return out_en_sentences, out_cn_sentences

train_en, train_cn = encode(train_en, train_cn, en_dict, cn_dict)
dev_en, dev_cn = encode(dev_en, dev_cn, en_dict, cn_dict)  # [[2, 168, 201, 4, 3], [], ..., [2, 5, 14, 13, 22, 9, 149, 17, 107, 24, 121, 16, 20, 267, 7, 181, 23, 15, 6, 422, 25, 220, 4, 3]]
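Because sort_by_len=True, the encoded corpus is now ordered from the shortest to the longest sentence, and both sides are reordered together; a quick check (illustrative only):

print(len(train_en[0]), len(train_en[-1]))   # shortest vs. longest English sentence, in tokens
print(len(train_cn[0]), len(train_cn[-1]))   # the Chinese side follows the same order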

Inspect the returned data:

print(train_cn[2])
print([inv_cn_dict[i] for i in train_cn[2]])
print([inv_en_dict[i] for i in train_en[2]])

[2, 982, 2028, 8, 4, 3]
['BOS', '祝', '贺', '你', '。', 'EOS']
['BOS', 'congratulations', '!', 'EOS']

1.4 Splitting the sentences into batches

# returns the sentence indices of each minibatch, e.g. [[11, 4, 3, 5], [16, 7, 5, 7], ...]
def get_minibatches(n, minibatch_size, shuffle=True):  # n is the total number of sentences
    idx_list = np.arange(0, n, minibatch_size)          # start offsets: [0, minibatch_size, 2*minibatch_size, ...]
    if shuffle:
        np.random.shuffle(idx_list)
    minibatches = []
    for idx in idx_list:
        minibatches.append(np.arange(idx, min(idx + minibatch_size, n)))
    return minibatches

See what the function above does:

get_minibatches(100, 15)

[array([90, 91, 92, 93, 94, 95, 96, 97, 98, 99]),
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
array([60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74]),
array([15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]),
array([30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44]),
array([75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89]),
array([45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59])]

# seqs is one minibatch from get_minibatches: the indices of batch_size sentences (a nested list); here batch_size = 64
def prepare_data(seqs):
    lengths = [len(seq) for seq in seqs]
    n_samples = len(seqs)                  # number of sentences in the batch
    max_len = np.max(lengths)              # length of the longest sentence in the batch

    x = np.zeros((n_samples, max_len)).astype('int32')
    x_lengths = np.array(lengths).astype('int32')   # original sentence lengths in the batch

    for idx, seq in enumerate(seqs):
        x[idx, :lengths[idx]] = seq   # copy each sentence; shorter sentences stay padded with 0

    return x, x_lengths

def gen_examples(en_sentences, cn_sentences, batch_size):
    minibatches = get_minibatches(len(en_sentences), batch_size)
    all_ex = []
    for minibatch in minibatches:
        mb_en_sentences = [en_sentences[t] for t in minibatch]  # encoded English sentences of one batch, e.g. [[2, 982, 8], [14, 5, 6], ...]
        mb_cn_sentences = [cn_sentences[t] for t in minibatch]
        mb_x, mb_x_len = prepare_data(mb_en_sentences)          # padded English batch and its original lengths
        mb_y, mb_y_len = prepare_data(mb_cn_sentences)
        all_ex.append((mb_x, mb_x_len, mb_y, mb_y_len))
    # returns n / batch_size tuples of (English batch, English lengths, Chinese batch, Chinese lengths)
    return all_ex

batch_size = 64
train_data = gen_examples(train_en, train_cn, batch_size)
dev_data = gen_examples(dev_en, dev_cn, batch_size)
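To see the batch layout, a small sketch that unpacks the first training batch (the exact shapes vary because the minibatches are shuffled):

mb_x, mb_x_len, mb_y, mb_y_len = train_data[0]
print(mb_x.shape, mb_y.shape)      # (64, longest English length) and (64, longest Chinese length) in this batch
print(mb_x_len[:5], mb_y_len[:5])  # original (unpadded) lengths of the first five sentence pairs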

2. Encoder-Decoder model (without attention)

2.1 Defining the loss function

# masked cross entropy loss
class LanguageModelCriterion(nn.Module):
    def __init__(self):
        super(LanguageModelCriterion, self).__init__()

    def forward(self, input, target, mask):
        # input: [64, 12, 3195]  target: [64, 12]  mask: [64, 12]
        # input: (batch_size * seq_len) * vocab_size
        input = input.contiguous().view(-1, input.size(2))
        # target: batch_size * seq_len
        target = target.contiguous().view(-1, 1)
        mask = mask.contiguous().view(-1, 1)
        output = -input.gather(1, target) * mask  # gather along dim 1, using target as the index
        # this is the cross-entropy loss: F.log_softmax has already been applied to input
        # output.shape = torch.Size([768, 1])
        # gather also picks up values at padded positions (index 0); the mask resets them to zero,
        # because index 0 is a real vocabulary entry while the trailing 0s in target are only padding
        output = torch.sum(output) / torch.sum(mask)
        # per-word mean loss; the negative sign was added above, so we still minimize
        return output
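A tiny numeric sketch of what this criterion computes (toy values, not from the article): with a batch of one sentence, two time steps, a vocabulary of three tokens, and a mask that keeps only the first step, the result is simply the negative log-probability of the first target token.

crit = LanguageModelCriterion()
log_probs = F.log_softmax(torch.randn(1, 2, 3), dim=-1)  # [batch, seq_len, vocab]
target = torch.tensor([[2, 0]])                          # the second position is padding
mask = torch.tensor([[1.0, 0.0]])                        # padding is excluded from the loss
print(crit(log_probs, target, mask))                     # equals -log_probs[0, 0, 2]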

2.2 Encoder

The encoder's job is to pass the input text through an embedding layer and a GRU, turning it into hidden states that later serve as context vectors.

For an explanation of nn.utils.rnn.pack_padded_sequence and nn.utils.rnn.pad_packed_sequence, see: http://www.mamicode.com/info-detail-2493083.html
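For intuition, a minimal self-contained sketch of the two functions on toy tensors (not part of the model): packing lets the GRU skip the padded time steps, and unpacking restores the [batch, seq_len, hidden] layout.

rnn = nn.GRU(4, 4, batch_first=True)
x = torch.randn(2, 5, 4)                 # a batch of 2 sequences padded to length 5
lengths = torch.tensor([5, 3])           # real lengths, sorted in descending order
packed = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)
packed_out, h = rnn(packed)
out, out_lengths = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
print(out.shape, h.shape, out_lengths)   # torch.Size([2, 5, 4]) torch.Size([1, 2, 4]) tensor([5, 3])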

class PlainEncoder(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout=0.2):   # assume embedding_size == hidden_size
        super(PlainEncoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)  # batch_first=True: [batch_size, seq_len, hidden_size]
        self.dropout = nn.Dropout(dropout)

    # x: the encoded sentences of one batch
    # lengths: the original (unpadded) length of each sentence
    # the last hidden state is taken as the context vector, which is why lengths are needed
    def forward(self, x, lengths):
        # (sorted values, indices of the sorted values)
        sorted_len, sorted_idx = lengths.sort(0, descending=True)  # sort the batch by sequence length, descending
        x_sorted = x[sorted_idx.long()]
        embedded = self.dropout(self.embed(x_sorted))

        # the sentences are padded to the same length (the real sentences are shorter),
        # so pack_padded_sequence is applied first to get the last state at each real length
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_len.long().cpu().data.numpy(),
                                                            batch_first=True)
        # out: [batch, seq_len, hidden_size]
        # hidden: [num_layers=1, batch, hidden_size]
        packed_out, hidden = self.rnn(packed_embedded)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)  # pad back to full length

        _, original_idx = sorted_idx.sort(0, descending=False)                   # restore the original order

        out = out[original_idx.long()].contiguous()             # [batch_size, seq_len, hidden_size]
        hidden = hidden[:, original_idx.long()].contiguous()    # [num_layers, batch_size, hidden_size]

        # print("out.shape: ", out.shape, 'hidden.shape: ', hidden.shape)

        return out, hidden[[-1]]  # hidden[[-1]] keeps the last layer; roughly the same as out[:, -1]

A quick test of the shapes (can be commented out):

# check the shapes
p = PlainEncoder(en_total_words, 100)

mb_x = torch.from_numpy(train_data[0][0]).long()
mb_x_len = torch.from_numpy(train_data[0][1]).long()
print("batch:", mb_x.shape, mb_x_len.shape)

o, h = p(mb_x, mb_x_len)

print(o.shape, h.shape)
print(o[:, -1].shape, '\n', o[:, -1] == h)

batch: torch.Size([64, 14]) torch.Size([64])
out.shape: torch.Size([64, 14, 100]) hidden.shape: torch.Size([1, 64, 100])
torch.Size([64, 14, 100]) torch.Size([1, 64, 100])
torch.Size([64, 100])
tensor([[[True, True, True, ..., True, True, True],
[True, True, True, ..., True, True, True],
[True, True, True, ..., True, True, True],
...,
[True, True, True, ..., True, True, True],
[True, True, True, ..., True, True, True],
[True, True, True, ..., True, True, True]]])

2.3 Decoder

The decoder decides on the next output word based on the words translated so far and the context vectors.

class PlainDecoder(nn.Module):
    def __init__(self, vocab_size, hidden_size, dropout=0.2):
        super(PlainDecoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)  # [batch_size, seq_len, hidden_size]
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    # mostly the same as PlainEncoder.forward, except that the initial hidden state is passed in instead of being zero
    # y: the encoded Chinese sentences of one batch
    # hid: the hidden state, i.e. the context vectors
    def forward(self, y, y_lengths, hid):
        sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
        y_sorted = y[sorted_idx.long()]
        hid = hid[:, sorted_idx.long()]

        # [batch_size, y_lengths, embed_size=hidden_size]
        y_sorted = self.dropout(self.embed(y_sorted))

        packed_seq = nn.utils.rnn.pack_padded_sequence(y_sorted, sorted_len.long().cpu().data.numpy(),
                                                       batch_first=True)
        out, hid = self.rnn(packed_seq, hid)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)

        _, original_idx = sorted_idx.sort(0, descending=False)   # restore the original order
        output_seq = unpacked[original_idx.long()].contiguous()  # [batch_size, y_length, hidden_size]
        hid = hid[:, original_idx.long()].contiguous()           # [1, batch_size, hidden_size]

        output = F.log_softmax(self.fc(output_seq), -1)          # [batch_size, y_lengths, vocab_size]

        return output, hid

2.4 Building the Seq2Seq model

The Seq2Seq model wires the encoder and the decoder together (this version has no attention).

class PlainSeq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(PlainSeq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, hid = self.encoder(x, x_lengths)
        output, hid = self.decoder(y, y_lengths, hid)

        return output, None

    def translate(self, x, x_lengths, y, max_length=10):
        encoder_out, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        # greedy decoding, one word at a time
        for i in range(max_length):
            # output: [batch_size, y_lengths, vocab_size]
            # during training y is a whole sentence and is decoded in one pass;
            # at test time y is generated word by word, so here y is the single input token (initially BOS)
            # and y_lengths is likewise 1
            output, hid = self.decoder(y=y, y_lengths=torch.ones(batch_size).long().to(device),
                                       hid=hid)
            # in the first iteration BOS is the model's first input word; afterwards y is updated so that
            # the input for the next prediction is the previous output word
            # output.shape = torch.Size([1, 1, 3195])
            # hid.shape = torch.Size([1, 1, 100])

            y = output.max(2)[1].view(batch_size, 1)
            # .max(2) takes the maximum over the third dimension and returns (values, indices); [1] keeps the indices
            preds.append(y)
            # preds = [tensor([[5]], device='cuda:0'), tensor([[24]], device='cuda:0'), ..., tensor([[4]], device='cuda:0')]

        # torch.cat(preds, 1) = tensor([[ 5, 24,  6, 22,  7,  4,  3,  4,  3,  4]], device='cuda:0')
        return torch.cat(preds, 1), None

3. Creating the model

Define the model, the loss, and the optimizer.

dropout = 0.2
hidden_size = 100
encoder = PlainEncoder(vocab_size=en_total_words, hidden_size=hidden_size, dropout=dropout)
decoder = PlainDecoder(vocab_size=cn_total_words, hidden_size=hidden_size, dropout=dropout)

model = PlainSeq2Seq(encoder, decoder)
model = model.to(device)

loss_fn = LanguageModelCriterion().to(device)
optimizer = torch.optim.Adam(model.parameters())

4. Training the model

def train(model, data, num_epochs=20):
    for epoch in range(num_epochs):
        model.train()      # training mode
        total_num_words = total_loss = 0.
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()

            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()  # decoder input: everything before EOS
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()  # decoder target: everything after BOS

            mb_y_len = torch.from_numpy(mb_y_len - 1).to(device).long()
            mb_y_len[mb_y_len <= 0] = 1

            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)

            # [mb_y_len.max()] -> [1, mb_y_len.max()]
            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()

            # (pred, target, mask); mb_output holds the word indices of the target sentences
            loss = loss_fn(mb_pred, mb_output, mb_out_mask)

            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words

            # update the model
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.)
            optimizer.step()

            if it % 100 == 0:
                print("Epoch: ", epoch, 'iteration', it, 'loss:', loss.item())

        print("Epoch", epoch, "Training loss", total_loss / total_num_words)

        if epoch % 5 == 0:
            evaluate(model, dev_data)

        torch.save(model.state_dict(), 'translate_model.pt')
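Two details in the loop above deserve a small standalone illustration (toy values, not part of the training code): the decoder input and target are the same sentence shifted by one position, and the broadcasted comparison turns the length vector into a per-position mask.

y = np.array([[2, 10, 11, 4, 3, 0]])   # BOS  w1  w2  .  EOS  PAD (hypothetical indices)
print(y[:, :-1])                        # decoder input:  [[ 2 10 11  4  3]]
print(y[:, 1:])                         # decoder target: [[10 11  4  3  0]]

lengths = torch.tensor([5, 3])
mask = torch.arange(5)[None, :] < lengths[:, None]
print(mask.int())
# tensor([[1, 1, 1, 1, 1],
#         [1, 1, 1, 0, 0]], dtype=torch.int32)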

5. Evaluating the model

def evaluate(model, data):
    model.eval()
    total_num_words = total_loss = 0.

    with torch.no_grad():

        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(data):
            mb_x = torch.from_numpy(mb_x).to(device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(device).long()
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(device).long()
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(device).long()
            mb_y_len = torch.from_numpy(mb_y_len - 1).to(device).long()
            mb_y_len[mb_y_len <= 0] = 1

            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)

            mb_out_mask = torch.arange(mb_y_len.max().item(), device=device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()

            loss = loss_fn(mb_pred, mb_output, mb_out_mask)

            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words

    print("Evaluation loss", total_loss / total_num_words)

Train for 100 epochs:

train(model, train_data, num_epochs=100)

Training results (the training loss keeps going down):

 Epoch:  0 iteration 0 loss: 3.3647029399871826

    
 Epoch:  0 iteration 100 loss: 3.009509563446045
    
 Epoch:  0 iteration 200 loss: 3.782735824584961
    
 Epoch 0 Training loss 3.1535905243275186
    
 Evaluation loss 3.3979927680761692
    
 Epoch:  1 iteration 0 loss: 3.3019187450408936
    
 Epoch:  1 iteration 100 loss: 2.9146101474761963
    
 Epoch:  1 iteration 200 loss: 3.7248971462249756
    
 Epoch 1 Training loss 3.0795154243968996
    
 Epoch:  2 iteration 0 loss: 3.204010009765625
    
 Epoch:  2 iteration 100 loss: 2.863368511199951
    
 Epoch:  2 iteration 200 loss: 3.6527459621429443
    
 Epoch 2 Training loss 3.0103434118084182
    
 Epoch:  3 iteration 0 loss: 3.146893262863159
    
 Epoch:  3 iteration 100 loss: 2.759276866912842
    
 Epoch:  3 iteration 200 loss: 3.589343309402466
    
 Epoch 3 Training loss 2.9467000284848877
    
 Epoch:  4 iteration 0 loss: 3.1050117015838623
    
 Epoch:  4 iteration 100 loss: 2.708840847015381
    
 Epoch:  4 iteration 200 loss: 3.5071861743927
    
 Epoch 4 Training loss 2.8919197189025825
    
 Epoch:  5 iteration 0 loss: 3.0071966648101807
    
 Epoch:  5 iteration 100 loss: 2.6622238159179688
    
 Epoch:  5 iteration 200 loss: 3.464808225631714
    
 Epoch 5 Training loss 2.832557945455863
    
 Evaluation loss 3.2545772727449775
    
 Epoch:  6 iteration 0 loss: 2.967473268508911
    
 Epoch:  6 iteration 100 loss: 2.586355209350586
    
 Epoch:  6 iteration 200 loss: 3.467402696609497
    
 Epoch 6 Training loss 2.7854216275948933
    
 Epoch:  7 iteration 0 loss: 2.922556161880493
    
 Epoch:  7 iteration 100 loss: 2.5442593097686768
    
 Epoch:  7 iteration 200 loss: 3.402819871902466
    
 Epoch 7 Training loss 2.7393553376979582
    
 Epoch:  8 iteration 0 loss: 2.8680827617645264
    
 Epoch:  8 iteration 100 loss: 2.4990341663360596
    
 Epoch:  8 iteration 200 loss: 3.363720178604126
    
 Epoch 8 Training loss 2.6976078317344734
    
 Epoch:  9 iteration 0 loss: 2.7911880016326904
    
 Epoch:  9 iteration 100 loss: 2.4367892742156982
    
 Epoch:  9 iteration 200 loss: 3.3128461837768555
    
 Epoch 9 Training loss 2.655838535325863
    
 Epoch:  10 iteration 0 loss: 2.760638475418091
    
 Epoch:  10 iteration 100 loss: 2.388662338256836
    
 Epoch:  10 iteration 200 loss: 3.299316167831421
    
 Epoch 10 Training loss 2.6183036396412334
    
 Evaluation loss 3.179426688570673
    
 Epoch:  11 iteration 0 loss: 2.7541329860687256
    
 Epoch:  11 iteration 100 loss: 2.3711133003234863
    
 Epoch:  11 iteration 200 loss: 3.2783377170562744
    
 Epoch 11 Training loss 2.5806847991577992
    
 Epoch:  12 iteration 0 loss: 2.672988176345825
    
 Epoch:  12 iteration 100 loss: 2.376006841659546
    
 Epoch:  12 iteration 200 loss: 3.1972506046295166
    
 Epoch 12 Training loss 2.5446970471612693
    
 Epoch:  13 iteration 0 loss: 2.6494789123535156
    
 Epoch:  13 iteration 100 loss: 2.3170242309570312
    
 Epoch:  13 iteration 200 loss: 3.1941475868225098
    
 Epoch 13 Training loss 2.5119990739174747
    
 Epoch:  14 iteration 0 loss: 2.5805208683013916
    
 Epoch:  14 iteration 100 loss: 2.287121057510376
    
 Epoch:  14 iteration 200 loss: 3.15193247795105
    
 Epoch 14 Training loss 2.479404618952507
    
 Epoch:  15 iteration 0 loss: 2.5561468601226807
    
 Epoch:  15 iteration 100 loss: 2.263378858566284
    
 Epoch:  15 iteration 200 loss: 3.183692216873169
    
 Epoch 15 Training loss 2.4484731512219886
    
 Evaluation loss 3.1426560713748
    
 Epoch:  16 iteration 0 loss: 2.553135871887207
    
 Epoch:  16 iteration 100 loss: 2.2017245292663574
    
 Epoch:  16 iteration 200 loss: 3.1033968925476074
    
 Epoch 16 Training loss 2.422065194773223
    
 Epoch:  17 iteration 0 loss: 2.5503063201904297
    
 Epoch:  17 iteration 100 loss: 2.1875879764556885
    
 Epoch:  17 iteration 200 loss: 3.0571794509887695
    
 Epoch 17 Training loss 2.392596175684612
    
 Epoch:  18 iteration 0 loss: 2.447784900665283
    
 Epoch:  18 iteration 100 loss: 2.146362781524658
    
 Epoch:  18 iteration 200 loss: 3.064692974090576
    
 Epoch 18 Training loss 2.3654149344515334
    
 Epoch:  19 iteration 0 loss: 2.4578680992126465
    
 Epoch:  19 iteration 100 loss: 2.1460280418395996
    
 Epoch:  19 iteration 200 loss: 3.024839162826538
    
 Epoch 19 Training loss 2.3424499056425168
    
 Epoch:  20 iteration 0 loss: 2.4384076595306396
    
 Epoch:  20 iteration 100 loss: 2.0974316596984863
    
 Epoch:  20 iteration 200 loss: 2.9965004920959473
    
 Epoch 20 Training loss 2.3167023499073878
    
 Evaluation loss 3.1197055689269915
    
 Epoch:  21 iteration 0 loss: 2.3817431926727295
    
 Epoch:  21 iteration 100 loss: 2.0880067348480225
    
 Epoch:  21 iteration 200 loss: 2.9751596450805664
    
 Epoch 21 Training loss 2.290719437303847
    
 Epoch:  22 iteration 0 loss: 2.3944735527038574
    
 Epoch:  22 iteration 100 loss: 2.0802524089813232
    
 Epoch:  22 iteration 200 loss: 2.9455509185791016
    
 Epoch 22 Training loss 2.2698037450677613
    
 Epoch:  23 iteration 0 loss: 2.3046939373016357
    
 Epoch:  23 iteration 100 loss: 2.068814992904663
    
 Epoch:  23 iteration 200 loss: 2.9671618938446045
    
 Epoch 23 Training loss 2.2478544365587227
    
 Epoch:  24 iteration 0 loss: 2.2910232543945312
    
 Epoch:  24 iteration 100 loss: 2.0361578464508057
    
 Epoch:  24 iteration 200 loss: 2.912736177444458
    
 Epoch 24 Training loss 2.2235630649205875
    
 Epoch:  25 iteration 0 loss: 2.335442304611206
    
 Epoch:  25 iteration 100 loss: 2.0128493309020996
    
 Epoch:  25 iteration 200 loss: 2.902696132659912
    
 Epoch 25 Training loss 2.2045435398182813
    
 Evaluation loss 3.1087384036663863
    
 Epoch:  26 iteration 0 loss: 2.257906913757324
    
 Epoch:  26 iteration 100 loss: 1.9572561979293823
    
 Epoch:  26 iteration 200 loss: 2.8583080768585205
    
 Epoch 26 Training loss 2.1859489336062077
    
 Epoch:  27 iteration 0 loss: 2.240891933441162
    
 Epoch:  27 iteration 100 loss: 1.9300264120101929
    
 Epoch:  27 iteration 200 loss: 2.8508572578430176
    
 Epoch 27 Training loss 2.1693027983038515
    
 Epoch:  28 iteration 0 loss: 2.199796199798584
    
 Epoch:  28 iteration 100 loss: 1.9422686100006104
    
 Epoch:  28 iteration 200 loss: 2.842454195022583
    
 Epoch 28 Training loss 2.1484814160984214
    
 Epoch:  29 iteration 0 loss: 2.1854031085968018
    
 Epoch:  29 iteration 100 loss: 1.9529454708099365
    
 Epoch:  29 iteration 200 loss: 2.848923444747925
    
 Epoch 29 Training loss 2.129414516738762
    
 Epoch:  30 iteration 0 loss: 2.1895618438720703
    
 Epoch:  30 iteration 100 loss: 1.871588110923767
    
 Epoch:  30 iteration 200 loss: 2.791942834854126
    
 Epoch 30 Training loss 2.113142051178803
    
 Evaluation loss 3.1089972194763527
    
 Epoch:  31 iteration 0 loss: 2.183242082595825
    
 Epoch:  31 iteration 100 loss: 1.8810741901397705
    
 Epoch:  31 iteration 200 loss: 2.779383897781372
    
 Epoch 31 Training loss 2.095987657767845
    
 Epoch:  32 iteration 0 loss: 2.0996744632720947
    
 Epoch:  32 iteration 100 loss: 1.8364850282669067
    
 Epoch:  32 iteration 200 loss: 2.7766530513763428
    
 Epoch 32 Training loss 2.077641033989847
    
 Epoch:  33 iteration 0 loss: 2.1275956630706787
    
 Epoch:  33 iteration 100 loss: 1.8858064413070679
    
 Epoch:  33 iteration 200 loss: 2.7581260204315186
    
 Epoch 33 Training loss 2.060825001092984
    
 Epoch:  34 iteration 0 loss: 2.0973703861236572
    
 Epoch:  34 iteration 100 loss: 1.851388692855835
    
 Epoch:  34 iteration 200 loss: 2.7524964809417725
    
 Epoch 34 Training loss 2.0462104783610435
    
 Epoch:  35 iteration 0 loss: 2.086354970932007
    
 Epoch:  35 iteration 100 loss: 1.8358268737792969
    
 Epoch:  35 iteration 200 loss: 2.731438398361206
    
 Epoch 35 Training loss 2.0299077402768404
    
 Evaluation loss 3.1139209169721624
    
 Epoch:  36 iteration 0 loss: 2.0591766834259033
    
 Epoch:  36 iteration 100 loss: 1.831368088722229
    
 Epoch:  36 iteration 200 loss: 2.6570539474487305
    
 Epoch 36 Training loss 2.014671925172371
    
 Epoch:  37 iteration 0 loss: 2.035496234893799
    
 Epoch:  37 iteration 100 loss: 1.8156630992889404
    
 Epoch:  37 iteration 200 loss: 2.700183391571045
    
 Epoch 37 Training loss 2.00206255805924
    
 Epoch:  38 iteration 0 loss: 2.036298990249634
    
 Epoch:  38 iteration 100 loss: 1.7919279336929321
    
 Epoch:  38 iteration 200 loss: 2.638498306274414
    
 Epoch 38 Training loss 1.983478224500046
    
 Epoch:  39 iteration 0 loss: 2.0249581336975098
    
 Epoch:  39 iteration 100 loss: 1.7389947175979614
    
 Epoch:  39 iteration 200 loss: 2.7169861793518066
    
 Epoch 39 Training loss 1.9724427386659686
    
 Epoch:  40 iteration 0 loss: 2.0175204277038574
    
 Epoch:  40 iteration 100 loss: 1.7219321727752686
    
 Epoch:  40 iteration 200 loss: 2.6475744247436523
    
 Epoch 40 Training loss 1.9562676721658385
    
 Evaluation loss 3.1181668797161364
    
 Epoch:  41 iteration 0 loss: 2.006847620010376
    
 Epoch:  41 iteration 100 loss: 1.7191071510314941
    
 Epoch:  41 iteration 200 loss: 2.6677799224853516
    
 Epoch 41 Training loss 1.9437097878349063
    
 Epoch:  42 iteration 0 loss: 1.9333022832870483
    
 Epoch:  42 iteration 100 loss: 1.7141562700271606
    
 Epoch:  42 iteration 200 loss: 2.5984952449798584
    
 Epoch 42 Training loss 1.9283085355908671
    
 Epoch:  43 iteration 0 loss: 1.9463298320770264
    
 Epoch:  43 iteration 100 loss: 1.717552900314331
    
 Epoch:  43 iteration 200 loss: 2.612987518310547
    
 Epoch 43 Training loss 1.9148052832706421
    
 Epoch:  44 iteration 0 loss: 1.9681422710418701
    
 Epoch:  44 iteration 100 loss: 1.7166101932525635
    
 Epoch:  44 iteration 200 loss: 2.593944549560547
    
 Epoch 44 Training loss 1.9044130284488674
    
 Epoch:  45 iteration 0 loss: 1.9368000030517578
    
 Epoch:  45 iteration 100 loss: 1.658645749092102
    
 Epoch:  45 iteration 200 loss: 2.593125581741333
    
 Epoch 45 Training loss 1.8893168467190844
    
 Evaluation loss 3.1277268276045214
    
 Epoch:  46 iteration 0 loss: 1.8545007705688477
    
 Epoch:  46 iteration 100 loss: 1.6403976678848267
    
 Epoch:  46 iteration 200 loss: 2.5595622062683105
    
 Epoch 46 Training loss 1.8757247360021512
    
 Epoch:  47 iteration 0 loss: 1.883792519569397
    
 Epoch:  47 iteration 100 loss: 1.6655203104019165
    
 Epoch:  47 iteration 200 loss: 2.551154851913452
    
 Epoch 47 Training loss 1.868178638252467
    
 Epoch:  48 iteration 0 loss: 1.8451733589172363
    
 Epoch:  48 iteration 100 loss: 1.6777702569961548
    
 Epoch:  48 iteration 200 loss: 2.501884937286377
    
 Epoch 48 Training loss 1.8518471154006044
    
 Epoch:  49 iteration 0 loss: 1.8499925136566162
    
 Epoch:  49 iteration 100 loss: 1.6486607789993286
    
 Epoch:  49 iteration 200 loss: 2.524087429046631
    
 Epoch 49 Training loss 1.8454946782718415
    
 Epoch:  50 iteration 0 loss: 1.856377363204956
    
 Epoch:  50 iteration 100 loss: 1.6574885845184326
    
 Epoch:  50 iteration 200 loss: 2.501849412918091
    
 Epoch 50 Training loss 1.8342453327073283
    
 Evaluation loss 3.1381525688403076
    
 Epoch:  51 iteration 0 loss: 1.8513492345809937
    
 Epoch:  51 iteration 100 loss: 1.6156225204467773
    
 Epoch:  51 iteration 200 loss: 2.546480178833008
    
 Epoch 51 Training loss 1.8206363293651437
    
 Epoch:  52 iteration 0 loss: 1.826798915863037
    
 Epoch:  52 iteration 100 loss: 1.5861092805862427
    
 Epoch:  52 iteration 200 loss: 2.486717462539673
    
 Epoch 52 Training loss 1.8091440575272268
    
 Epoch:  53 iteration 0 loss: 1.7943329811096191
    
 Epoch:  53 iteration 100 loss: 1.599743366241455
    
 Epoch:  53 iteration 200 loss: 2.4579596519470215
    
 Epoch 53 Training loss 1.7989700911108664
    
 Epoch:  54 iteration 0 loss: 1.7656499147415161
    
 Epoch:  54 iteration 100 loss: 1.5951091051101685
    
 Epoch:  54 iteration 200 loss: 2.4595048427581787
    
 Epoch 54 Training loss 1.7877836588768
    
 Epoch:  55 iteration 0 loss: 1.7756575345993042
    
 Epoch:  55 iteration 100 loss: 1.5770317316055298
    
 Epoch:  55 iteration 200 loss: 2.4162347316741943
    
 Epoch 55 Training loss 1.7794164511320347
    
 Evaluation loss 3.1487013315196815
    
 Epoch:  56 iteration 0 loss: 1.754793643951416
    
 Epoch:  56 iteration 100 loss: 1.546436071395874
    
 Epoch:  56 iteration 200 loss: 2.4273550510406494
    
 Epoch 56 Training loss 1.7669288957699174
    
 Epoch:  57 iteration 0 loss: 1.7600376605987549
    
 Epoch:  57 iteration 100 loss: 1.4999576807022095
    
 Epoch:  57 iteration 200 loss: 2.439790725708008
    
 Epoch 57 Training loss 1.7579986667589775
    
 Epoch:  58 iteration 0 loss: 1.7710247039794922
    
 Epoch:  58 iteration 100 loss: 1.5441653728485107
    
 Epoch:  58 iteration 200 loss: 2.411104202270508
    
 Epoch 58 Training loss 1.749948290134124
    
 Epoch:  59 iteration 0 loss: 1.7791287899017334
    
 Epoch:  59 iteration 100 loss: 1.5441499948501587
    
 Epoch:  59 iteration 200 loss: 2.4272119998931885
    
 Epoch 59 Training loss 1.7376091327428274
    
 Epoch:  60 iteration 0 loss: 1.7641197443008423
    
 Epoch:  60 iteration 100 loss: 1.505827784538269
    
 Epoch:  60 iteration 200 loss: 2.4162049293518066
    
 Epoch 60 Training loss 1.729162069608205
    
 Evaluation loss 3.1680270844662357
    
 Epoch:  61 iteration 0 loss: 1.719151258468628
    
 Epoch:  61 iteration 100 loss: 1.500209927558899
    
 Epoch:  61 iteration 200 loss: 2.4351766109466553
    
 Epoch 61 Training loss 1.7190746620618302
    
 Epoch:  62 iteration 0 loss: 1.7070326805114746
    
 Epoch:  62 iteration 100 loss: 1.50221848487854
    
 Epoch:  62 iteration 200 loss: 2.399951457977295
    
 Epoch 62 Training loss 1.707298602424269
    
 Epoch:  63 iteration 0 loss: 1.6960980892181396
    
 Epoch:  63 iteration 100 loss: 1.4736263751983643
    
 Epoch:  63 iteration 200 loss: 2.3375589847564697
    
 Epoch 63 Training loss 1.7027722406700785
    
 Epoch:  64 iteration 0 loss: 1.6605229377746582
    
 Epoch:  64 iteration 100 loss: 1.496120572090149
    
 Epoch:  64 iteration 200 loss: 2.377760887145996
    
 Epoch 64 Training loss 1.6901847218926664
    
 Epoch:  65 iteration 0 loss: 1.7002284526824951
    
 Epoch:  65 iteration 100 loss: 1.463133454322815
    
 Epoch:  65 iteration 200 loss: 2.377936601638794
    
 Epoch 65 Training loss 1.6831096865487802
    
 Evaluation loss 3.177895229637778
    
 Epoch:  66 iteration 0 loss: 1.6268677711486816
    
 Epoch:  66 iteration 100 loss: 1.5310866832733154
    
 Epoch:  66 iteration 200 loss: 2.3395535945892334
    
 Epoch 66 Training loss 1.6750581275368728
    
 Epoch:  67 iteration 0 loss: 1.683242678642273
    
 Epoch:  67 iteration 100 loss: 1.4536606073379517
    
 Epoch:  67 iteration 200 loss: 2.33609938621521
    
 Epoch 67 Training loss 1.6638375889732597
    
 Epoch:  68 iteration 0 loss: 1.6539921760559082
    
 Epoch:  68 iteration 100 loss: 1.4477794170379639
    
 Epoch:  68 iteration 200 loss: 2.3414015769958496
    
 Epoch 68 Training loss 1.6606883198725237
    
 Epoch:  69 iteration 0 loss: 1.6292625665664673
    
 Epoch:  69 iteration 100 loss: 1.404828667640686
    
 Epoch:  69 iteration 200 loss: 2.321927547454834
    
 Epoch 69 Training loss 1.6506938973182488
    
 Epoch:  70 iteration 0 loss: 1.6185498237609863
    
 Epoch:  70 iteration 100 loss: 1.4216632843017578
    
 Epoch:  70 iteration 200 loss: 2.3253204822540283
    
 Epoch 70 Training loss 1.6387621088477575
    
 Evaluation loss 3.1902488400655886
    
 Epoch:  71 iteration 0 loss: 1.6030402183532715
    
 Epoch:  71 iteration 100 loss: 1.4137858152389526
    
 Epoch:  71 iteration 200 loss: 2.3256776332855225
    
 Epoch 71 Training loss 1.6318460844078808
    
 Epoch:  72 iteration 0 loss: 1.6068423986434937
    
 Epoch:  72 iteration 100 loss: 1.4504164457321167
    
 Epoch:  72 iteration 200 loss: 2.3437039852142334
    
 Epoch 72 Training loss 1.6246998589395558
    
 Epoch:  73 iteration 0 loss: 1.5764877796173096
    
 Epoch:  73 iteration 100 loss: 1.3730628490447998
    
 Epoch:  73 iteration 200 loss: 2.264051675796509
    
 Epoch 73 Training loss 1.6186856142415567
    
 Epoch:  74 iteration 0 loss: 1.5833429098129272
    
 Epoch:  74 iteration 100 loss: 1.381920576095581
    
 Epoch:  74 iteration 200 loss: 2.2876336574554443
    
 Epoch 74 Training loss 1.6106610198597258
    
 Epoch:  75 iteration 0 loss: 1.5880494117736816
    
 Epoch:  75 iteration 100 loss: 1.4044418334960938
    
 Epoch:  75 iteration 200 loss: 2.2574541568756104
    
 Epoch 75 Training loss 1.5998829403443475
    
 Evaluation loss 3.205575323503987
    
 Epoch:  76 iteration 0 loss: 1.5913504362106323
    
 Epoch:  76 iteration 100 loss: 1.3733941316604614
    
 Epoch:  76 iteration 200 loss: 2.273179292678833
    
 Epoch 76 Training loss 1.5944278182877876
    
 Epoch:  77 iteration 0 loss: 1.574967861175537
    
 Epoch:  77 iteration 100 loss: 1.4105134010314941
    
 Epoch:  77 iteration 200 loss: 2.260707139968872
    
 Epoch 77 Training loss 1.5890476528108952
    
 Epoch:  78 iteration 0 loss: 1.5877436399459839
    
 Epoch:  78 iteration 100 loss: 1.3723187446594238
    
 Epoch:  78 iteration 200 loss: 2.266782760620117
    
 Epoch 78 Training loss 1.580453802036902
    
 Epoch:  79 iteration 0 loss: 1.540144920349121
    
 Epoch:  79 iteration 100 loss: 1.370208978652954
    
 Epoch:  79 iteration 200 loss: 2.2479166984558105
    
 Epoch 79 Training loss 1.5723614631359557
    
 Epoch:  80 iteration 0 loss: 1.5240201950073242
    
 Epoch:  80 iteration 100 loss: 1.3667224645614624
    
 Epoch:  80 iteration 200 loss: 2.2798657417297363
    
 Epoch 80 Training loss 1.5671947631266923
    
 Evaluation loss 3.2182803124543784
    
 Epoch:  81 iteration 0 loss: 1.5349093675613403
    
 Epoch:  81 iteration 100 loss: 1.341757893562317
    
 Epoch:  81 iteration 200 loss: 2.2628333568573
    
 Epoch 81 Training loss 1.5582374857442876
    
 Epoch:  82 iteration 0 loss: 1.4877135753631592
    
 Epoch:  82 iteration 100 loss: 1.3469762802124023
    
 Epoch:  82 iteration 200 loss: 2.2514214515686035
    
 Epoch 82 Training loss 1.5549645483978292
    
 Epoch:  83 iteration 0 loss: 1.5119167566299438
    
 Epoch:  83 iteration 100 loss: 1.3386821746826172
    
 Epoch:  83 iteration 200 loss: 2.2184598445892334
    
 Epoch 83 Training loss 1.546844436348798
    
 Epoch:  84 iteration 0 loss: 1.4820687770843506
    
 Epoch:  84 iteration 100 loss: 1.3448508977890015
    
 Epoch:  84 iteration 200 loss: 2.199396848678589
    
 Epoch 84 Training loss 1.5380232074195026
    
 Epoch:  85 iteration 0 loss: 1.4752027988433838
    
 Epoch:  85 iteration 100 loss: 1.316656231880188
    
 Epoch:  85 iteration 200 loss: 2.228752374649048
    
 Epoch 85 Training loss 1.52975351648403
    
 Evaluation loss 3.2336413650535087
    
 Epoch:  86 iteration 0 loss: 1.499496340751648
    
 Epoch:  86 iteration 100 loss: 1.3332045078277588
    
 Epoch:  86 iteration 200 loss: 2.2489013671875
    
 Epoch 86 Training loss 1.5249615564712846
    
 Epoch:  87 iteration 0 loss: 1.50925874710083
    
 Epoch:  87 iteration 100 loss: 1.3083447217941284
    
 Epoch:  87 iteration 200 loss: 2.235308885574341
    
 Epoch 87 Training loss 1.5197892824018502
    
 Epoch:  88 iteration 0 loss: 1.4814422130584717
    
 Epoch:  88 iteration 100 loss: 1.3245668411254883
    
 Epoch:  88 iteration 200 loss: 2.193997859954834
    
 Epoch 88 Training loss 1.5135974575387956
    
 Epoch:  89 iteration 0 loss: 1.4810220003128052
    
 Epoch:  89 iteration 100 loss: 1.2921677827835083
    
 Epoch:  89 iteration 200 loss: 2.1645917892456055
    
 Epoch 89 Training loss 1.5075664417517958
    
 Epoch:  90 iteration 0 loss: 1.4697095155715942
    
 Epoch:  90 iteration 100 loss: 1.2751893997192383
    
 Epoch:  90 iteration 200 loss: 2.188906669616699
    
 Epoch 90 Training loss 1.5008888401218585
    
 Evaluation loss 3.2456318169295293
    
 Epoch:  91 iteration 0 loss: 1.4636540412902832
    
 Epoch:  91 iteration 100 loss: 1.3394463062286377
    
 Epoch:  91 iteration 200 loss: 2.192689895629883
    
 Epoch 91 Training loss 1.4943399774943313
    
 Epoch:  92 iteration 0 loss: 1.4552161693572998
    
 Epoch:  92 iteration 100 loss: 1.2322344779968262
    
 Epoch:  92 iteration 200 loss: 2.1635537147521973
    
 Epoch 92 Training loss 1.488440135669707
    
 Epoch:  93 iteration 0 loss: 1.4642064571380615
    
 Epoch:  93 iteration 100 loss: 1.2490650415420532
    
 Epoch:  93 iteration 200 loss: 2.137782573699951
    
 Epoch 93 Training loss 1.4828345331954083
    
 Epoch:  94 iteration 0 loss: 1.425548791885376
    
 Epoch:  94 iteration 100 loss: 1.2757179737091064
    
 Epoch:  94 iteration 200 loss: 2.1594502925872803
    
 Epoch 94 Training loss 1.47362902414513
    
 Epoch:  95 iteration 0 loss: 1.4208916425704956
    
 Epoch:  95 iteration 100 loss: 1.260089635848999
    
 Epoch:  95 iteration 200 loss: 2.1245341300964355
    
 Epoch 95 Training loss 1.468862286276855
    
 Evaluation loss 3.265405671529478
    
 Epoch:  96 iteration 0 loss: 1.413726568222046
    
 Epoch:  96 iteration 100 loss: 1.2730776071548462
    
 Epoch:  96 iteration 200 loss: 2.1034820079803467
    
 Epoch 96 Training loss 1.464572765902645
    
 Epoch:  97 iteration 0 loss: 1.3888133764266968
    
 Epoch:  97 iteration 100 loss: 1.29197096824646
    
 Epoch:  97 iteration 200 loss: 2.159865617752075
    
 Epoch 97 Training loss 1.4591572745032382
    
 Epoch:  98 iteration 0 loss: 1.3947553634643555
    
 Epoch:  98 iteration 100 loss: 1.271963119506836
    
 Epoch:  98 iteration 200 loss: 2.1502716541290283
    
 Epoch 98 Training loss 1.4532260618277022
    
 Epoch:  99 iteration 0 loss: 1.4218417406082153
    
 Epoch:  99 iteration 100 loss: 1.2315309047698975
    
 Epoch:  99 iteration 200 loss: 2.12766695022583
    
 Epoch 99 Training loss 1.4487215552807855

6. Translation

def translate_dev(i):
    en_sent = " ".join([inv_en_dict[w] for w in dev_en[i]])  # the original English
    print(en_sent)
    cn_sent = " ".join([inv_cn_dict[w] for w in dev_cn[i]])  # the original Chinese
    print("".join(cn_sent))

    # a single sentence
    mb_x = torch.from_numpy(np.array(dev_en[i]).reshape(1, -1)).long().to(device)
    mb_x_len = torch.from_numpy(np.array([len(dev_en[i])])).long().to(device)
    bos = torch.Tensor([[cn_dict["BOS"]]]).long().to(device)  # shape: [1, 1], i.e. [[2]]

    # bos is the initial y; translate then generates greedily from it
    translation, attn = model.translate(mb_x, mb_x_len, bos)  # [1, 10]
    # map the indices back to Chinese characters
    translation = [inv_cn_dict[i] for i in translation.data.cpu().numpy().reshape(-1)]
    trans = []
    for word in translation:
        if word != "EOS":
            trans.append(word)
        else:
            break
    print("".join(trans))           # the translated Chinese

# load the trained model
model.load_state_dict(torch.load('translate_model.pt', map_location=device))
for i in range(100, 120):
    translate_dev(i)
    print()

Results (the corpus is small and training is short, so the quality is poor):

 BOS you have nice skin . EOS

    
 BOS 你 的 皮 膚 真 好 。 EOS
    
 你只有一些蛋糕。
    
  
    
 BOS you 're UNK correct . EOS
    
 BOS 你 部 分 正 确 。 EOS
    
 你可以选择。
    
  
    
 BOS everyone admired his courage . EOS
    
 BOS 每 個 人 都 佩 服 他 的 勇 氣 。 EOS
    
 每個人都抨擊他的健康
    
  
    
 BOS what time is it ? EOS
    
 BOS 几 点 了 ? EOS
    
 那是什么?
    
  
    
 BOS i 'm free tonight . EOS
    
 BOS 我 今 晚 有 空 。 EOS
    
 我今晚有空。
    
  
    
 BOS here is your book . EOS
    
 BOS 這 是 你 的 書 。 EOS
    
 那是你的书。
    
  
    
 BOS they are at lunch . EOS
    
 BOS 他 们 在 吃 午 饭 。 EOS
    
 他們正在吃午飯。
    
  
    
 BOS this chair is UNK . EOS
    
 BOS 這 把 椅 子 很 UNK 。 EOS
    
 这本书非常兴奋。
    
  
    
 BOS it 's pretty heavy . EOS
    
 BOS 它 真 重 。 EOS
    
 它是最好的。
    
  
    
 BOS many attended his funeral . EOS
    
 BOS 很 多 人 都 参 加 了 他 的 葬 礼 。 EOS
    
 在这个男人正在看他。
    
  
    
 BOS training will be provided . EOS
    
 BOS 会 有 训 练 。 EOS
    
 努力停為下雪停。
    
  
    
 BOS someone is watching you . EOS
    
 BOS 有 人 在 看 著 你 。 EOS
    
 有很多就了。
    
  
    
 BOS i slapped his face . EOS
    
 BOS 我 摑 了 他 的 臉 。 EOS
    
 我认为我的狗。
    
  
    
 BOS i like UNK music . EOS
    
 BOS 我 喜 歡 流 行 音 樂 。 EOS
    
 我喜欢音乐。
    
  
    
 BOS tom had no children . EOS
    
 BOS T o m 沒 有 孩 子 。 EOS
    
 她的父親沒有聽盲。
    
  
    
 BOS please lock the door . EOS
    
 BOS 請 把 門 鎖 上 。 EOS
    
 請關上門。
    
  
    
 BOS tom has calmed down . EOS
    
 BOS 汤 姆 冷 静 下 来 了 。 EOS
    
 汤姆坐在机器。
    
  
    
 BOS please speak more loudly . EOS
    
 BOS 請 說 大 聲 一 點 兒 。 EOS
    
 請說話再說話。
    
  
    
 BOS keep next sunday free . EOS
    
 BOS 把 下 周 日 空 出 来 。 EOS
    
 星星期天下雨。
    
  
    
 BOS i made a mistake . EOS
    
 BOS 我 犯 了 一 個 錯 。 EOS
    
 我一直成為一個演員。

7. Encoder-Decoder model (with attention)

7.1 Encoder

The encoder passes the input text through an embedding layer and a bidirectional GRU, turning it into hidden states that later serve as context vectors.

class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2):
        super(Encoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, enc_hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(enc_hidden_size * 2, dec_hidden_size)

    def forward(self, x, lengths):
        sorted_len, sorted_idx = lengths.sort(0, descending=True)
        x_sorted = x[sorted_idx.long()]
        embedded = self.dropout(self.embed(x_sorted))

        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_len.long().cpu().data.numpy(), batch_first=True)
        packed_out, hid = self.rnn(packed_embedded)
        out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        out = out[original_idx.long()].contiguous()
        hid = hid[:, original_idx.long()].contiguous()
        # hid: [2, batch_size, enc_hidden_size]

        hid = torch.cat([hid[-2], hid[-1]], dim=1)  # concatenate the two directions of the last layer
        # hid: [batch_size, 2*enc_hidden_size]
        hid = torch.tanh(self.fc(hid)).unsqueeze(0)
        # hid: [1, batch_size, dec_hidden_size]
        # out: [batch_size, seq_len, 2*enc_hidden_size]
        return out, hid
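A quick shape check, mirroring the PlainEncoder test above (a sketch; the sizes here are chosen only for illustration):

enc = Encoder(vocab_size=en_total_words, embed_size=100,
              enc_hidden_size=100, dec_hidden_size=100)
mb_x = torch.from_numpy(train_data[0][0]).long()
mb_x_len = torch.from_numpy(train_data[0][1]).long()
out, hid = enc(mb_x, mb_x_len)
print(out.shape, hid.shape)   # [batch, seq_len, 2*enc_hidden_size] and [1, batch, dec_hidden_size]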

7.2 Luong Attention

In the figure, h_t is the GRU output at decoder step t (the output above).

\hat{h}_s are the encoder hidden states, i.e. the context; in query-key-value terms, the query is h_t and the keys (and values) are \hat{h}_s.

The output is then computed from the context vectors and the current decoder hidden states.
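For reference, the Luong (global, general-score) attention implemented below can be written as follows, where W_a is linear_in, W_c is linear_out, h_t the decoder states and \hat{h}_s the encoder states:

score(h_t, \hat{h}_s) = h_t^{\top} W_a \hat{h}_s
\alpha_t(s) = \exp(score(h_t, \hat{h}_s)) / \sum_{s'} \exp(score(h_t, \hat{h}_{s'}))
c_t = \sum_s \alpha_t(s) \, \hat{h}_s
\tilde{h}_t = \tanh(W_c [c_t; h_t])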

class Attention(nn.Module):
    def __init__(self, enc_hidden_size, dec_hidden_size):
        # enc_hidden_size is the same as in the Encoder
        super(Attention, self).__init__()
        self.enc_hidden_size = enc_hidden_size
        self.dec_hidden_size = dec_hidden_size

        self.linear_in = nn.Linear(enc_hidden_size * 2, dec_hidden_size, bias=False)
        self.linear_out = nn.Linear(enc_hidden_size * 2 + dec_hidden_size, dec_hidden_size)

    def forward(self, output, context, mask):
        # mask: batch_size, output_len, context_len      # built in the Decoder
        # output: batch_size, output_len, dec_hidden_size; the Decoder's output
        # context: batch_size, context_len, 2*enc_hidden_size; the Encoder's output
        # the Encoder is bidirectional here, the Decoder is unidirectional

        batch_size = output.size(0)
        output_len = output.size(1)
        input_len = context.size(1)  # input_len = context_len

        # a score is computed from the decoder hidden states and the encoder hidden states, then turned into weights
        # batch_size, context_len, dec_hidden_size
        # step 1: multiply the encoder output by W_a, mapping 2*enc_hidden_size to dec_hidden_size
        # Q: W·context
        context_in = self.linear_in(context.view(batch_size * input_len, -1)).view(
                                    batch_size, input_len, -1)

        # Q·K
        # context_in.transpose(1, 2): batch_size, dec_hidden_size, context_len
        # output: batch_size, output_len, dec_hidden_size
        attn = torch.bmm(output, context_in.transpose(1, 2))
        # batch_size, output_len, context_len
        # step 2: the dot product of h_t with the result above gives the score

        attn.data.masked_fill_(mask, -1e6)
        # masked_fill_: the mask must match attn's shape; positions where mask is True are set to -1e6,
        # so they get (almost) zero weight after the softmax
        # (what "True" means here is defined in Decoder.create_mask)

        attn = F.softmax(attn, dim=2)
        # batch_size, output_len, context_len
        # softmax over dim=2, i.e. over the context positions (see the small example below)
        # step 3: these are the weights of the encoder hidden states

        # context: batch_size, context_len, 2*enc_hidden_size
        context = torch.bmm(attn, context)
        # batch_size, output_len, 2*enc_hidden_size
        # step 4: the context vector is a weighted average of the encoder hidden states

        # output: batch_size, output_len, dec_hidden_size
        output = torch.cat((context, output), dim=2)
        # output: batch_size, output_len, 2*enc_hidden_size + dec_hidden_size
        # step 5: concatenate the context vector with the decoder hidden states

        output = output.view(batch_size * output_len, -1)
        # output.shape = (batch_size*output_len, 2*enc_hidden_size + dec_hidden_size)
        output = torch.tanh(self.linear_out(output))
        # output.shape = (batch_size*output_len, dec_hidden_size)
        output = output.view(batch_size, output_len, -1)
        # output.shape = (batch_size, output_len, dec_hidden_size)
        # attn.shape = (batch_size, output_len, context_len)
        return output, attn
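The comment above points to a small example of how the softmax over dim=2 behaves; here is a minimal sketch (toy tensor, values chosen only for illustration):

a = torch.tensor([[[1.0, 2.0, 3.0],
                   [1.0, 1.0, 1.0]]])   # shape [1, 2, 3]: batch=1, output_len=2, context_len=3
print(F.softmax(a, dim=2))
# tensor([[[0.0900, 0.2447, 0.6652],
#          [0.3333, 0.3333, 0.3333]]])
# each row (one decoder step) sums to 1 over the 3 context positions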

7.3 Decoder

The decoder decides on the next output word based on the words translated so far and the context vectors.

class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout=0.2):
        super(Decoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.attention = Attention(enc_hidden_size, dec_hidden_size)
        self.rnn = nn.GRU(embed_size, dec_hidden_size, batch_first=True)
        self.out = nn.Linear(dec_hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def create_mask(self, x_len, y_len):
        # x_len: the lengths of the Chinese (output) sentences in the batch
        # y_len: the lengths of the English (context) sentences in the batch
        # builds a mask of shape output_len * context_len per sentence
        device = x_len.device
        max_x_len = x_len.max()
        max_y_len = y_len.max()

        x_mask = torch.arange(max_x_len, device=device)[None, :] < x_len[:, None]
        # x_mask.shape = (batch_size, output_len); mask of the Chinese sentences
        y_mask = torch.arange(max_y_len, device=device)[None, :] < y_len[:, None]
        # y_mask.shape = (batch_size, context_len); mask of the English sentences

        mask = ~(x_mask[:, :, None] & y_mask[:, None, :])
        # True where either the output position or the context position is padding;
        # these are the positions that Attention fills with -1e6
        # x_mask[:, :, None] : (batch_size, output_len, 1)
        # y_mask[:, None, :] : (batch_size, 1, context_len)
        # mask.shape = (batch_size, output_len, context_len)
        # note: the & here relies on broadcasting; it is not a torch.bmm matrix product
        return mask

    def forward(self, encoder_out, x_lengths, y, y_lengths, hid):
        sorted_len, sorted_idx = y_lengths.sort(0, descending=True)
        y_sorted = y[sorted_idx.long()]
        hid = hid[:, sorted_idx.long()]

        y_sorted = self.dropout(self.embed(y_sorted))  # batch_size, output_len, embed_size

        packed_seq = nn.utils.rnn.pack_padded_sequence(y_sorted, sorted_len.long().cpu().data.numpy(), batch_first=True)
        out, hid = self.rnn(packed_seq, hid)
        unpacked, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        _, original_idx = sorted_idx.sort(0, descending=False)
        output_seq = unpacked[original_idx.long()].contiguous()
        hid = hid[:, original_idx.long()].contiguous()

        mask = self.create_mask(y_lengths, x_lengths)  # careful: the first argument is the Chinese (output) lengths

        output, attn = self.attention(output_seq, encoder_out, mask)
        # output.shape = (batch_size, output_len, dec_hidden_size)
        # attn.shape = (batch_size, output_len, context_len)

        # self.out = nn.Linear(dec_hidden_size, vocab_size)
        output = F.log_softmax(self.out(output), -1)  # log-probabilities over the vocabulary
        # output.shape = (batch_size, output_len, vocab_size); log_softmax over the last (vocab_size) dimension
        # hid.shape = (1, batch_size, dec_hidden_size)
        return output, hid, attn
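A small sketch of what create_mask produces with the code as written above (toy lengths; True marks the attention positions that will be filled with -1e6):

dec = Decoder(vocab_size=10, embed_size=8, enc_hidden_size=8, dec_hidden_size=8)
mask = dec.create_mask(torch.tensor([2, 3]),   # output (Chinese) lengths
                       torch.tensor([3, 2]))   # context (English) lengths
print(mask.shape)      # torch.Size([2, 3, 3])
print(mask[0].int())
# tensor([[0, 0, 0],
#         [0, 0, 0],
#         [1, 1, 1]], dtype=torch.int32)   -> the 3rd output step of sentence 0 is padding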

7.4 Seq2Seq

Finally, the Seq2Seq model wires the encoder, attention, and decoder together.

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x, x_lengths, y, y_lengths):
        encoder_out, hid = self.encoder(x, x_lengths)
        # hid.shape = torch.Size([1, batch_size, dec_hidden_size])
        # encoder_out.shape = torch.Size([batch_size, seq_len, 2*enc_hidden_size])
        output, hid, attn = self.decoder(encoder_out=encoder_out,
                                         x_lengths=x_lengths,
                                         y=y,
                                         y_lengths=y_lengths,
                                         hid=hid)
        # output.shape = (batch_size, output_len, vocab_size)
        # hid.shape = (1, batch_size, dec_hidden_size)
        # attn.shape = (batch_size, output_len, context_len)
        return output, attn

    def translate(self, x, x_lengths, y, max_length=100):
        encoder_out, hid = self.encoder(x, x_lengths)
        preds = []
        batch_size = x.shape[0]
        attns = []
        for i in range(max_length):
            output, hid, attn = self.decoder(encoder_out,
                                             x_lengths,
                                             y,
                                             torch.ones(batch_size).long().to(y.device),
                                             hid)
            y = output.max(2)[1].view(batch_size, 1)
            preds.append(y)
            attns.append(attn)

        return torch.cat(preds, 1), torch.cat(attns, 1)

8. Building the attention model and calling the train function defined above

dropout = 0.2
embed_size = hidden_size = 100
encoder = Encoder(vocab_size=en_total_words,
                  embed_size=embed_size,
                  enc_hidden_size=hidden_size,
                  dec_hidden_size=hidden_size,
                  dropout=dropout)
decoder = Decoder(vocab_size=cn_total_words,
                  embed_size=embed_size,
                  enc_hidden_size=hidden_size,
                  dec_hidden_size=hidden_size,
                  dropout=dropout)
model = Seq2Seq(encoder, decoder)
model = model.to(device)
loss_fn = LanguageModelCriterion().to(device)
optimizer = torch.optim.Adam(model.parameters())

train(model, train_data, num_epochs=100)
 /usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:25: UserWarning: masked_fill_ received a mask with dtype torch.uint8, this behavior is now deprecated,please use a mask with dtype torch.bool instead. (Triggered internally at  /pytorch/aten/src/ATen/native/cuda/LegacyDefinitions.cpp:19.)

    
 Epoch 0 iteration 0 loss 8.077441215515137
    
 Epoch 0 iteration 100 loss 5.345982551574707
    
 Epoch 0 iteration 200 loss 4.56335973739624
    
 Epoch 0 Training loss 5.50921318013691
    
 Evaluation loss 5.080491080824646
    
 Epoch 1 iteration 0 loss 4.47300386428833
    
 Epoch 1 iteration 100 loss 4.909076690673828
    
 Epoch 1 iteration 200 loss 4.016790390014648
    
 Epoch 1 Training loss 4.876065829219002
    
 Epoch 2 iteration 0 loss 3.9774909019470215
    
 Epoch 2 iteration 100 loss 4.472506046295166
    
 Epoch 2 iteration 200 loss 3.612961530685425
    
 Epoch 2 Training loss 4.438564572733501
    
 Epoch 3 iteration 0 loss 3.582581043243408
    
 Epoch 3 iteration 100 loss 4.136115074157715
    
 Epoch 3 iteration 200 loss 3.3212907314300537
    
 Epoch 3 Training loss 4.112743628998822
    
 Epoch 4 iteration 0 loss 3.2368381023406982
    
 Epoch 4 iteration 100 loss 3.8409037590026855
    
 Epoch 4 iteration 200 loss 3.097996711730957
    
 Epoch 4 Training loss 3.8477170270406864
    
 Epoch 5 iteration 0 loss 3.0059776306152344
    
 Epoch 5 iteration 100 loss 3.6137866973876953
    
 Epoch 5 iteration 200 loss 2.8685319423675537
    
 Epoch 5 Training loss 3.62822900357212
    
 Evaluation loss 3.637867412334476
    
 Epoch 6 iteration 0 loss 2.742856979370117
    
 Epoch 6 iteration 100 loss 3.390110492706299
    
 Epoch 6 iteration 200 loss 2.6777687072753906
    
 Epoch 6 Training loss 3.438536527389024
    
 Epoch 7 iteration 0 loss 2.585566759109497
    
 Epoch 7 iteration 100 loss 3.237795352935791
    
 Epoch 7 iteration 200 loss 2.5204241275787354
    
 Epoch 7 Training loss 3.27654657979662
    
 Epoch 8 iteration 0 loss 2.4295897483825684
    
 Epoch 8 iteration 100 loss 3.1119232177734375
    
 Epoch 8 iteration 200 loss 2.3597609996795654
    
 Epoch 8 Training loss 3.134849904339776
    
 Epoch 9 iteration 0 loss 2.2652432918548584
    
 Epoch 9 iteration 100 loss 2.9519033432006836
    
 Epoch 9 iteration 200 loss 2.217094898223877
    
 Epoch 9 Training loss 3.0061632458155874
    
 Epoch 10 iteration 0 loss 2.1327497959136963
    
 Epoch 10 iteration 100 loss 2.851846694946289
    
 Epoch 10 iteration 200 loss 2.1458141803741455
    
 Epoch 10 Training loss 2.894426641793655
    
 Evaluation loss 3.166497583308483
    
 Epoch 11 iteration 0 loss 2.013716697692871
    
 Epoch 11 iteration 100 loss 2.7616653442382812
    
 Epoch 11 iteration 200 loss 2.0029869079589844
    
 Epoch 11 Training loss 2.791488365026667
    
 Epoch 12 iteration 0 loss 1.9475183486938477
    
 Epoch 12 iteration 100 loss 2.647017240524292
    
 Epoch 12 iteration 200 loss 1.909979224205017
    
 Epoch 12 Training loss 2.698569336892456
    
 Epoch 13 iteration 0 loss 1.823117733001709
    
 Epoch 13 iteration 100 loss 2.6043999195098877
    
 Epoch 13 iteration 200 loss 1.8382450342178345
    
 Epoch 13 Training loss 2.616960156850951
    
 Epoch 14 iteration 0 loss 1.7701350450515747
    
 Epoch 14 iteration 100 loss 2.528083086013794
    
 Epoch 14 iteration 200 loss 1.7523369789123535
    
 Epoch 14 Training loss 2.5364692366823496
    
 Epoch 15 iteration 0 loss 1.6475502252578735
    
 Epoch 15 iteration 100 loss 2.4581422805786133
    
 Epoch 15 iteration 200 loss 1.7099241018295288
    
 Epoch 15 Training loss 2.4666260303200516
    
 Evaluation loss 2.96595491125677
    
 Epoch 16 iteration 0 loss 1.5571707487106323
    
 Epoch 16 iteration 100 loss 2.3642022609710693
    
 Epoch 16 iteration 200 loss 1.6701610088348389
    
 Epoch 16 Training loss 2.3992404009048993
    
 Epoch 17 iteration 0 loss 1.5091164112091064
    
 Epoch 17 iteration 100 loss 2.3246700763702393
    
 Epoch 17 iteration 200 loss 1.5856270790100098
    
 Epoch 17 Training loss 2.3398954671301877
    
 Epoch 18 iteration 0 loss 1.4500510692596436
    
 Epoch 18 iteration 100 loss 2.3111109733581543
    
 Epoch 18 iteration 200 loss 1.5008033514022827
    
 Epoch 18 Training loss 2.2817300454663068
    
 Epoch 19 iteration 0 loss 1.3648465871810913
    
 Epoch 19 iteration 100 loss 2.2263357639312744
    
 Epoch 19 iteration 200 loss 1.434478521347046
    
 Epoch 19 Training loss 2.2250880660919448
    
 Epoch 20 iteration 0 loss 1.29836106300354
    
 Epoch 20 iteration 100 loss 2.170522928237915
    
 Epoch 20 iteration 200 loss 1.413167119026184
    
 Epoch 20 Training loss 2.174868286439991
    
 Evaluation loss 2.862008639379293
    
 Epoch 21 iteration 0 loss 1.2679147720336914
    
 Epoch 21 iteration 100 loss 2.1024975776672363
    
 Epoch 21 iteration 200 loss 1.3479344844818115
    
 Epoch 21 Training loss 2.124773566655596
    
 Epoch 22 iteration 0 loss 1.2715562582015991
    
 Epoch 22 iteration 100 loss 2.0454132556915283
    
 Epoch 22 iteration 200 loss 1.2550404071807861
    
 Epoch 22 Training loss 2.0813773049198834
    
 Epoch 23 iteration 0 loss 1.204933762550354
    
 Epoch 23 iteration 100 loss 1.986390471458435
    
 Epoch 23 iteration 200 loss 1.3080803155899048
    
 Epoch 23 Training loss 2.035502688247159
    
 Epoch 24 iteration 0 loss 1.1525975465774536
    
 Epoch 24 iteration 100 loss 2.010538101196289
    
 Epoch 24 iteration 200 loss 1.2282871007919312
    
 Epoch 24 Training loss 1.9932144449453215
    
 Epoch 25 iteration 0 loss 1.1036208868026733
    
 Epoch 25 iteration 100 loss 1.9166961908340454
    
 Epoch 25 iteration 200 loss 1.1343692541122437
    
 Epoch 25 Training loss 1.9600739742604965
    
 Evaluation loss 2.8176820923223045
    
 Epoch 26 iteration 0 loss 1.126081109046936
    
 Epoch 26 iteration 100 loss 1.8861745595932007
    
 Epoch 26 iteration 200 loss 1.1452618837356567
    
 Epoch 26 Training loss 1.9179931864284319
    
 Epoch 27 iteration 0 loss 1.0936931371688843
    
 Epoch 27 iteration 100 loss 1.8307372331619263
    
 Epoch 27 iteration 200 loss 1.1571146249771118
    
 Epoch 27 Training loss 1.8831396913691085
    
 Epoch 28 iteration 0 loss 1.0479011535644531
    
 Epoch 28 iteration 100 loss 1.8134833574295044
    
 Epoch 28 iteration 200 loss 1.1056196689605713
    
 Epoch 28 Training loss 1.8484488868290145
    
 Epoch 29 iteration 0 loss 1.0205118656158447
    
 Epoch 29 iteration 100 loss 1.821661353111267
    
 Epoch 29 iteration 200 loss 1.0737680196762085
    
 Epoch 29 Training loss 1.8186136229030332
    
 Epoch 30 iteration 0 loss 0.9615429043769836
    
 Epoch 30 iteration 100 loss 1.7652055025100708
    
 Epoch 30 iteration 200 loss 0.9891017079353333
    
 Epoch 30 Training loss 1.7838154237577641
    
 Evaluation loss 2.791978492601989
    
 Epoch 31 iteration 0 loss 0.9656916856765747
    
 Epoch 31 iteration 100 loss 1.7245019674301147
    
 Epoch 31 iteration 200 loss 1.0227261781692505
    
 Epoch 31 Training loss 1.7579890261914233
    
 Epoch 32 iteration 0 loss 0.950885534286499
    
 Epoch 32 iteration 100 loss 1.7047593593597412
    
 Epoch 32 iteration 200 loss 1.0126252174377441
    
 Epoch 32 Training loss 1.7265817618896626
    
 Epoch 33 iteration 0 loss 0.9383729696273804
    
 Epoch 33 iteration 100 loss 1.7073816061019897
    
 Epoch 33 iteration 200 loss 0.9319257736206055
    
 Epoch 33 Training loss 1.701657226905382
    
 Epoch 34 iteration 0 loss 0.8925782442092896
    
 Epoch 34 iteration 100 loss 1.6764633655548096
    
 Epoch 34 iteration 200 loss 0.9110333323478699
    
 Epoch 34 Training loss 1.6714374329267176
    
 Epoch 35 iteration 0 loss 0.9124199748039246
    
 Epoch 35 iteration 100 loss 1.5932414531707764
    
 Epoch 35 iteration 200 loss 0.9045222997665405
    
 Epoch 35 Training loss 1.6459569074645013
    
 Evaluation loss 2.7976669954047697
    
 Epoch 36 iteration 0 loss 0.8820086121559143
    
 Epoch 36 iteration 100 loss 1.5867435932159424
    
 Epoch 36 iteration 200 loss 0.88615483045578
    
 Epoch 36 Training loss 1.6248752288905044
    
 Epoch 37 iteration 0 loss 0.8861231803894043
    
 Epoch 37 iteration 100 loss 1.540147304534912
    
 Epoch 37 iteration 200 loss 0.8625170588493347
    
 Epoch 37 Training loss 1.6025891727084938
    
 Epoch 38 iteration 0 loss 0.8272038698196411
    
 Epoch 38 iteration 100 loss 1.5469865798950195
    
 Epoch 38 iteration 200 loss 0.8701044321060181
    
 Epoch 38 Training loss 1.5775597927062583
    
 Epoch 39 iteration 0 loss 0.7841694951057434
    
 Epoch 39 iteration 100 loss 1.587996244430542
    
 Epoch 39 iteration 200 loss 0.8621845245361328
    
 Epoch 39 Training loss 1.5550835649611023
    
 Epoch 40 iteration 0 loss 0.7730535268783569
    
 Epoch 40 iteration 100 loss 1.510125756263733
    
 Epoch 40 iteration 200 loss 0.8023701906204224
    
 Epoch 40 Training loss 1.536449474043806
    
 Evaluation loss 2.794806465695927
    
 Epoch 41 iteration 0 loss 0.8037686347961426
    
 Epoch 41 iteration 100 loss 1.4897831678390503
    
 Epoch 41 iteration 200 loss 0.791727602481842
    
 Epoch 41 Training loss 1.5090646408452422
    
 Epoch 42 iteration 0 loss 0.7824649214744568
    
 Epoch 42 iteration 100 loss 1.4806140661239624
    
 Epoch 42 iteration 200 loss 0.7969489693641663
    
 Epoch 42 Training loss 1.4928973876534222
    
 Epoch 43 iteration 0 loss 0.7667363286018372
    
 Epoch 43 iteration 100 loss 1.4101524353027344
    
 Epoch 43 iteration 200 loss 0.7620548009872437
    
 Epoch 43 Training loss 1.4743025649328945
    
 Epoch 44 iteration 0 loss 0.7359268069267273
    
 Epoch 44 iteration 100 loss 1.3919748067855835
    
 Epoch 44 iteration 200 loss 0.8053562045097351
    
 Epoch 44 Training loss 1.4554574874191657
    
 Epoch 45 iteration 0 loss 0.7237775921821594
    
 Epoch 45 iteration 100 loss 1.3988888263702393
    
 Epoch 45 iteration 200 loss 0.7393531203269958
    
 Epoch 45 Training loss 1.4322836776244472
    
 Evaluation loss 2.812571211478882
    
 Epoch 46 iteration 0 loss 0.6948044300079346
    
 Epoch 46 iteration 100 loss 1.304335594177246
    
 Epoch 46 iteration 200 loss 0.689096987247467
    
 Epoch 46 Training loss 1.4196053662905366
    
 Epoch 47 iteration 0 loss 0.6662931442260742
    
 Epoch 47 iteration 100 loss 1.3609318733215332
    
 Epoch 47 iteration 200 loss 0.7002820372581482
    
 Epoch 47 Training loss 1.4011935120614474
    
 Epoch 48 iteration 0 loss 0.753171443939209
    
 Epoch 48 iteration 100 loss 1.290736436843872
    
 Epoch 48 iteration 200 loss 0.6648774147033691
    
 Epoch 48 Training loss 1.3849073988196539
    
 Epoch 49 iteration 0 loss 0.7202473878860474
    
 Epoch 49 iteration 100 loss 1.3155896663665771
    
 Epoch 49 iteration 200 loss 0.7304859757423401
    
 Epoch 49 Training loss 1.3667800886861978
    
 Epoch 50 iteration 0 loss 0.6739968061447144
    
 Epoch 50 iteration 100 loss 1.3187365531921387
    
 Epoch 50 iteration 200 loss 0.6818186044692993
    
 Epoch 50 Training loss 1.3522975228605367
    
 Evaluation loss 2.8305587463367226
    
 Epoch 51 iteration 0 loss 0.7073860168457031
    
 Epoch 51 iteration 100 loss 1.3020031452178955
    
 Epoch 51 iteration 200 loss 0.6439692974090576
    
 Epoch 51 Training loss 1.3355847990987002
    
 Epoch 52 iteration 0 loss 0.7059903144836426
    
 Epoch 52 iteration 100 loss 1.3240293264389038
    
 Epoch 52 iteration 200 loss 0.6690763831138611
    
 Epoch 52 Training loss 1.3210225660783441
    
 Epoch 53 iteration 0 loss 0.6332668662071228
    
 Epoch 53 iteration 100 loss 1.2513703107833862
    
 Epoch 53 iteration 200 loss 0.6558292508125305
    
 Epoch 53 Training loss 1.3107876620531327
    
 Epoch 54 iteration 0 loss 0.6457605957984924
    
 Epoch 54 iteration 100 loss 1.246716856956482
    
 Epoch 54 iteration 200 loss 0.6521980166435242
    
 Epoch 54 Training loss 1.2941160204924305
    
 Epoch 55 iteration 0 loss 0.6227668523788452
    
 Epoch 55 iteration 100 loss 1.2278225421905518
    
 Epoch 55 iteration 200 loss 0.6727674007415771
    
 Epoch 55 Training loss 1.2778384867442392
    
 Evaluation loss 2.853066331010339
    
 Epoch 56 iteration 0 loss 0.5656446814537048
    
 Epoch 56 iteration 100 loss 1.2470365762710571
    
 Epoch 56 iteration 200 loss 0.6154574751853943
    
 Epoch 56 Training loss 1.2628238236702862
    
 Epoch 57 iteration 0 loss 0.5883901119232178
    
 Epoch 57 iteration 100 loss 1.220670461654663
    
 Epoch 57 iteration 200 loss 0.5693823099136353
    
 Epoch 57 Training loss 1.2493639340990528
    
 Epoch 58 iteration 0 loss 0.5862078666687012
    
 Epoch 58 iteration 100 loss 1.1798666715621948
    
 Epoch 58 iteration 200 loss 0.6039236187934875
    
 Epoch 58 Training loss 1.233422517480705
    
 Epoch 59 iteration 0 loss 0.5904982686042786
    
 Epoch 59 iteration 100 loss 1.1922262907028198
    
 Epoch 59 iteration 200 loss 0.5879594087600708
    
 Epoch 59 Training loss 1.2254928604160356
    
 Epoch 60 iteration 0 loss 0.5759232640266418
    
 Epoch 60 iteration 100 loss 1.153181791305542
    
 Epoch 60 iteration 200 loss 0.5618763566017151
    
 Epoch 60 Training loss 1.208009701754125
    
 Evaluation loss 2.871801325149645
    
 Epoch 61 iteration 0 loss 0.5813993215560913
    
 Epoch 61 iteration 100 loss 1.1644539833068848
    
 Epoch 61 iteration 200 loss 0.574725329875946
    
 Epoch 61 Training loss 1.1981734446603696
    
 Epoch 62 iteration 0 loss 0.54474276304245
    
 Epoch 62 iteration 100 loss 1.172760248184204
    
 Epoch 62 iteration 200 loss 0.5736648440361023
    
 Epoch 62 Training loss 1.1898703442169898
    
 Epoch 63 iteration 0 loss 0.5367869138717651
    
 Epoch 63 iteration 100 loss 1.1455975770950317
    
 Epoch 63 iteration 200 loss 0.5316013097763062
    
 Epoch 63 Training loss 1.17624104425602
    
 Epoch 64 iteration 0 loss 0.5965208411216736
    
 Epoch 64 iteration 100 loss 1.0865147113800049
    
 Epoch 64 iteration 200 loss 0.5165320634841919
    
 Epoch 64 Training loss 1.1626691673104586
    
 Epoch 65 iteration 0 loss 0.5757507085800171
    
 Epoch 65 iteration 100 loss 1.0935884714126587
    
 Epoch 65 iteration 200 loss 0.5055180191993713
    
 Epoch 65 Training loss 1.1486647791128823
    
 Evaluation loss 2.888705662914898
    
 Epoch 66 iteration 0 loss 0.554165244102478
    
 Epoch 66 iteration 100 loss 1.0687988996505737
    
 Epoch 66 iteration 200 loss 0.5742641687393188
    
 Epoch 66 Training loss 1.137105361580985
    
 Epoch 67 iteration 0 loss 0.5457087755203247
    
 Epoch 67 iteration 100 loss 1.0431346893310547
    
 Epoch 67 iteration 200 loss 0.5005226731300354
    
 Epoch 67 Training loss 1.1251085623172112
    
 Epoch 68 iteration 0 loss 0.5115629434585571
    
 Epoch 68 iteration 100 loss 1.0742378234863281
    
 Epoch 68 iteration 200 loss 0.4768718481063843
    
 Epoch 68 Training loss 1.1169700110112382
    
 Epoch 69 iteration 0 loss 0.5225317478179932
    
 Epoch 69 iteration 100 loss 1.041317343711853
    
 Epoch 69 iteration 200 loss 0.534132719039917
    
 Epoch 69 Training loss 1.1102069269037087
    
 Epoch 70 iteration 0 loss 0.48191702365875244
    
 Epoch 70 iteration 100 loss 1.0193127393722534
    
 Epoch 70 iteration 200 loss 0.4716692566871643
    
 Epoch 70 Training loss 1.0953487060532974
    
 Evaluation loss 2.9113613200675643
    
 Epoch 71 iteration 0 loss 0.59366375207901
    
 Epoch 71 iteration 100 loss 1.042155146598816
    
 Epoch 71 iteration 200 loss 0.45154234766960144
    
 Epoch 71 Training loss 1.091857606453407
    
 Epoch 72 iteration 0 loss 0.5238001346588135
    
 Epoch 72 iteration 100 loss 1.027955174446106
    
 Epoch 72 iteration 200 loss 0.5312687754631042
    
 Epoch 72 Training loss 1.0819147441571477
    
 Epoch 73 iteration 0 loss 0.5490065217018127
    
 Epoch 73 iteration 100 loss 1.0117655992507935
    
 Epoch 73 iteration 200 loss 0.5065831542015076
    
 Epoch 73 Training loss 1.0687738825424347
    
 Epoch 74 iteration 0 loss 0.5063045024871826
    
 Epoch 74 iteration 100 loss 1.0293574333190918
    
 Epoch 74 iteration 200 loss 0.5003397464752197
    
 Epoch 74 Training loss 1.0547682162543772
    
 Epoch 75 iteration 0 loss 0.45235222578048706
    
 Epoch 75 iteration 100 loss 1.0297720432281494
    
 Epoch 75 iteration 200 loss 0.4086465835571289
    
 Epoch 75 Training loss 1.0492441391159522
    
 Evaluation loss 2.945518095083358
    
 Epoch 76 iteration 0 loss 0.46895310282707214
    
 Epoch 76 iteration 100 loss 0.9821916818618774
    
 Epoch 76 iteration 200 loss 0.48269033432006836
    
 Epoch 76 Training loss 1.0391477853463758
    
 Epoch 77 iteration 0 loss 0.4749329388141632
    
 Epoch 77 iteration 100 loss 0.9370260238647461
    
 Epoch 77 iteration 200 loss 0.5174757242202759
    
 Epoch 77 Training loss 1.0302731247109642
    
 Epoch 78 iteration 0 loss 0.4239536225795746
    
 Epoch 78 iteration 100 loss 0.982223391532898
    
 Epoch 78 iteration 200 loss 0.46800896525382996
    
 Epoch 78 Training loss 1.02385489594265
    
 Epoch 79 iteration 0 loss 0.5065938830375671
    
 Epoch 79 iteration 100 loss 0.9628017544746399
    
 Epoch 79 iteration 200 loss 0.4790896773338318
    
 Epoch 79 Training loss 1.014064338724403
    
 Epoch 80 iteration 0 loss 0.43752557039260864
    
 Epoch 80 iteration 100 loss 0.8520130515098572
    
 Epoch 80 iteration 200 loss 0.40985599160194397
    
 Epoch 80 Training loss 1.002772340443797
    
 Evaluation loss 2.9621174652470703
    
 Epoch 81 iteration 0 loss 0.44454529881477356
    
 Epoch 81 iteration 100 loss 0.9402937293052673
    
 Epoch 81 iteration 200 loss 0.41907238960266113
    
 Epoch 81 Training loss 0.9969750344440632
    
 Epoch 82 iteration 0 loss 0.4125458896160126
    
 Epoch 82 iteration 100 loss 0.9050692915916443
    
 Epoch 82 iteration 200 loss 0.5123288035392761
    
 Epoch 82 Training loss 0.989270733289982
    
 Epoch 83 iteration 0 loss 0.4764525592327118
    
 Epoch 83 iteration 100 loss 0.9303292632102966
    
 Epoch 83 iteration 200 loss 0.44956347346305847
    
 Epoch 83 Training loss 0.9836232322264327
    
 Epoch 84 iteration 0 loss 0.48803961277008057
    
 Epoch 84 iteration 100 loss 0.9711679816246033
    
 Epoch 84 iteration 200 loss 0.44382917881011963
    
 Epoch 84 Training loss 0.9754019522005947
    
 Epoch 85 iteration 0 loss 0.46858376264572144
    
 Epoch 85 iteration 100 loss 0.9077855944633484
    
 Epoch 85 iteration 200 loss 0.4368401765823364
    
 Epoch 85 Training loss 0.9719701637084417
    
 Evaluation loss 2.990323471814928
    
 Epoch 86 iteration 0 loss 0.4658893346786499
    
 Epoch 86 iteration 100 loss 0.8741357326507568
    
 Epoch 86 iteration 200 loss 0.423090398311615
    
 Epoch 86 Training loss 0.9583479015194021
    
 Epoch 87 iteration 0 loss 0.4344865381717682
    
 Epoch 87 iteration 100 loss 0.8711681365966797
    
 Epoch 87 iteration 200 loss 0.41789063811302185
    
 Epoch 87 Training loss 0.9474942575734959
    
 Epoch 88 iteration 0 loss 0.42888087034225464
    
 Epoch 88 iteration 100 loss 0.8649926781654358
    
 Epoch 88 iteration 200 loss 0.4007169306278229
    
 Epoch 88 Training loss 0.9426996659812006
    
 Epoch 89 iteration 0 loss 0.4257383942604065
    
 Epoch 89 iteration 100 loss 0.8543802499771118
    
 Epoch 89 iteration 200 loss 0.41755053400993347
    
 Epoch 89 Training loss 0.9360784180891997
    
 Epoch 90 iteration 0 loss 0.44567570090293884
    
 Epoch 90 iteration 100 loss 0.8825702667236328
    
 Epoch 90 iteration 200 loss 0.41934728622436523
    
 Epoch 90 Training loss 0.9298315100552865
    
 Evaluation loss 3.0115221658685347
    
 Epoch 91 iteration 0 loss 0.4208157956600189
    
 Epoch 91 iteration 100 loss 0.813216507434845
    
 Epoch 91 iteration 200 loss 0.4040917158126831
    
 Epoch 91 Training loss 0.9193997417003693
    
 Epoch 92 iteration 0 loss 0.41099944710731506
    
 Epoch 92 iteration 100 loss 0.8445271253585815
    
 Epoch 92 iteration 200 loss 0.3656329810619354
    
 Epoch 92 Training loss 0.9176739377176427
    
 Epoch 93 iteration 0 loss 0.3757087290287018
    
 Epoch 93 iteration 100 loss 0.8153252601623535
    
 Epoch 93 iteration 200 loss 0.3429928421974182
    
 Epoch 93 Training loss 0.908510602970967
    
 Epoch 94 iteration 0 loss 0.42818954586982727
    
 Epoch 94 iteration 100 loss 0.8111163377761841
    
 Epoch 94 iteration 200 loss 0.4069685935974121
    
 Epoch 94 Training loss 0.902406791391548
    
 Epoch 95 iteration 0 loss 0.37496259808540344
    
 Epoch 95 iteration 100 loss 0.7711942195892334
    
 Epoch 95 iteration 200 loss 0.4711993336677551
    
 Epoch 95 Training loss 0.8950450409158558
    
 Evaluation loss 3.034074325896889
    
 Epoch 96 iteration 0 loss 0.3465866148471832
    
 Epoch 96 iteration 100 loss 0.7963153123855591
    
 Epoch 96 iteration 200 loss 0.34403669834136963
    
 Epoch 96 Training loss 0.8901747859619997
    
 Epoch 97 iteration 0 loss 0.40915727615356445
    
 Epoch 97 iteration 100 loss 0.8184841275215149
    
 Epoch 97 iteration 200 loss 0.39140430092811584
    
 Epoch 97 Training loss 0.883020128311112
    
 Epoch 98 iteration 0 loss 0.35649484395980835
    
 Epoch 98 iteration 100 loss 0.858453094959259
    
 Epoch 98 iteration 200 loss 0.3666226267814636
    
 Epoch 98 Training loss 0.8780363934074935
    
 Epoch 99 iteration 0 loss 0.41814950108528137
    
 Epoch 99 iteration 100 loss 0.8482405543327332
    
 Epoch 99 iteration 200 loss 0.3461854159832001
    
 Epoch 99 Training loss 0.8755297044370204
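Looking at the log, the evaluation loss bottoms out around 2.79 near epochs 30-40 and then drifts back above 3.0 while the training loss keeps falling, the usual sign of overfitting. A minimal early-stopping sketch around the existing train/evaluate routines might look like the following (the `train_epoch` / `evaluate` names, `num_epochs`, and the patience value are illustrative stand-ins, not the exact functions defined earlier):

复制代码
 # Hedged early-stopping sketch: stop once the dev loss stops improving.
 # `train_epoch` / `evaluate` stand in for the training and evaluation
 # routines defined in the previous section; only the bookkeeping is new.
 best_eval = float("inf")
 patience, bad_rounds = 3, 0            # illustrative patience setting

 for epoch in range(num_epochs):
     train_epoch(model, train_data, loss_fn, optimizer)
     eval_loss = evaluate(model, dev_data, loss_fn)
     if eval_loss < best_eval:
         best_eval, bad_rounds = eval_loss, 0
         torch.save(model.state_dict(), "best_model.pt")   # keep the best checkpoint
     else:
         bad_rounds += 1
         if bad_rounds >= patience:     # dev loss has not improved for `patience` checks
             break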

9. Call the translate_dev function defined above
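translate_dev itself was defined in the previous section; for readers jumping straight to the results, a rough greedy-decoding sketch of what such a helper does is shown below. The `model.translate` interface and `device` are assumptions taken from the earlier sections, and only `dev_en`, `dev_cn`, `cn_dict` and the inverse dictionaries come from the preprocessing code above; this is illustrative, not the exact function.

复制代码
 import torch

 # Hedged sketch of a translate_dev-style helper (greedy decoding).
 def translate_dev_sketch(i):
     # print the English source and the Chinese reference
     print(" ".join(inv_en_dict[w] for w in dev_en[i]))
     print("".join(inv_cn_dict[w] for w in dev_cn[i]))

     # build a batch of size 1
     x = torch.tensor(dev_en[i]).long().unsqueeze(0).to(device)
     x_len = torch.tensor([len(dev_en[i])]).long().to(device)
     bos = torch.tensor([[cn_dict["BOS"]]]).long().to(device)

     # assume model.translate returns (token ids, attention weights)
     translation, attn = model.translate(x, x_len, bos)
     words = [inv_cn_dict[idx] for idx in translation.cpu().numpy().reshape(-1)]

     # cut the output at the first EOS
     out = []
     for w in words:
         if w == "EOS":
             break
         out.append(w)
     print("".join(out))

The loop below runs the real translate_dev over 20 dev sentences (indices 100-119); for each one it prints the English source, the Chinese reference, and the model's translation: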

复制代码
 for i in range(100, 120):
     translate_dev(i)
     print()
复制代码
 BOS you have nice skin . EOS

    
 BOS 你 的 皮 膚 真 好 。 EOS
    
 /usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:33: UserWarning: masked_fill_ received a mask with dtype torch.uint8, this behavior is now deprecated,please use a mask with dtype torch.bool instead. (Triggered internally at  /pytorch/aten/src/ATen/native/cuda/LegacyDefinitions.cpp:19.)
    
 你最好有很多新鲜事。
    
  
    
 BOS you 're UNK correct . EOS
    
 BOS 你 部 分 正 确 。 EOS
    
 你的生身。
    
BOS everyone admired his courage . EOS
    
 BOS 每 個 人 都 佩 服 他 的 勇 氣 。 EOS
    
 每個人都認釋了他的意見。
    
BOS what time is it ? EOS
    
 BOS 几 点 了 ? EOS
    
 多少钱?
    
BOS i 'm free tonight . EOS
    
 BOS 我 今 晚 有 空 。 EOS
    
 我今晚有空。
    
  
    
 BOS here is your book . EOS
    
 BOS 這 是 你 的 書 。 EOS
    
 你的書在這裡。
    
  
    
 BOS they are at lunch . EOS
    
 BOS 他 们 在 吃 午 饭 。 EOS
    
 他们午吃午饭。
    
  
    
 BOS this chair is UNK . EOS
    
 BOS 這 把 椅 子 很 UNK 。 EOS
    
 这里的发生是门。
    
  
    
 BOS it 's pretty heavy . EOS
    
 BOS 它 真 重 。 EOS
    
 它是居机场的。
    
BOS many attended his funeral . EOS
    
 BOS 很 多 人 都 参 加 了 他 的 葬 礼 。 EOS
    
 每个人都知道他的音樂。
    
BOS training will be provided . EOS
    
 BOS 会 有 训 练 。 EOS
    
 即待有空光。
    
BOS someone is watching you . EOS
    
 BOS 有 人 在 看 著 你 。 EOS
    
 有人在看你。
    
BOS i slapped his face . EOS
    
 BOS 我 摑 了 他 的 臉 。 EOS
    
 我愛他打斷了。
    
BOS i like UNK music . EOS
    
 BOS 我 喜 歡 流 行 音 樂 。 EOS
    
 我喜欢阅读。
    
BOS tom had no children . EOS
    
 BOS T o m 沒 有 孩 子 。 EOS
    
 汤姆没有孩子。
    
BOS please lock the door . EOS
    
 BOS 請 把 門 鎖 上 。 EOS
    
 請關門門。
    
BOS tom has calmed down . EOS
    
 BOS 汤 姆 冷 静 下 来 了 。 EOS
    
 Tom有三個走。
    
BOS please speak more loudly . EOS
    
 BOS 請 說 大 聲 一 點 兒 。 EOS
    
 請講更多的聲外。
    
BOS keep next sunday free . EOS
    
 BOS 把 下 周 日 空 出 来 。 EOS
    
 下個星期一下吧。
    
BOS i made a mistake . EOS
    
 BOS 我 犯 了 一 個 錯 。 EOS
    
 我错了錯誤。
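The UserWarning in the output above comes from passing a torch.uint8 mask to masked_fill_, which newer PyTorch versions deprecate in favour of torch.bool masks. Casting the mask to bool wherever it is built in the attention masking code silences the warning without changing the result; a minimal, self-contained illustration (the tensors here are made up for the demo):

复制代码
 import torch

 scores = torch.randn(1, 3, 5)                                  # e.g. attention scores
 mask = torch.tensor([[[0, 0, 0, 1, 1]]], dtype=torch.uint8)    # 1 marks padded positions

 # Passing a uint8 mask to masked_fill_ triggers the deprecation warning;
 # casting to bool first gives the same result without the warning.
 scores = scores.masked_fill(mask.bool(), float("-inf"))
 print(scores)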
