4.1 机器翻译


def preprocess_raw(text):
    """Normalise raw corpus text before tokenisation.

    The narrow no-break space (U+202F) and the Latin-1 non-breaking space
    (U+00A0) are replaced with ordinary spaces, the text is lower-cased, and
    a single space is inserted in front of every ',', '!' or '.' that is not
    already preceded by a space.

    Args:
        text: Raw text as one string.

    Returns:
        The normalised string.
    """
    # Lower-case once and use the same string for both iteration and the
    # look-behind. Lower-casing may change the string length (for example
    # "\u0130" becomes two code points), so indexing the un-lowered text
    # while enumerating the lowered one can go out of range.
    text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()
    pieces = []
    for i, char in enumerate(text):
        if char in (',', '!', '.') and i > 0 and text[i - 1] != ' ':
            pieces.append(' ')
        pieces.append(char)
    # Join once instead of repeated string concatenation.
    return ''.join(pieces)


# Split the corpus (module-level ``text``) into tokenised source/target
# sentences. Each line is expected to hold "source<TAB>target"; lines
# without a tab are skipped.
num_examples = 50000
source, target = [], []
# Slice to exactly ``num_examples`` lines. The previous ``i > num_examples``
# check let one extra line through.
for line in text.split('\n')[:num_examples]:
    parts = line.split('\t')
    if len(parts) >= 2:
        source.append(parts[0].split(' '))
        target.append(parts[1].split(' '))


def build_vocab(tokens):
    """Build a vocabulary from a list of tokenised sentences.

    Args:
        tokens: Iterable of sentences, each an iterable of tokens.

    Returns:
        The object returned by ``d2l.data.base.Vocab`` for the flattened
        token list, built with ``min_freq=3`` and ``use_special_tokens=True``.
    """
    # Flatten the nested sentences into one token list.
    flat_tokens = []
    for sentence in tokens:
        flat_tokens.extend(sentence)
    # Delegate the actual vocabulary construction to the d2l helper.
    return d2l.data.base.Vocab(flat_tokens, min_freq=3,
                               use_special_tokens=True)


def pad(line, max_len, padding_token):
    """Truncate or right-pad ``line`` to exactly ``max_len`` items.

    Args:
        line: List of items (token indices in this file).
        max_len: Target length.
        padding_token: Value appended until the target length is reached.

    Returns:
        ``line[:max_len]`` when the line is too long, otherwise a new list
        made of ``line`` followed by the required padding.
    """
    shortfall = max_len - len(line)
    if shortfall < 0:
        return line[:max_len]
    return line + [padding_token] * shortfall


# Notebook-style demonstration call; its result is discarded.
# NOTE(review): ``src_vocab`` is not defined in this block -- presumably it
# is created elsewhere before this line runs; verify.
pad(src_vocab[source[0]], 10, src_vocab.pad)


def build_array(lines, vocab, max_len, is_source):
    """Convert tokenised sentences into a padded index tensor.

    Args:
        lines: List of tokenised sentences.
        vocab: Vocabulary; indexing it with a sentence yields its indices and
            it exposes ``bos``, ``eos`` and ``pad``.
        max_len: Length every sentence is truncated or padded to.
        is_source: When false, each sentence is wrapped in ``bos``/``eos``
            before padding.

    Returns:
        ``(array, valid_len)`` where ``array`` holds the padded indices and
        ``valid_len`` counts the non-padding entries of every row.
    """
    rows = []
    for sentence in lines:
        token_ids = vocab[sentence]
        if not is_source:
            token_ids = [vocab.bos] + token_ids + [vocab.eos]
        rows.append(pad(token_ids, max_len, vocab.pad))
    array = torch.tensor(rows)
    # Sum the non-padding mask along dimension 1 (the sequence axis).
    valid_len = (array != vocab.pad).sum(1)
    return array, valid_len


def load_data_nmt(batch_size, max_len):  # This function is saved in d2l.
    """Build the vocabularies and a shuffled training iterator.

    Reads the module-level ``source`` and ``target`` sentence lists.

    Args:
        batch_size: Mini-batch size handed to the ``DataLoader``.
        max_len: Sequence length every sentence is padded or truncated to.

    Returns:
        ``(src_vocab, tgt_vocab, train_iter)``.
    """
    src_vocab = build_vocab(source)
    tgt_vocab = build_vocab(target)
    src_array, src_valid_len = build_array(source, src_vocab, max_len, True)
    tgt_array, tgt_valid_len = build_array(target, tgt_vocab, max_len, False)
    dataset = data.TensorDataset(src_array, src_valid_len,
                                 tgt_array, tgt_valid_len)
    loader = data.DataLoader(dataset, batch_size, shuffle=True)
    return src_vocab, tgt_vocab, loader


class Encoder(nn.Module):
    """Abstract encoder interface; subclasses must implement ``forward``."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def forward(self, X, *args):
        """Encode ``X``; must be overridden."""
        raise NotImplementedError


class Decoder(nn.Module):
    """Abstract decoder interface.

    Subclasses must implement ``init_state`` and ``forward``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def init_state(self, enc_outputs, *args):
        """Turn the encoder outputs into the initial decoder state."""
        raise NotImplementedError

    def forward(self, X, state):
        """Decode ``X`` given ``state``; must be overridden."""
        raise NotImplementedError


class EncoderDecoder(nn.Module):
    """Chain an encoder and a decoder into one module."""

    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, enc_X, dec_X, *args):
        """Encode ``enc_X``, seed the decoder state, then decode ``dec_X``.

        Extra positional arguments are forwarded to both the encoder and
        ``decoder.init_state``.

        Returns:
            Whatever the decoder's ``forward`` returns.
        """
        encoded = self.encoder(enc_X, *args)
        initial_state = self.decoder.init_state(encoded, *args)
        return self.decoder(dec_X, initial_state)

Sequence to Sequence模型

class Seq2SeqEncoder(d2l.Encoder):
    """LSTM encoder: an embedding layer followed by a multi-layer LSTM."""

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super().__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers,
                           dropout=dropout)

    def begin_state(self, batch_size, device):
        """Return two zero tensors of shape (num_layers, batch_size, num_hiddens)."""
        shape = (self.num_layers, batch_size, self.num_hiddens)
        return [torch.zeros(size=shape, device=device) for _ in range(2)]

    def forward(self, X, *args):
        """Embed ``X`` and run it through the LSTM.

        Returns:
            ``(out, state)`` exactly as produced by ``nn.LSTM``. Per the
            original notes ``out`` is (seq_len, batch_size, num_hiddens) and
            ``state`` holds the hidden state and memory cell of the last
            time step, each (num_layers, batch_size, num_hiddens).
        """
        embedded = self.embedding(X)  # (batch_size, seq_len, embed_size)
        time_major = embedded.transpose(0, 1)  # the RNN wants time first
        # No explicit initial state is passed; nn.LSTM starts from zeros.
        out, state = self.rnn(time_major)
        return out, state


class Seq2SeqDecoder(d2l.Decoder):
    """LSTM decoder with a linear projection onto the vocabulary."""

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super().__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers,
                           dropout=dropout)
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs, *args):
        """Use the second element of the encoder outputs as the start state."""
        return enc_outputs[1]

    def forward(self, X, state):
        """Decode ``X`` starting from ``state``.

        Returns:
            ``(out, state)`` where ``out`` has been transposed back so that
            the batch is the first dimension, which simplifies the loss
            computation.
        """
        time_major = self.embedding(X).transpose(0, 1)
        rnn_out, state = self.rnn(time_major, state)
        out = self.dense(rnn_out).transpose(0, 1)
        return out, state


# Section 4.2: attention mechanism and the Seq2seq model

输入由两部分构成:询问(query)和键值对(key-value pairs)。
attention layer 会为每一个key计算注意力分数(attention score),并对这些分数做归一化得到权重;输出向量是value的加权和,其中每个key对应的权重与其value一一对应。
假设query和keys有相同的维度 $d$,通过计算query和key转置的乘积来计算attention score,通常还会除以 $\sqrt{d}$ 来减少计算出来的score对维度 $d$ 的依赖性。

# Save to the d2l package.
class DotProductAttention(nn.Module):
    """Scaled dot-product attention with dropout on the attention weights."""

    def __init__(self, dropout, **kwargs):
        super().__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, valid_length=None):
        """Attend over ``value`` using scaled query-key dot products.

        Shapes, as noted by the original author:
            query: (batch_size, #queries, d)
            key: (batch_size, #kv_pairs, d)
            value: (batch_size, #kv_pairs, dim_v)
            valid_length: either (batch_size, ) or (batch_size, xx)

        Returns:
            The batched product of the attention weights and ``value``.
        """
        feature_dim = query.shape[-1]
        # Swap the last two dimensions of key so bmm yields query-key scores.
        key_t = key.transpose(1, 2)
        scores = torch.bmm(query, key_t) / math.sqrt(feature_dim)
        attention_weights = masked_softmax(scores, valid_length)
        attention_weights = self.dropout(attention_weights)
        # NOTE(review): this print runs on every forward call.
        print("attention_weight\n", attention_weights)
        return torch.bmm(attention_weights, value)

然后将query和key分别经过线性映射后在特征维度上合并(代码中通过广播相加实现),送入一个单隐藏层感知机(a single hidden layer perceptron)来计算注意力分数。

# Save to the d2l package.
class MLPAttention(nn.Module):
    """Additive (MLP) attention: ``score = v^T tanh(W_q q + W_k k)``.

    Args:
        units: Size of the hidden layer the query and key are projected to.
        ipt_dim: Feature size of the incoming query and key.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, units, ipt_dim, dropout, **kwargs):
        super().__init__(**kwargs)
        self.W_k = nn.Linear(ipt_dim, units, bias=False)
        self.W_q = nn.Linear(ipt_dim, units, bias=False)
        self.v = nn.Linear(units, 1, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, valid_length=None):
        """Return the attention-weighted sum of ``value``.

        Args:
            query: Tensor whose last dimension is ``ipt_dim``.
            key: Tensor whose last dimension is ``ipt_dim``.
            value: Tensor combined with the attention weights via ``bmm``.
            valid_length: Passed through to ``masked_softmax``. Defaults to
                ``None``, matching ``DotProductAttention``.

        Returns:
            ``torch.bmm(attention_weights, value)``.
        """
        # Apply each projection to the tensor it is named after (they used
        # to be swapped). Both layers have the same shape, so the model
        # family is unchanged. Weights saved by the swapped version would
        # need W_q and W_k exchanged when loaded.
        query, key = self.W_q(query), self.W_k(key)
        # Expand query to (batch_size, #queries, 1, units) and key to
        # (batch_size, 1, #kv_pairs, units), then add them with broadcasting.
        features = query.unsqueeze(2) + key.unsqueeze(1)
        # The non-linearity is essential. Without it the score is
        # v(W_q q) + v(W_k k); the query term is constant across keys and
        # cancels in the softmax, so attention would ignore the query.
        features = torch.tanh(features)
        scores = self.v(features).squeeze(-1)
        attention_weights = self.dropout(masked_softmax(scores, valid_length))
        return torch.bmm(attention_weights, value)


class Seq2SeqAttentionDecoder(d2l.Decoder):
    """LSTM decoder that attends over the encoder outputs at every step."""

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super().__init__(**kwargs)
        self.attention_cell = MLPAttention(num_hiddens, num_hiddens, dropout)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # The LSTM input is the attention context joined with the embedding.
        self.rnn = nn.LSTM(embed_size + num_hiddens, num_hiddens, num_layers,
                           dropout=dropout)
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs, enc_valid_len, *args):
        """Build the decoder state from the encoder results.

        Returns:
            ``(outputs, hidden_state, enc_valid_len)`` with the first two
            axes of ``outputs`` swapped, i.e. (batch_size, seq_len,
            hidden_size) according to the original note.
        """
        outputs, hidden_state = enc_outputs
        return (outputs.permute(1, 0, -1), hidden_state, enc_valid_len)

    def forward(self, X, state):
        """Decode ``X`` one time step at a time with attention.

        Returns:
            ``(outputs, state)`` where ``outputs`` has its first two axes
            swapped back after the dense layer and ``state`` is the list
            ``[enc_outputs, hidden_state, enc_valid_len]``.
        """
        enc_outputs, hidden_state, enc_valid_len = state
        embedded_steps = self.embedding(X).transpose(0, 1)
        step_outputs = []
        for step_input in embedded_steps:
            # The hidden state of the last RNN layer is the query;
            # per the original note it is (batch_size, 1, hidden_size).
            query = hidden_state[0][-1].unsqueeze(1)
            # The encoder outputs are used as both keys and values.
            context = self.attention_cell(
                query, enc_outputs, enc_outputs, enc_valid_len)
            # Concatenate context and embedding on the feature dimension.
            rnn_input = torch.cat((context, step_input.unsqueeze(1)), dim=-1)
            # Swap axes so the single time step comes first for the LSTM.
            out, hidden_state = self.rnn(rnn_input.transpose(0, 1),
                                         hidden_state)
            step_outputs.append(out)
        projected = self.dense(torch.cat(step_outputs, dim=0))
        return projected.transpose(0, 1), [enc_outputs, hidden_state,
                                           enc_valid_len]


# Section 4.3: Transformer

将seq2seq模型中的循环网络替换为Transformer Blocks,该模块包含一个多头注意力层以及一个由两层全连接构成的position-wise feed-forward network。多头注意力层包含h个并行的自注意力层,每一个这样的层称为一个头。对每个头来说,在进行注意力计算之前,会先用三个线性层分别对query、key和value进行映射;这h个注意力头的输出拼接之后,再输入最后一个线性层进行整合。

class MultiHeadAttention(nn.Module):
    """Multi-head attention built on ``DotProductAttention``.

    Args:
        input_size: Feature size of the incoming query, key and value.
        hidden_size: Output size of every projection (all heads together).
        num_heads: Number of attention heads.
        dropout: Dropout probability handed to ``DotProductAttention``.
    """

    def __init__(self, input_size, hidden_size, num_heads, dropout, **kwargs):
        super().__init__(**kwargs)
        self.num_heads = num_heads
        self.attention = DotProductAttention(dropout)
        self.W_q = nn.Linear(input_size, hidden_size, bias=False)
        self.W_k = nn.Linear(input_size, hidden_size, bias=False)
        self.W_v = nn.Linear(input_size, hidden_size, bias=False)
        self.W_o = nn.Linear(hidden_size, hidden_size, bias=False)

    def forward(self, query, key, value, valid_length):
        """Run multi-head attention.

        Args:
            query, key, value: Per the original note, shaped
                (batch_size, seq_len, dim).
            valid_length: ``None`` or a tensor shaped (batch_size,) or
                (batch_size, seq_len).

        Returns:
            The output of ``W_o`` applied to the re-assembled head outputs.
        """
        # Project, then split into heads. Per the original note the result
        # is (batch_size * num_heads, seq_len, hidden_size / num_heads).
        query = transpose_qkv(self.W_q(query), self.num_heads)
        key = transpose_qkv(self.W_k(key), self.num_heads)
        value = transpose_qkv(self.W_v(value), self.num_heads)

        if valid_length is not None:
            # Repeat every sample's length num_heads times *consecutively*
            # so row b * num_heads + h keeps the length of sample b.
            # NOTE(review): assumes transpose_qkv flattens (batch, head)
            # batch-major, as the usual view/transpose version does --
            # confirm against the helper. np.tile ordered the copies
            # head-major, which pairs lengths with the wrong sequences when
            # lengths differ within a batch.
            # This stays on the tensor's device (no NumPy round trip) and
            # keeps the float32 dtype produced before.
            valid_length = torch.repeat_interleave(
                valid_length.to(torch.float32), self.num_heads, dim=0)

        output = self.attention(query, key, value, valid_length)
        output_concat = transpose_output(output, self.num_heads)
        return self.W_o(output_concat)

多头注意力层和前馈网络的输出被送到两个"add and norm"层进行处理,该层包含残差结构以及层归一化

class PositionWiseFFN(nn.Module):
    """Two linear layers with a ReLU in between."""

    def __init__(self, input_size, ffn_hidden_size, hidden_size_out, **kwargs):
        super().__init__(**kwargs)
        self.ffn_1 = nn.Linear(input_size, ffn_hidden_size)
        self.ffn_2 = nn.Linear(ffn_hidden_size, hidden_size_out)

    def forward(self, X):
        """Return ``ffn_2(relu(ffn_1(X)))``."""
        hidden = self.ffn_1(X)
        activated = F.relu(hidden)
        return self.ffn_2(activated)


class AddNorm(nn.Module):
    """Residual connection followed by layer normalisation."""

    def __init__(self, hidden_size, dropout, **kwargs):
        super().__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(hidden_size)

    def forward(self, X, Y):
        """Apply dropout to ``Y``, add ``X``, then layer-normalise the sum."""
        residual_sum = self.dropout(Y) + X
        return self.norm(residual_sum)

Transformer 模型引入了位置编码去保持输入序列元素的位置

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to the input, then dropout.

    Args:
        embedding_size: Size of the last input dimension (odd sizes work).
        dropout: Dropout probability applied after the addition.
        max_len: Number of positions in the pre-computed table.
    """

    def __init__(self, embedding_size, dropout, max_len=1000):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        table = np.zeros((1, max_len, embedding_size))
        # angles[pos, j] = pos / 10000 ** (2j / embedding_size)
        angles = np.arange(0, max_len).reshape(-1, 1) / np.power(
            10000, np.arange(0, embedding_size, 2) / embedding_size)
        table[:, :, 0::2] = np.sin(angles)
        # An odd embedding_size has one fewer odd column than even columns;
        # trim the angles so the assignment shapes match.
        table[:, :, 1::2] = np.cos(angles[:, :embedding_size // 2])
        # Stored as a plain float32 tensor attribute, as before, so the
        # module's state_dict is unchanged.
        self.P = torch.tensor(table, dtype=torch.float32)

    def forward(self, X):
        """Return ``dropout(X + P[:, :X.shape[1], :])``.

        The table is moved to ``X``'s device on first use and cached there.
        """
        # Follow X's actual device instead of the default CUDA device.
        if self.P.device != X.device:
            self.P = self.P.to(X.device)
        X = X + self.P[:, :X.shape[1], :]
        return self.dropout(X)

