Sync with d2l-en v0.17.0 (d2l-ai#971)

* sync 0.17.0 ch1-14
marble234 · Sep 30, 2021 · c833e6c · c833e6c
1 parent 3eb7fc9
commit c833e6c
Show file tree

Hide file tree

Showing 36 changed files with 1,471 additions and 165 deletions.
diff --git a/chapter_attention-mechanisms/attention-cues.md b/chapter_attention-mechanisms/attention-cues.md
@@ -65,6 +65,7 @@ import tensorflow as tf
 #@save
 def show_heatmaps(matrices, xlabel, ylabel, titles=None, figsize=(2.5, 2.5),
                   cmap='Reds'):
+    """显示矩阵的热图。"""
     d2l.use_svg_display()
     num_rows, num_cols = matrices.shape[0], matrices.shape[1]
     fig, axes = d2l.plt.subplots(num_rows, num_cols, figsize=figsize,

diff --git a/chapter_attention-mechanisms/attention-scoring-functions.md b/chapter_attention-mechanisms/attention-scoring-functions.md
@@ -1,7 +1,7 @@
 # 注意力评分函数
 :label:`sec_attention-scoring-functions`
 
-在 :numref:`sec_nadaraya-waston`中，我们使用高斯核来对查询和键之间的关系建模。可以将 :eqref:`eq_nadaraya-waston-gaussian`中的高斯核的指数部分视为*注意力评分函数*（attention scoring function），简称*评分函数*（scoring function），然后把这个函数的输出结果输入到softmax函数中进行运算。通过上述步骤，我们将得到与键对应的值的概率分布（即注意力权重）。最后，注意力汇聚的输出就是基于这些注意力权重的值的加权和。
+在 :numref:`sec_nadaraya-watson`中，我们使用高斯核来对查询和键之间的关系建模。可以将 :eqref:`eq_nadaraya-watson-gaussian`中的高斯核的指数部分视为*注意力评分函数*（attention scoring function），简称*评分函数*（scoring function），然后把这个函数的输出结果输入到softmax函数中进行运算。通过上述步骤，我们将得到与键对应的值的概率分布（即注意力权重）。最后，注意力汇聚的输出就是基于这些注意力权重的值的加权和。
 
 从宏观来看，可以使用上述算法来实现 :numref:`fig_qkv`中的注意力机制框架。 :numref:`fig_attention_output`说明了如何将注意力汇聚的输出计算成为值的加权和，其中$a$表示注意力评分函数。由于注意力权重是概率分布，因此加权和其本质上是加权平均值。
 
@@ -36,6 +36,13 @@ import torch
 from torch import nn
 ```
 
+```{.python .input}
+#@tab tensorflow
+from d2l import tensorflow as d2l
+import tensorflow as tf
+```
+
+
 ## [**遮蔽softmax操作**]
 
 正如上面提到的，softmax运算用于输出一个概率分布作为注意力权重。在某些情况下，并非所有的值都应该被纳入到注意力汇聚中。例如，为了在 :numref:`sec_machine_translation`中高效处理小批量数据集，某些文本序列被填充了没有意义的特殊词元。为了仅将有意义的词元作为值去获取注意力汇聚，可以指定一个有效序列长度（即词元的个数），以便在计算softmax时过滤掉超出指定范围的位置。通过这种方式，我们可以在下面的`masked_softmax`函数中实现这样的*遮蔽softmax操作*（masked softmax operation），其中任何超出有效长度的位置都被遮蔽并置为0。
@@ -79,6 +86,26 @@ def masked_softmax(X, valid_lens):
         return nn.functional.softmax(X.reshape(shape), dim=-1)
 ```
 
+```{.python .input}
+#@tab tensorflow
+#@save
+def masked_softmax(X, valid_lens):
+    """Perform softmax over the last axis, masking elements beyond `valid_lens`."""
+    # `X`: 3D tensor; `valid_lens`: 1D or 2D tensor, or None for a plain softmax
+    if valid_lens is None:
+        return tf.nn.softmax(X, axis=-1)
+    else:
+        shape = X.shape
+        if len(valid_lens.shape) == 1:
+            valid_lens = tf.repeat(valid_lens, repeats=shape[1])
+            # (one valid length per row of the flattened `X` built below)
+        else:
+            valid_lens = tf.reshape(valid_lens, shape=-1)
+        # Masked entries on the last axis get a large negative value (-1e6), so their softmax output is ~0
+        X = d2l.sequence_mask(tf.reshape(X, shape=(-1, shape[-1])), valid_lens, value=-1e6)    
+        return tf.nn.softmax(tf.reshape(X, shape=shape), axis=-1)
+```
+
 为了[**演示此函数是如何工作**]的，考虑由两个$2 \times 4$矩阵表示的样本，这两个样本的有效长度分别为$2$和$3$。经过遮蔽softmax操作，超出有效长度的值都被遮蔽为0。
 
 ```{.python .input}
@@ -102,6 +129,11 @@ masked_softmax(np.random.uniform(size=(2, 2, 4)),
 masked_softmax(torch.rand(2, 2, 4), d2l.tensor([[1, 3], [2, 4]]))
 ```
 
+```{.python .input}
+#@tab tensorflow
+masked_softmax(tf.random.uniform(shape=(2, 2, 4)), tf.constant([2, 3]))
+```
+
 ## [**加性注意力**]
 :label:`subsec_additive-attention`
 
@@ -168,6 +200,36 @@ class AdditiveAttention(nn.Module):
         return torch.bmm(self.dropout(self.attention_weights), values)
 ```
 
+```{.python .input}
+#@tab tensorflow
+#@save
+class AdditiveAttention(tf.keras.layers.Layer):
+    """Additive attention; `key_size` and `query_size` are accepted but unused."""
+    def __init__(self, key_size, query_size, num_hiddens, dropout, **kwargs):
+        super().__init__(**kwargs)
+        self.W_k = tf.keras.layers.Dense(num_hiddens, use_bias=False)
+        self.W_q = tf.keras.layers.Dense(num_hiddens, use_bias=False)
+        self.w_v = tf.keras.layers.Dense(1, use_bias=False)
+        self.dropout = tf.keras.layers.Dropout(dropout)
+        
+    def call(self, queries, keys, values, valid_lens, **kwargs):
+        queries, keys = self.W_q(queries), self.W_k(keys)
+        # After dimension expansion,
+        # shape of `queries`: (`batch_size`, no. of queries, 1, `num_hiddens`)
+        # shape of `keys`: (`batch_size`, 1, no. of key-value pairs, `num_hiddens`)
+        # Sum them up with broadcasting
+        features = tf.expand_dims(queries, axis=2) + tf.expand_dims(
+            keys, axis=1)
+        features = tf.nn.tanh(features)
+        # `self.w_v` has a single output, so that last dimension is squeezed away.
+        # Shape of `scores`: (`batch_size`, no. of queries, no. of key-value pairs)
+        scores = tf.squeeze(self.w_v(features), axis=-1)
+        self.attention_weights = masked_softmax(scores, valid_lens)
+        # Shape of `values`: (`batch_size`, no. of key-value pairs, value dimension)
+        return tf.matmul(self.dropout(
+            self.attention_weights, **kwargs), values)
+```
+
 让我们用一个小例子来[**演示上面的`AdditiveAttention`类**]，其中查询、键和值的形状为（批量大小、步数或词元序列长度、特征大小），实际输出为$(2,1,20)$、$(2,10,2)$和$(2,10,4)$。注意力汇聚输出的形状为（批量大小、查询的步数、值的维度）。
 
 ```{.python .input}
@@ -184,7 +246,7 @@ attention(queries, keys, values, valid_lens)
 ```{.python .input}
 #@tab pytorch
 queries, keys = d2l.normal(0, 1, (2, 1, 20)), d2l.ones((2, 10, 2))
-# `values` 的小批量数据集中，两个值矩阵是相同的
+# `values` 的小批量，两个值矩阵是相同的
 values = torch.arange(40, dtype=torch.float32).reshape(1, 10, 4).repeat(
     2, 1, 1)
 valid_lens = d2l.tensor([2, 6])
@@ -195,6 +257,20 @@ attention.eval()
 attention(queries, keys, values, valid_lens)
 ```
 
+```{.python .input}
+#@tab tensorflow
+queries, keys = tf.random.normal(shape=(2, 1, 20)), tf.ones((2, 10, 2))
+# In the `values` minibatch, the two value matrices are identical
+values = tf.repeat(tf.reshape(
+    tf.range(40, dtype=tf.float32), shape=(1, 10, 4)), repeats=2, axis=0)
+valid_lens = tf.constant([2, 6])
+
+attention = AdditiveAttention(key_size=2, query_size=20, num_hiddens=8,
+                              dropout=0.1)
+attention(queries, keys, values, valid_lens, training=False)
+```
+
+
 尽管加性注意力包含了可学习的参数，但由于本例子中每个键都是相同的，所以[**注意力权重**]是均匀的，由指定的有效长度决定。
 
 ```{.python .input}
@@ -257,6 +333,27 @@ class DotProductAttention(nn.Module):
         return torch.bmm(self.dropout(self.attention_weights), values)
 ```
 
+```{.python .input}
+#@tab tensorflow
+#@save
+class DotProductAttention(tf.keras.layers.Layer):
+    """Scaled dot product attention."""
+    def __init__(self, dropout, **kwargs):
+        super().__init__(**kwargs)
+        self.dropout = tf.keras.layers.Dropout(dropout)
+        
+    # Shape of `queries`: (`batch_size`, no. of queries, `d`)
+    # Shape of `keys`: (`batch_size`, no. of key-value pairs, `d`)
+    # Shape of `values`: (`batch_size`, no. of key-value pairs, value dimension)
+    # Shape of `valid_lens`: (`batch_size`,) or (`batch_size`, no. of queries)
+    def call(self, queries, keys, values, valid_lens, **kwargs):
+        d = queries.shape[-1]
+        scores = tf.matmul(queries, keys, transpose_b=True)/tf.math.sqrt(
+            tf.cast(d, dtype=tf.float32))
+        self.attention_weights = masked_softmax(scores, valid_lens)
+        return tf.matmul(self.dropout(self.attention_weights, **kwargs), values)
+```
+
 为了[**演示上述的`DotProductAttention`类**]，我们使用了与先前加性注意力例子中相同的键、值和有效长度。对于点积操作，令查询的特征维度与键的特征维度大小相同。
 
 ```{.python .input}
@@ -274,6 +371,13 @@ attention.eval()
 attention(queries, keys, values, valid_lens)
 ```
 
+```{.python .input}
+#@tab tensorflow
+queries = tf.random.normal(shape=(2, 1, 2))
+attention = DotProductAttention(dropout=0.5)
+attention(queries, keys, values, valid_lens, training=False)
+```
+
 与加性注意力演示相同，由于键包含的是相同的元素，而这些元素无法通过任何查询进行区分，因此获得了[**均匀的注意力权重**]。
 
 ```{.python .input}

diff --git a/chapter_attention-mechanisms/bahdanau-attention.md b/chapter_attention-mechanisms/bahdanau-attention.md
@@ -32,6 +32,12 @@ import torch
 from torch import nn
 ```
 
+```{.python .input}
+#@tab tensorflow
+from d2l import tensorflow as d2l
+import tensorflow as tf
+```
+
 ## 定义注意力解码器
 
 要用Bahdanau注意力实现循环神经网络编码器-解码器，我们只需重新定义解码器即可。为了更方便地显示学习的注意力权重，以下`AttentionDecoder`类定义了[**带有注意力机制的解码器基本接口**]。
@@ -151,6 +157,59 @@ class Seq2SeqAttentionDecoder(AttentionDecoder):
         return self._attention_weights
 ```
 
+```{.python .input}
+#@tab tensorflow
+class Seq2SeqAttentionDecoder(AttentionDecoder):
+    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
+                 dropout=0, **kwargs):
+        super().__init__(**kwargs)
+        self.attention = d2l.AdditiveAttention(num_hiddens, num_hiddens,
+                                               num_hiddens, dropout)
+        self.embedding = tf.keras.layers.Embedding(vocab_size, embed_size)
+        self.rnn = tf.keras.layers.RNN(tf.keras.layers.StackedRNNCells(
+            [tf.keras.layers.GRUCell(num_hiddens, dropout=dropout)
+             for _ in range(num_layers)]),
+                                      return_sequences=True, return_state=True)
+        self.dense = tf.keras.layers.Dense(vocab_size)
+
+    def init_state(self, enc_outputs, enc_valid_lens, *args):
+        # `outputs` is passed through unchanged and becomes `enc_outputs` in
+        # `call`, so its shape is (`batch_size`, `num_steps`, `num_hiddens`).
+        # `hidden_state`: presumably one state per RNN layer -- TODO confirm
+        outputs, hidden_state = enc_outputs
+        return (outputs, hidden_state, enc_valid_lens)
+
+    def call(self, X, state, **kwargs):
+        # Shape of `enc_outputs`: (`batch_size`, `num_steps`, `num_hiddens`).
+        # `hidden_state[-1]` is used as the query below, so it is presumably
+        # the last layer's (`batch_size`, `num_hiddens`) state -- TODO confirm
+        enc_outputs, hidden_state, enc_valid_lens = state
+        # After embedding and transpose, `X` is (`num_steps`, `batch_size`, `embed_size`)
+        X = self.embedding(X) # Shape of the input `X`: (`batch_size`, `num_steps`)
+        X = tf.transpose(X, perm=(1, 0, 2))
+        outputs, self._attention_weights = [], []
+        for x in X:
+            # Shape of `query`: (`batch_size`, 1, `num_hiddens`)
+            query = tf.expand_dims(hidden_state[-1], axis=1)
+            # Shape of `context`: (`batch_size`, 1, `num_hiddens`)
+            context = self.attention(query, enc_outputs, enc_outputs,
+                                     enc_valid_lens, **kwargs)
+            # Concatenate on the feature dimension
+            x = tf.concat((context, tf.expand_dims(x, axis=1)), axis=-1)
+            out = self.rnn(x, hidden_state, **kwargs)
+            hidden_state = out[1:]
+            outputs.append(out[0])
+            self._attention_weights.append(self.attention.attention_weights)
+        # Per-step outputs are joined on axis 1, so after the dense layer
+        # `outputs` is (`batch_size`, `num_steps`, `vocab_size`)
+        outputs = self.dense(tf.concat(outputs, axis=1))
+        return outputs, [enc_outputs, hidden_state, enc_valid_lens]
+
+    @property
+    def attention_weights(self):
+        return self._attention_weights
+```
+
 接下来，我们使用包含7个时间步的4个序列输入的小批量[**测试Bahdanau注意力解码器**]。
 
 ```{.python .input}
@@ -180,6 +239,18 @@ output, state = decoder(X, state)
 output.shape, len(state), state[0].shape, len(state[1]), state[1][0].shape
 ```
 
+```{.python .input}
+#@tab tensorflow
+encoder = d2l.Seq2SeqEncoder(vocab_size=10, embed_size=8, num_hiddens=16,
+                             num_layers=2)
+decoder = Seq2SeqAttentionDecoder(vocab_size=10, embed_size=8, num_hiddens=16,
+                                  num_layers=2)
+X = tf.zeros((4, 7))
+state = decoder.init_state(encoder(X, training=False), None)
+output, state = decoder(X, state, training=False)
+output.shape, len(state), state[0].shape, len(state[1]), state[1][0].shape
+```
+
 ## [**训练**]
 
 与 :numref:`sec_seq2seq_training`类似，我们在这里指定超参数，实例化一个带有Bahdanau注意力的编码器和解码器，并对这个模型进行机器翻译训练。由于新增的注意力机制，这项训练要比没有注意力机制的 :numref:`sec_seq2seq_training`慢得多。
@@ -202,7 +273,7 @@ d2l.train_seq2seq(net, train_iter, lr, num_epochs, tgt_vocab, device)
 模型训练后，我们用它[**将几个英语句子翻译成法语**]并计算它们的BLEU分数。
 
 ```{.python .input}
-#@tab all
+#@tab mxnet, pytorch
 engs = ['go .', "i lost .", 'he\'s calm .', 'i\'m home .']
 fras = ['va !', 'j\'ai perdu .', 'il est calme .', 'je suis chez moi .']
 for eng, fra in zip(engs, fras):
@@ -212,6 +283,17 @@ for eng, fra in zip(engs, fras):
           f'bleu {d2l.bleu(translation, fra, k=2):.3f}')
 ```
 
+```{.python .input}
+#@tab tensorflow
+engs = ['go .', "i lost .", 'he\'s calm .', 'i\'m home .']
+fras = ['va !', 'j\'ai perdu .', 'il est calme .', 'je suis chez moi .']
+for eng, fra in zip(engs, fras):
+    translation, dec_attention_weight_seq = d2l.predict_seq2seq(
+        net, eng, src_vocab, tgt_vocab, num_steps, True)
+    print(f'{eng} => {translation}, ',
+          f'bleu {d2l.bleu(translation, fra, k=2):.3f}')
+```
+
 ```{.python .input}
 #@tab all
 attention_weights = d2l.reshape(
@@ -225,15 +307,22 @@ attention_weights = d2l.reshape(
 # 加上一个包含序列结束词元
 d2l.show_heatmaps(
     attention_weights[:, :, :, :len(engs[-1].split()) + 1],
-    xlabel='Key posistions', ylabel='Query posistions')
+    xlabel='Key positions', ylabel='Query positions')
 ```
 
 ```{.python .input}
 #@tab pytorch
 # 加上一个包含序列结束词元
 d2l.show_heatmaps(
     attention_weights[:, :, :, :len(engs[-1].split()) + 1].cpu(),
-    xlabel='Key posistions', ylabel='Query posistions')
+    xlabel='Key positions', ylabel='Query positions')
+```
+
+```{.python .input}
+#@tab tensorflow
+# 加上一个包含序列结束词元
+d2l.show_heatmaps(attention_weights[:, :, :, :len(engs[-1].split()) + 1],
+                  xlabel='Key posistions', ylabel='Query posistions')
 ```
 
 ## 小结

diff --git a/chapter_attention-mechanisms/index.md b/chapter_attention-mechanisms/index.md
@@ -7,7 +7,7 @@
 
 然后，我们继续介绍的是注意力函数，它们在深度学习的注意力模型设计中被广泛使用。具体来说，我们将展示如何使用这些函数来设计*Bahdanau注意力*。Bahdanau注意力是深度学习中的具有突破性价值的注意力模型，它是双向对齐的并且可以微分。
 
-最后，我们将描述仅仅基于注意力机制的*Transformer*结构，该结构中使用了*多头注意力*（multi-head attention）和*自注意力*（self-attention）设计。自2017年被构想出来，Transformer一直都普遍存在于现代的深度学习应用中，例如语言、视觉、语音和强化学习领域。
+最后，我们将描述仅仅基于注意力机制的*transformer*结构，该结构中使用了*多头注意力*（multi-head attention）和*自注意力*（self-attention）设计。自2017年被构想出来，transformer一直都普遍存在于现代的深度学习应用中，例如语言、视觉、语音和强化学习领域。
 
 ```toc
 :maxdepth: 2