Merge pull request d2l-ai#1198 from d2l-ai/paddle
Add PaddlePaddle Implementation
astonzhang committed Nov 13, 2022
2 parents 19a6f1f + d09967b commit 828d176
Showing 114 changed files with 12,144 additions and 1,039 deletions.
7 changes: 7 additions & 0 deletions Jenkinsfile
@@ -51,6 +51,13 @@ stage("Build and Publish") {
./static/cache.sh store _build/eval_tensorflow/data
"""

sh label: "Execute Notebooks [Paddlepaddle]", script: """set -ex
conda activate ${ENV_NAME}
./static/cache.sh restore _build/eval_paddlepaddle/data
d2lbook build eval --tab paddle
./static/cache.sh store _build/eval_paddlepaddle/data
"""

sh label:"Build HTML", script:"""set -ex
conda activate ${ENV_NAME}
./static/build_html.sh
1 change: 1 addition & 0 deletions chapter_appendix-tools-for-deep-learning/contributing.md
@@ -45,6 +45,7 @@
Please use `#@tab` to mark the first line of a code block.
For example, `#@tab pytorch` is for a PyTorch code block,
`#@tab tensorflow` is for a TensorFlow code block,
`#@tab paddle` is for a PaddlePaddle code block,
and `#@tab all` is for a code block shared by all implementations.
You may refer to the [d2lbook](http://book.d2l.ai/user/code_tabs.html) package for more information.
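
For example, a code block saved for the PaddlePaddle tab could look like the following (an illustrative sketch; the tensor example itself is hypothetical and not taken from the book):

```{.python .input}
#@tab paddle
import paddle

# A trivial tensor operation, just to illustrate a tab-specific code block
x = paddle.ones((2, 3))
x * 2
```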

6 changes: 6 additions & 0 deletions chapter_appendix-tools-for-deep-learning/d2l.md
@@ -22,6 +22,12 @@
```
:end_tab:

:begin_tab:`paddle`
```eval_rst
.. currentmodule:: d2l.paddle
```
:end_tab:

## Models

```eval_rst
9 changes: 9 additions & 0 deletions chapter_appendix-tools-for-deep-learning/sagemaker.md
@@ -79,27 +79,36 @@ SageMaker provides multiple [instance types](https://
You may wish to commit your local changes before pulling updates from the remote repository. Otherwise, simply discard all of your local changes with the following commands in the terminal:

:begin_tab:`mxnet`

```bash
cd SageMaker/d2l-en-sagemaker/
git reset --hard
git pull
```


:end_tab:

:begin_tab:`pytorch`

```bash
cd SageMaker/d2l-pytorch-sagemaker/
git reset --hard
git pull
```


:end_tab:

:begin_tab:`tensorflow`

```bash
cd SageMaker/d2l-tensorflow-sagemaker/
git reset --hard
git pull
```


:end_tab:

## Summary
10 changes: 8 additions & 2 deletions chapter_attention-mechanisms/attention-cues.md
@@ -116,6 +116,14 @@ from d2l import tensorflow as d2l
import tensorflow as tf
```

```{.python .input}
#@tab paddle
from d2l import paddle as d2l
import warnings
warnings.filterwarnings("ignore")
import paddle
```

To visualize the attention weights, we need to define a `show_heatmaps` function.
Its input `matrices` has the shape
(number of rows for display, number of columns for display, number of queries, number of keys).
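
As a quick usage sketch (assuming the `show_heatmaps` function defined in the rest of this section), an identity matrix used as the attention weights means that each query attends only to its matching key:

```python
# Illustrative sketch: identity attention weights, so query i attends only to key i
attention_weights = paddle.eye(10).reshape((1, 1, 10, 10))
show_heatmaps(attention_weights, xlabel='Keys', ylabel='Queries')
```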
@@ -178,5 +186,3 @@ show_heatmaps(attention_weights, xlabel='Keys', ylabel='Queries')
:begin_tab:`tensorflow`
[Discussions](https://discuss.d2l.ai/t/5765)
:end_tab:


109 changes: 109 additions & 0 deletions chapter_attention-mechanisms/attention-scoring-functions.md
@@ -62,6 +62,15 @@ from d2l import tensorflow as d2l
import tensorflow as tf
```

```{.python .input}
#@tab paddle
from d2l import paddle as d2l
import math
import warnings
warnings.filterwarnings("ignore")
import paddle
from paddle import nn
```

## [**Masked Softmax Operation**]

@@ -136,6 +145,26 @@ def masked_softmax(X, valid_lens):
return tf.nn.softmax(tf.reshape(X, shape=shape), axis=-1)
```

```{.python .input}
#@tab paddle
#@save
def masked_softmax(X, valid_lens):
    """Perform softmax operation by masking elements on the last axis"""
    # X: 3D tensor, valid_lens: 1D or 2D tensor
    if valid_lens is None:
        return nn.functional.softmax(X, axis=-1)
    else:
        shape = X.shape
        if valid_lens.dim() == 1:
            valid_lens = paddle.repeat_interleave(valid_lens, shape[1])
        else:
            valid_lens = valid_lens.reshape((-1,))
        # Replace masked elements on the last axis with a very large negative
        # value, so that their softmax outputs become 0
        X = d2l.sequence_mask(X.reshape((-1, shape[-1])), valid_lens,
                              value=-1e6)
        return nn.functional.softmax(X.reshape(shape), axis=-1)
```

To [**demonstrate how this function works**],
consider a minibatch of two examples represented by two $2 \times 4$ matrices,
where the valid lengths of these two examples are $2$ and $3$, respectively.
@@ -155,6 +184,11 @@ masked_softmax(torch.rand(2, 2, 4), torch.tensor([2, 3]))
masked_softmax(tf.random.uniform(shape=(2, 2, 4)), tf.constant([2, 3]))
```

```{.python .input}
#@tab paddle
masked_softmax(paddle.rand((2, 2, 4)), paddle.to_tensor([2, 3]))
```

Likewise, we can also use a two-dimensional tensor to specify a valid length for every row of each matrix example.

```{.python .input}
@@ -172,6 +206,11 @@ masked_softmax(torch.rand(2, 2, 4), d2l.tensor([[1, 3], [2, 4]]))
masked_softmax(tf.random.uniform(shape=(2, 2, 4)), tf.constant([[1, 3], [2, 4]]))
```

```{.python .input}
#@tab paddle
masked_softmax(paddle.rand((2, 2, 4)), paddle.to_tensor([[1, 3], [2, 4]]))
```

## [**Additive Attention**]
:label:`subsec_additive-attention`

@@ -280,6 +319,34 @@ class AdditiveAttention(tf.keras.layers.Layer):
self.attention_weights, **kwargs), values)
```

```{.python .input}
#@tab paddle
#@save
class AdditiveAttention(nn.Layer):
    """Additive attention"""
    def __init__(self, key_size, query_size, num_hiddens, dropout, **kwargs):
        super(AdditiveAttention, self).__init__(**kwargs)
        self.W_k = nn.Linear(key_size, num_hiddens, bias_attr=False)
        self.W_q = nn.Linear(query_size, num_hiddens, bias_attr=False)
        self.w_v = nn.Linear(num_hiddens, 1, bias_attr=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, queries, keys, values, valid_lens):
        queries, keys = self.W_q(queries), self.W_k(keys)
        # After dimension expansion,
        # shape of queries: (batch_size, no. of queries, 1, num_hiddens)
        # shape of keys: (batch_size, 1, no. of key-value pairs, num_hiddens)
        # Sum them up with broadcasting
        features = queries.unsqueeze(2) + keys.unsqueeze(1)
        features = paddle.tanh(features)
        # There is only one output of self.w_v, so we remove the last
        # one-dimensional entry from the shape.
        # Shape of scores: (batch_size, no. of queries, no. of key-value pairs)
        scores = self.w_v(features).squeeze(-1)
        self.attention_weights = masked_softmax(scores, valid_lens)
        # Shape of values: (batch_size, no. of key-value pairs, value dimension)
        return paddle.bmm(self.dropout(self.attention_weights), values)
```

Let us [**demonstrate the above `AdditiveAttention` class**] with a toy example,
where the shapes (batch size, number of steps or sequence length in tokens, feature size)
of queries, keys, and values are $(2,1,20)$, $(2,10,2)$, and $(2,10,4)$, respectively.
@@ -323,6 +390,19 @@ attention = AdditiveAttention(key_size=2, query_size=20, num_hiddens=8,
attention(queries, keys, values, valid_lens, training=False)
```

```{.python .input}
#@tab paddle
queries, keys = paddle.normal(0, 1, (2, 1, 20)), paddle.ones((2, 10, 2))
# The two value matrices in the values minibatch are identical
values = paddle.arange(40, dtype=paddle.float32).reshape((1, 10, 4)).tile(
    [2, 1, 1])
valid_lens = paddle.to_tensor([2, 6])

attention = AdditiveAttention(key_size=2, query_size=20, num_hiddens=8,
                              dropout=0.1)
attention.eval()
attention(queries, keys, values, valid_lens)
```

Although additive attention contains learnable parameters, since every key is the same in this example,
the [**attention weights**] are uniform and determined by the specified valid lengths.
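
To see this, the weights can be rendered with `d2l.show_heatmaps`; a minimal sketch, reusing the `attention` instance from the block above and assuming `d2l.show_heatmaps` is available from the d2l paddle module:

```python
# Shape of attention.attention_weights: (2, 1, 10); reshape it so that
# show_heatmaps receives (no. of rows, no. of columns, no. of queries, no. of keys)
d2l.show_heatmaps(attention.attention_weights.reshape((1, 1, 2, 10)),
                  xlabel='Keys', ylabel='Queries')
```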
@@ -421,6 +501,27 @@ class DotProductAttention(tf.keras.layers.Layer):
return tf.matmul(self.dropout(self.attention_weights, **kwargs), values)
```

```{.python .input}
#@tab paddle
#@save
class DotProductAttention(nn.Layer):
    """Scaled dot-product attention"""
    def __init__(self, dropout, **kwargs):
        super(DotProductAttention, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)

    # Shape of queries: (batch_size, no. of queries, d)
    # Shape of keys: (batch_size, no. of key-value pairs, d)
    # Shape of values: (batch_size, no. of key-value pairs, value dimension)
    # Shape of valid_lens: (batch_size,) or (batch_size, no. of queries)
    def forward(self, queries, keys, values, valid_lens=None):
        d = queries.shape[-1]
        # Transpose keys to swap its last two dimensions
        scores = paddle.bmm(queries, keys.transpose((0, 2, 1))) / math.sqrt(d)
        self.attention_weights = masked_softmax(scores, valid_lens)
        return paddle.bmm(self.dropout(self.attention_weights), values)
```

To [**demonstrate the above `DotProductAttention` class**],
we use the same keys, values, and valid lengths from the earlier additive attention example.
For the dot product operation, we make the feature size of the queries the same as that of the keys.
@@ -447,6 +548,14 @@ attention = DotProductAttention(dropout=0.5)
attention(queries, keys, values, valid_lens, training=False)
```

```{.python .input}
#@tab paddle
queries = paddle.normal(0, 1, (2, 1, 2))
attention = DotProductAttention(dropout=0.5)
attention.eval()
attention(queries, keys, values, valid_lens)
```

As in the additive attention demonstration, since the keys contain the same elements
that cannot be differentiated by any query, [**uniform attention weights**] are obtained.

77 changes: 75 additions & 2 deletions chapter_attention-mechanisms/bahdanau-attention.md
@@ -63,6 +63,15 @@ from d2l import tensorflow as d2l
import tensorflow as tf
```

```{.python .input}
#@tab paddle
from d2l import paddle as d2l
import warnings
warnings.filterwarnings("ignore")
import paddle
from paddle import nn
```

## Defining the Decoder with Attention

Let us see how to define Bahdanau attention to implement the recurrent neural network encoder-decoder.
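
The Paddle block below subclasses an `AttentionDecoder` interface. A minimal sketch of such a base class, following d2l conventions (the book's own definition, which lives in the unchanged part of this file, may differ in details):

```python
class AttentionDecoder(d2l.Decoder):
    """The base attention-based decoder interface (sketch)"""
    def __init__(self, **kwargs):
        super(AttentionDecoder, self).__init__(**kwargs)

    @property
    def attention_weights(self):
        # Concrete decoders such as Seq2SeqAttentionDecoder override this
        raise NotImplementedError
```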
@@ -244,6 +253,56 @@ class Seq2SeqAttentionDecoder(AttentionDecoder):
return self._attention_weights
```

```{.python .input}
#@tab paddle
class Seq2SeqAttentionDecoder(AttentionDecoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqAttentionDecoder, self).__init__(**kwargs)
        self.attention = d2l.AdditiveAttention(
            num_hiddens, num_hiddens, num_hiddens, dropout)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size + num_hiddens, num_hiddens,
                          num_layers, bias_ih_attr=True,
                          time_major=True, dropout=dropout)
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs, enc_valid_lens, *args):
        # Shape of outputs: (batch_size, num_steps, num_hiddens)
        # Shape of hidden_state: (num_layers, batch_size, num_hiddens)
        outputs, hidden_state = enc_outputs
        return (outputs.transpose((1, 0, 2)), hidden_state, enc_valid_lens)

    def forward(self, X, state):
        # Shape of enc_outputs: (batch_size, num_steps, num_hiddens)
        # Shape of hidden_state: (num_layers, batch_size, num_hiddens)
        enc_outputs, hidden_state, enc_valid_lens = state
        # Shape of the output X: (num_steps, batch_size, embed_size)
        X = self.embedding(X).transpose((1, 0, 2))
        outputs, self._attention_weights = [], []
        for x in X:
            # Shape of query: (batch_size, 1, num_hiddens)
            query = paddle.unsqueeze(hidden_state[-1], axis=1)
            # Shape of context: (batch_size, 1, num_hiddens)
            context = self.attention(
                query, enc_outputs, enc_outputs, enc_valid_lens)
            # Concatenate on the feature dimension
            x = paddle.concat((context, paddle.unsqueeze(x, axis=1)), axis=-1)
            # Reshape x as (1, batch_size, embed_size + num_hiddens)
            out, hidden_state = self.rnn(x.transpose((1, 0, 2)), hidden_state)
            outputs.append(out)
            self._attention_weights.append(self.attention.attention_weights)
        # After the fully connected layer transformation, shape of outputs:
        # (num_steps, batch_size, vocab_size)
        outputs = self.dense(paddle.concat(outputs, axis=0))
        return outputs.transpose((1, 0, 2)), [enc_outputs, hidden_state,
                                              enc_valid_lens]

    @property
    def attention_weights(self):
        return self._attention_weights
```

Next, we [**test the implemented decoder with Bahdanau attention**] using a minibatch of 4 sequence inputs of 7 time steps.

```{.python .input}
@@ -285,6 +344,20 @@ output, state = decoder(X, state, training=False)
output.shape, len(state), state[0].shape, len(state[1]), state[1][0].shape
```

```{.python .input}
#@tab paddle
encoder = d2l.Seq2SeqEncoder(vocab_size=10, embed_size=8, num_hiddens=16,
                             num_layers=2)
encoder.eval()
decoder = Seq2SeqAttentionDecoder(vocab_size=10, embed_size=8, num_hiddens=16,
                                  num_layers=2)
decoder.eval()
X = paddle.zeros((4, 7), dtype='int64')  # (batch_size, num_steps)
state = decoder.init_state(encoder(X), None)
output, state = decoder(X, state)
output.shape, len(state), state[0].shape, len(state[1]), state[1][0].shape
```

## [**Training**]

Similar to :numref:`sec_seq2seq_training`,
@@ -311,7 +384,7 @@ d2l.train_seq2seq(net, train_iter, lr, num_epochs, tgt_vocab, device)
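
For context, a minimal sketch of the training setup that leads to this `d2l.train_seq2seq` call; the hyperparameter values are illustrative assumptions, and `d2l.load_data_nmt`, `d2l.Seq2SeqEncoder`, and `d2l.EncoderDecoder` are assumed to behave as in earlier chapters:

```python
# Illustrative hyperparameters, not taken from this diff
embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.1
batch_size, num_steps = 64, 10
lr, num_epochs, device = 0.005, 250, d2l.try_gpu()

train_iter, src_vocab, tgt_vocab = d2l.load_data_nmt(batch_size, num_steps)
encoder = d2l.Seq2SeqEncoder(
    len(src_vocab), embed_size, num_hiddens, num_layers, dropout)
decoder = Seq2SeqAttentionDecoder(
    len(tgt_vocab), embed_size, num_hiddens, num_layers, dropout)
net = d2l.EncoderDecoder(encoder, decoder)
d2l.train_seq2seq(net, train_iter, lr, num_epochs, tgt_vocab, device)
```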
After the model is trained, we use it to [**translate a few English sentences into French**] and compute their BLEU scores.

```{.python .input}
#@tab mxnet, pytorch
#@tab mxnet, pytorch, paddle
engs = ['go .', "i lost .", 'he\'s calm .', 'i\'m home .']
fras = ['va !', 'j\'ai perdu .', 'il est calme .', 'je suis chez moi .']
for eng, fra in zip(engs, fras):
@@ -351,7 +424,7 @@ d2l.show_heatmaps(
```

```{.python .input}
#@tab pytorch
#@tab pytorch, paddle
# Plus one to include the end-of-sequence token
d2l.show_heatmaps(
attention_weights[:, :, :, :len(engs[-1].split()) + 1].cpu(),