v0.16.1 (d2l-ai#645)

* v0.16.1 * build d2l lib Co-authored-by: Ubuntu <ubuntu@ip-172-31-3-26.us-west-2.compute.internal>
marble234 · Feb 17, 2021 · 237f802 · 237f802
1 parent 5dcc556
commit 237f802
Show file tree

Hide file tree

Showing 30 changed files with 288 additions and 189 deletions.
diff --git a/chapter_convolutional-modern/batch-norm.md b/chapter_convolutional-modern/batch-norm.md
@@ -202,9 +202,9 @@ class BatchNorm(nn.Block):
         # 参与求梯度和迭代的拉伸和偏移参数，分别初始化成1和0
         self.gamma = self.params.get('gamma', shape=shape, init=init.One())
         self.beta = self.params.get('beta', shape=shape, init=init.Zero())
-        # 不参与求梯度和迭代的变量，全在内存上初始化成0
+        # 非模型参数的变量初始化为0和1
         self.moving_mean = np.zeros(shape)
-        self.moving_var = np.zeros(shape)
+        self.moving_var = np.ones(shape)
 
     def forward(self, X):
         # 如果 `X` 不在内存上，将 `moving_mean` 和 `moving_var`
@@ -233,9 +233,9 @@ class BatchNorm(nn.Module):
         # 参与求梯度和迭代的拉伸和偏移参数，分别初始化成1和0
         self.gamma = nn.Parameter(torch.ones(shape))
         self.beta = nn.Parameter(torch.zeros(shape))
-        # 不参与求梯度和迭代的变量，全在内存上初始化成0
+        # 非模型参数的变量初始化为0和1
         self.moving_mean = torch.zeros(shape)
-        self.moving_var = torch.zeros(shape)
+        self.moving_var = torch.ones(shape)
 
     def forward(self, X):
         # 如果 `X` 不在内存上，将 `moving_mean` 和 `moving_var`
@@ -263,12 +263,12 @@ class BatchNorm(tf.keras.layers.Layer):
             initializer=tf.initializers.ones, trainable=True)
         self.beta = self.add_weight(name='beta', shape=weight_shape,
             initializer=tf.initializers.zeros, trainable=True)
-        # 不参与求梯度和迭代的变量，全在内存上初始化成0
+        # 非模型参数的变量初始化为0和1
         self.moving_mean = self.add_weight(name='moving_mean',
             shape=weight_shape, initializer=tf.initializers.zeros,
             trainable=False)
         self.moving_variance = self.add_weight(name='moving_variance',
-            shape=weight_shape, initializer=tf.initializers.zeros,
+            shape=weight_shape, initializer=tf.initializers.ones,
             trainable=False)
         super(BatchNorm, self).build(input_shape)
 

diff --git a/chapter_convolutional-modern/densenet.md b/chapter_convolutional-modern/densenet.md
@@ -283,7 +283,7 @@ for i, num_convs in enumerate(num_convs_in_dense_blocks):
     net.add(DenseBlock(num_convs, growth_rate))
     # 上一个稠密块的输出通道数
     num_channels += num_convs * growth_rate
-    # 在稠密块之间加入通道数减半的过渡层
+    # 在稠密块之间添加一个转换层，使通道数量减半
     if i != len(num_convs_in_dense_blocks) - 1:
         num_channels //= 2
         net.add(transition_block(num_channels))
@@ -299,7 +299,7 @@ for i, num_convs in enumerate(num_convs_in_dense_blocks):
     blks.append(DenseBlock(num_convs, num_channels, growth_rate))
     # 上一个稠密块的输出通道数
     num_channels += num_convs * growth_rate
-    # 在稠密块之间加入通道数减半的过渡层
+    # 在稠密块之间添加一个转换层，使通道数量减半
     if i != len(num_convs_in_dense_blocks) - 1:
         blks.append(transition_block(num_channels, num_channels // 2))
         num_channels = num_channels // 2
@@ -317,7 +317,7 @@ def block_2():
         net.add(DenseBlock(num_convs, growth_rate))
         # 上一个稠密块的输出通道数
         num_channels += num_convs * growth_rate
-        # 在稠密块之间加入通道数减半的过渡层
+        # 在稠密块之间添加一个转换层，使通道数量减半
         if i != len(num_convs_in_dense_blocks) - 1:
             num_channels //= 2
             net.add(TransitionBlock(num_channels))

diff --git a/chapter_convolutional-modern/googlenet.md b/chapter_convolutional-modern/googlenet.md
@@ -269,7 +269,7 @@ net.add(b1, b2, b3, b4, b5, nn.Dense(10))
 #@tab pytorch
 b5 = nn.Sequential(Inception(832, 256, (160, 320), (32, 128), 128),
                    Inception(832, 384, (192, 384), (48, 128), 128),
-                   nn.AdaptiveMaxPool2d((1,1)),
+                   nn.AdaptiveAvgPool2d((1,1)),
                    nn.Flatten())
 
 net = nn.Sequential(b1, b2, b3, b4, b5, nn.Linear(1024, 10))

diff --git a/chapter_convolutional-modern/vgg.md b/chapter_convolutional-modern/vgg.md
@@ -15,7 +15,17 @@
 1. 非线性激活函数，如ReLU；
 1. 池化层，如最大池化层。
 
-而一个 VGG 块与之类似，由一系列卷积层组成，后面再加上用于空间下采样的最大池化层。在最初的 VGG 论文 :cite:`Simonyan.Zisserman.2014` 中，作者使用了带有 $3\times3$ 卷积核、填充为 1（保持高度和宽度）的卷积层，和带有 $2 \times 2$ 池化窗口、步幅为 2（每个块后的分辨率减半）的最大池化层。在下面的代码中，我们定义了一个名为 `vgg_block` 的函数来实现一个 VGG 块。该函数有两个参数，分别对应于卷积层的数量 `num_convs` 和输出通道的数量 `num_channels`.
+而一个 VGG 块与之类似，由一系列卷积层组成，后面再加上用于空间下采样的最大池化层。在最初的 VGG 论文 :cite:`Simonyan.Zisserman.2014` 中，作者使用了带有 $3\times3$ 卷积核、填充为 1（保持高度和宽度）的卷积层，和带有 $2 \times 2$ 池化窗口、步幅为 2（每个块后的分辨率减半）的最大池化层。在下面的代码中，我们定义了一个名为 `vgg_block` 的函数来实现一个 VGG 块。
+
+:begin_tab:`mxnet,tensorflow`
+该函数有两个参数，分别对应于卷积层的数量 `num_convs` 和输出通道的数量 `num_channels`。
+:end_tab:
+
+:begin_tab:`pytorch`
+该函数有三个参数，分别对应于卷积层的数量 `num_convs`、输入通道的数量 `in_channels`
+和输出通道的数量 `out_channels`。
+:end_tab:
+
 
 ```{.python .input}
 from d2l import mxnet as d2l
@@ -39,7 +49,7 @@ import torch
 from torch import nn
 
 def vgg_block(num_convs, in_channels, out_channels):
-    layers=[]
+    layers = []
     for _ in range(num_convs):
         layers.append(nn.Conv2d(in_channels, out_channels,
                                 kernel_size=3, padding=1))
@@ -103,9 +113,9 @@ net = vgg(conv_arch)
 ```{.python .input}
 #@tab pytorch
 def vgg(conv_arch):
+    conv_blks = []
+    in_channels = 1
     # 卷积层部分
-    conv_blks=[]
-    in_channels=1
     for (num_convs, out_channels) in conv_arch:
         conv_blks.append(vgg_block(num_convs, in_channels, out_channels))
         in_channels = out_channels

diff --git a/chapter_convolutional-neural-networks/lenet.md b/chapter_convolutional-neural-networks/lenet.md
@@ -78,7 +78,6 @@ net = torch.nn.Sequential(
 #@tab tensorflow
 from d2l import tensorflow as d2l
 import tensorflow as tf
-from tensorflow.distribute import MirroredStrategy, OneDeviceStrategy
 
 def net():
     return tf.keras.models.Sequential([
@@ -111,7 +110,7 @@ for layer in net:
 
 ```{.python .input}
 #@tab pytorch
-X = torch.randn(size=(1, 1, 28, 28), dtype=torch.float32)
+X = torch.rand(size=(1, 1, 28, 28), dtype=torch.float32)
 for layer in net:
     X = layer(X)
     print(layer.__class__.__name__,'output shape: \t',X.shape)
@@ -165,13 +164,20 @@ def evaluate_accuracy_gpu(net, data_iter, device=None):  #@save
 ```{.python .input}
 #@tab pytorch
 def evaluate_accuracy_gpu(net, data_iter, device=None): #@save
-    """Compute the accuracy for a model on a dataset using a GPU."""
-    net.eval()  # 设置为评估模式
-    if not device:
-        device = next(iter(net.parameters())).device
-    metric = d2l.Accumulator(2)  # 正确预测的数量，总预测的数量
+    """使用GPU计算模型在数据集上的精度。"""
+    if isinstance(net, torch.nn.Module):
+        net.eval()  # 设置为评估模式
+        if not device:
+            device = next(iter(net.parameters())).device
+    # 正确预测的数量，总预测的数量
+    metric = d2l.Accumulator(2)
     for X, y in data_iter:
-        X, y = X.to(device), y.to(device)
+        if isinstance(X, list):
+            # BERT微调所需的（之后将介绍）
+            X = [x.to(device) for x in X]
+        else:
+            X = X.to(device)
+        y = y.to(device)
         metric.add(d2l.accuracy(net(X), y), d2l.size(y))
     return metric[0] / metric[1]
 ```
@@ -229,7 +235,7 @@ def train_ch6(net, train_iter, test_iter, num_epochs, lr, device):
     """Train a model with a GPU (defined in Chapter 6)."""
     def init_weights(m):
         if type(m) == nn.Linear or type(m) == nn.Conv2d:
-            torch.nn.init.xavier_uniform_(m.weight)
+            nn.init.xavier_uniform_(m.weight)
     net.apply(init_weights)
     print('training on', device)
     net.to(device)
@@ -239,10 +245,11 @@ def train_ch6(net, train_iter, test_iter, num_epochs, lr, device):
                             legend=['train loss', 'train acc', 'test acc'])
     timer, num_batches = d2l.Timer(), len(train_iter)
     for epoch in range(num_epochs):
-        metric = d2l.Accumulator(3)  # 训练损失之和，训练准确率之和，范例数
+        # 训练损失之和，训练准确率之和，范例数
+        metric = d2l.Accumulator(3)  
+        net.train()
         for i, (X, y) in enumerate(train_iter):
             timer.start()
-            net.train()
             optimizer.zero_grad()
             X, y = X.to(device), y.to(device)
             y_hat = net(X)
@@ -252,8 +259,8 @@ def train_ch6(net, train_iter, test_iter, num_epochs, lr, device):
             with torch.no_grad():
                 metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
             timer.stop()
-            train_l = metric[0]/metric[2]
-            train_acc = metric[1]/metric[2]
+            train_l = metric[0] / metric[2]
+            train_acc = metric[1] / metric[2]
             if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                 animator.add(epoch + (i + 1) / num_batches,
                              (train_l, train_acc, None))

diff --git a/chapter_convolutional-neural-networks/why-conv.md b/chapter_convolutional-neural-networks/why-conv.md
@@ -5,7 +5,8 @@
 
 有时我们缺乏足够的知识来指导更巧妙的模型结构设计，此时多层感知机可能是最好的选择。然而，对于高维感知数据，这种无结构网络可能会变得笨拙。
 
-例如，在之前区分猫和狗的例子中。假设我们收集了一个照片数据集，每张照片具有百万级像素，这意味着多层感知机的每次输入都有一百万个维度。然而即使将隐藏层维度降低到 $1000$ ，这个神经网络也将有 $10^6 \times 10^3 = 10^9$ 个参数。想要训练这个模型很难，需要有大量的GPU、分布式优化训练的经验和超乎常人的耐心。
+例如，在之前区分猫和狗的例子中。假设我们收集了一个照片数据集，每张照片具有百万级像素，这意味着多层感知机的每次输入都有一百万个维度。
+根据我们在 :numref:`subsec_parameterization-cost-fc-layers` 中对全连接层参数开销的讨论，即使将隐藏层维度降低到 $1000$ ，这个神经网络也将有 $10^6 \times 10^3 = 10^9$ 个参数。想要训练这个模型很难，需要有大量的GPU、分布式优化训练的经验和超乎常人的耐心。
 
 
 细心的读者可能会反对这一论点，认为要求百万像素的分辨率可能不是必要的。

diff --git a/chapter_deep-learning-computation/custom-layer.md b/chapter_deep-learning-computation/custom-layer.md
@@ -7,7 +7,7 @@
 首先，我们构造一个没有任何参数的自定义层。如果你还记得我们在 :numref:`sec_model_construction` 对块的介绍，这应该看起来很眼熟。下面的`CenteredLayer`类要从其输入中减去均值。要构建它，我们只需继承基础层类并实现正向传播功能。
 
 ```{.python .input}
-from mxnet import gluon, np, npx
+from mxnet import np, npx
 from mxnet.gluon import nn
 npx.set_np()
 

diff --git a/chapter_deep-learning-computation/deferred-init.md b/chapter_deep-learning-computation/deferred-init.md
@@ -16,7 +16,7 @@
 首先，让我们实例化一个多层感知机。
 
 ```{.python .input}
-from mxnet import init, np, npx
+from mxnet import np, npx
 from mxnet.gluon import nn
 npx.set_np()
 

diff --git a/chapter_deep-learning-computation/model-construction.md b/chapter_deep-learning-computation/model-construction.md
@@ -60,7 +60,7 @@ net(X)
 :end_tab:
 
 :begin_tab:`pytorch`
-在这个例子中，我们通过实例化`nn.Sequential`来构建我们的模型，层的执行顺序是作为参数传递的。简而言之，`nn.Sequential`定义了一种特殊的`Module`，即在PyTorch中表示一个块的类。它维护了一个由`Module`组成的有序列表，注意，两个全连接层都是`Linear`类的实例，`Linear`类本身就是`Module`的子类。正向传播（`forward`）函数也非常简单：它将列表中的每个块连接在一起，将每个块的输出作为下一个块的输入。注意，到目前为止，我们一直在通过`net(X)`调用我们的模型来获得模型的输出。这实际上是`net.forward(X)`的简写，这是通过`Block`类的`__call__`函数实现的一个Python技巧。
+在这个例子中，我们通过实例化`nn.Sequential`来构建我们的模型，层的执行顺序是作为参数传递的。简而言之，`nn.Sequential`定义了一种特殊的`Module`，即在PyTorch中表示一个块的类。它维护了一个由`Module`组成的有序列表，注意，两个全连接层都是`Linear`类的实例，`Linear`类本身就是`Module`的子类。正向传播（`forward`）函数也非常简单：它将列表中的每个块连接在一起，将每个块的输出作为下一个块的输入。注意，到目前为止，我们一直在通过`net(X)`调用我们的模型来获得模型的输出。这实际上是`net.__call__(X)`的简写。
 :end_tab:
 
 :begin_tab:`tensorflow`

diff --git a/chapter_deep-learning-computation/parameters.md b/chapter_deep-learning-computation/parameters.md
@@ -37,7 +37,6 @@ net(X)
 ```{.python .input}
 #@tab tensorflow
 import tensorflow as tf
-import numpy as np
 
 net = tf.keras.models.Sequential([
     tf.keras.layers.Flatten(),
@@ -336,10 +335,10 @@ print(net[1].weight.data())
 #@tab pytorch
 def xavier(m):
     if type(m) == nn.Linear:
-        torch.nn.init.xavier_uniform_(m.weight)
+        nn.init.xavier_uniform_(m.weight)
 def init_42(m):
     if type(m) == nn.Linear:
-        torch.nn.init.constant_(m.weight, 42)
+        nn.init.constant_(m.weight, 42)
 
 net[0].apply(xavier)
 net[2].apply(init_42)
@@ -418,7 +417,10 @@ net[0].weight[:2]
 #@tab tensorflow
 class MyInit(tf.keras.initializers.Initializer):
     def __call__(self, shape, dtype=None):
-        return tf.random.uniform(shape, dtype=dtype)
+        data=tf.random.uniform(shape, -10, 10, dtype=dtype)
+        factor=(tf.abs(data) >= 5)
+        factor=tf.cast(factor, tf.float32)
+        return data * factor
 
 net = tf.keras.models.Sequential([
     tf.keras.layers.Flatten(),

diff --git a/chapter_installation/index.md b/chapter_installation/index.md
@@ -25,10 +25,10 @@ sh Miniconda3-latest-Linux-x86_64.sh -b
 ~/miniconda3/bin/conda init
 ```
 
-现在关闭并重新打开当前的 shell。您应该能够创建一个新的环境，如下所示：
+现在关闭并重新打开当前的 shell。你应该能够创建一个新的环境，如下所示：
 
 ```bash
-conda create --name d2l -y
+conda create --name d2l python=3.8 -y
 ```
 
 ## 下载 D2L Notebook
@@ -43,39 +43,34 @@ unzip d2l-zh.zip && rm d2l-zh.zip
 
 注意：如果没有安装 `unzip`，则可以通过运行 `sudo apt install unzip` 进行安装。
 
-现在我们要激活 `d2l` 环境并安装 `pip`。在此命令后面的查询中输入 `y`。
+现在我们要激活 `d2l` 环境。
 
 ```bash
 conda activate d2l
-conda install python=3.7 pip -y
 ```
 
 ## 安装框架和 `d2l` 软件包
 
-:begin_tab:`mxnet,pytorch`
-在安装深度学习框架之前，请先检查您的计算机上是否有正确的 GPU（在标准笔记本电脑上为显示器提供电源的 GPU 不计入我们的目的）。如果要在 GPU 服务器上安装，请继续执行 :ref:`subsec_gpu` 以获取有关安装 GPU 支持版本的说明。
+在安装深度学习框架之前，请先检查你的计算机上是否有可用的 GPU（标准笔记本电脑上仅用于驱动显示器的 GPU 不符合我们的要求）。如果要在 GPU 服务器上安装，请参阅 :ref:`subsec_gpu` 以获取有关安装 GPU 支持版本的说明。
 
-否则，您可以安装 CPU 版本。这将是足够的马力来帮助您完成前几章，但您需要在运行更大的模型之前访问 GPU。
-:end_tab:
+或者，你可以按照如下方法安装CPU版本。这足以帮助你完成前几章，但在运行更大的模型之前，你需要使用 GPU。
 
 :begin_tab:`mxnet`
-```bash
-# For Windows users
-pip install mxnet==1.7.0 -f https://dist.mxnet.io/python
 
-# For Linux and macOS users
-pip install mxnet==1.7.0
+```bash
+pip install mxnet==1.7.0.post1
 ```
 :end_tab:
 
 :begin_tab:`pytorch`
+
 ```bash
 pip install torch torchvision -f https://download.pytorch.org/whl/torch_stable.html
 ```
 :end_tab:
 
 :begin_tab:`tensorflow`
-您可以通过以下方式安装具有 CPU 和 GPU 支持的 TensorFlow：
+你可以通过以下方式安装具有 CPU 和 GPU 支持的 TensorFlow：
 
 ```bash
 pip install tensorflow tensorflow-probability
@@ -85,6 +80,7 @@ pip install tensorflow tensorflow-probability
 我们还安装了 `d2l` 软件包，它封装了本书中常用的函数和类。
 
 ```bash
+# -U：将指定的包（此处为 d2l）升级到最新的可用版本
 pip install -U d2l
 ```
 
@@ -94,53 +90,37 @@ pip install -U d2l
 jupyter notebook
 ```
 
-此时，您可以在 Web 浏览器中打开 <http://localhost:8888>（通常会自动打开）。然后我们可以运行这本书的每个部分的代码。在运行书籍代码或更新深度学习框架或 `d2l` 软件包之前，请始终执行 `conda activate d2l` 以激活运行时环境。要退出环境，请运行 `conda deactivate`。
+此时，你可以在 Web 浏览器中打开 <http://localhost:8888>（通常会自动打开）。然后我们可以运行这本书的每个部分的代码。在运行书籍代码或更新深度学习框架或 `d2l` 软件包之前，请始终执行 `conda activate d2l` 以激活运行时环境。要退出环境，请运行 `conda deactivate`。
 
 ## GPU 支持
 :label:`subsec_gpu`
 
-:begin_tab:`mxnet,pytorch`
-默认情况下，安装深度学习框架时不支持 GPU，以确保它在任何计算机（包括大多数笔记本电脑）上运行。本书的一部分要求或建议使用 GPU 运行。如果您的计算机具有 NVIDIA 显卡并且已安装 [CUDA](https://developer.nvidia.com/cuda-downloads)，则应安装启用 GPU 的版本。如果您已经安装了仅 CPU 版本，则可能需要首先通过运行以下操作将其删除：
-:end_tab:
-
-:begin_tab:`tensorflow`
-默认情况下，TensorFlow 安装了 GPU 支持。如果您的计算机具有 NVIDIA 显卡并且已安装 [CUDA](https://developer.nvidia.com/cuda-downloads)，那么您都可以完成。
-:end_tab:
-
 :begin_tab:`mxnet`
+默认情况下，安装MXNet时不支持 GPU，以确保它在任何计算机（包括大多数笔记本电脑）上运行。本书的一部分要求或建议使用 GPU 运行。如果你的计算机具有 NVIDIA 显卡并且已安装 [CUDA](https://developer.nvidia.com/cuda-downloads)，则应安装启用 GPU 的版本。如果你已经安装了仅 CPU 版本，则可能需要首先通过运行以下操作将其删除：
+
 ```bash
 pip uninstall mxnet
 ```
-:end_tab:
 
-:begin_tab:`pytorch`
-```bash
-pip uninstall torch
-```
-:end_tab:
 
-:begin_tab:`mxnet,pytorch`
-然后，我们需要找到您安装的 CUDA 版本。你可以通过 `nvcc --version` 或 `cat /usr/local/cuda/version.txt` 查看它。假定您已安装 CUDA 10.1，则可以使用以下命令进行安装：
-:end_tab:
+然后，我们需要找到你安装的 CUDA 版本。你可以通过 `nvcc --version` 或 `cat /usr/local/cuda/version.txt` 查看它。假定你已安装 CUDA 10.1，则可以使用以下命令进行安装：
+
 
-:begin_tab:`mxnet`
 ```bash
 # For Windows users
 pip install mxnet-cu101==1.7.0 -f https://dist.mxnet.io/python
 
 # For Linux and macOS users
 pip install mxnet-cu101==1.7.0
 ```
-:end_tab:
 
-:begin_tab:`pytorch`
-```bash
-pip install torch==1.5.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
-```
+
+你可以根据你的 CUDA 版本更改包名末尾的数字，例如：CUDA 10.0 对应 `cu100`，CUDA 9.0 对应 `cu90`。
 :end_tab:
 
-:begin_tab:`mxnet,pytorch`
-您可以根据您的 CUDA 版本更改最后一位数字，例如：CUDA 10.0 的 `cu100` 和 CUDA 9.0 的 `cu90`。
+:begin_tab:`pytorch,tensorflow`
+默认情况下，深度学习框架安装了GPU支持。
+如果你的计算机有NVIDIA GPU，并且已经安装了[CUDA](https://developer.nvidia.com/cuda-downloads)，那么你应该已经设置好了。
 :end_tab:
 
 ## 练习