remove space in comments (d2l-ai#1035)
Co-authored-by: Ubuntu <ubuntu@ip-172-31-12-66.us-west-2.compute.internal>
xiaotinghe and Ubuntu committed Dec 6, 2021
1 parent 685a4a8 commit e847355
Showing 53 changed files with 466 additions and 466 deletions.
82 changes: 41 additions & 41 deletions chapter_attention-mechanisms/attention-scoring-functions.md
@@ -80,8 +80,8 @@ import tensorflow as tf
```{.python .input}
#@save
def masked_softmax(X, valid_lens):
"""通过在最后一个轴上掩蔽元素来执行 softmax 操作"""
# `X`: 3D张量,`valid_lens`: 1D或2D 张量
"""通过在最后一个轴上掩蔽元素来执行softmax操作"""
# `X`:3D张量,`valid_lens`:1D或2D张量
if valid_lens is None:
return npx.softmax(X)
else:
@@ -100,8 +100,8 @@ def masked_softmax(X, valid_lens):
#@tab pytorch
#@save
def masked_softmax(X, valid_lens):
"""通过在最后一个轴上掩蔽元素来执行 softmax 操作"""
# `X`: 3D张量,`valid_lens`: 1D或2D 张量
"""通过在最后一个轴上掩蔽元素来执行softmax操作"""
# `X`:3D张量,`valid_lens`:1D或2D张量
if valid_lens is None:
return nn.functional.softmax(X, dim=-1)
else:
@@ -120,8 +120,8 @@ def masked_softmax(X, valid_lens):
#@tab tensorflow
#@save
def masked_softmax(X, valid_lens):
"""通过在最后一个轴上掩蔽元素来执行 softmax 操作"""
# `X`: 3D张量,`valid_lens`: 1D或2D 张量
"""通过在最后一个轴上掩蔽元素来执行softmax操作"""
# `X`:3D张量,`valid_lens`:1D或2D张量
if valid_lens is None:
return tf.nn.softmax(X, axis=-1)
else:
@@ -201,7 +201,7 @@ class AdditiveAttention(nn.Block):
"""加性注意力"""
def __init__(self, num_hiddens, dropout, **kwargs):
super(AdditiveAttention, self).__init__(**kwargs)
# 使用 'flatten=False' 只转换最后一个轴,以便其他轴的形状保持不变
# 使用'flatten=False'只转换最后一个轴,以便其他轴的形状保持不变
self.W_k = nn.Dense(num_hiddens, use_bias=False, flatten=False)
self.W_q = nn.Dense(num_hiddens, use_bias=False, flatten=False)
self.w_v = nn.Dense(1, use_bias=False, flatten=False)
@@ -210,17 +210,17 @@ class AdditiveAttention(nn.Block):
def forward(self, queries, keys, values, valid_lens):
queries, keys = self.W_q(queries), self.W_k(keys)
# 在维度扩展后,
# `queries` 的形状:(`batch_size`,查询的个数,1,`num_hidden`)
# `key` 的形状:(`batch_size`,1,“键-值”对的个数,`num_hiddens`)
# `queries`的形状:(`batch_size`,查询的个数,1,`num_hidden`)
# `key`的形状:(`batch_size`,1,“键-值”对的个数,`num_hiddens`)
# 使用广播的方式进行求和
features = np.expand_dims(queries, axis=2) + np.expand_dims(
keys, axis=1)
features = np.tanh(features)
# `self.w_v` 仅有一个输出,因此从形状中移除最后那个维度。
# `scores` 的形状:(`batch_size`,查询的个数,“键-值”对的个数)
# `self.w_v`仅有一个输出,因此从形状中移除最后那个维度。
# `scores`的形状:(`batch_size`,查询的个数,“键-值”对的个数)
scores = np.squeeze(self.w_v(features), axis=-1)
self.attention_weights = masked_softmax(scores, valid_lens)
# `values` 的形状:(`batch_size`,“键-值”对的个数,值的维度)
# `values`的形状:(`batch_size`,“键-值”对的个数,值的维度)
return npx.batch_dot(self.dropout(self.attention_weights), values)
```

@@ -239,24 +239,24 @@ class AdditiveAttention(nn.Module):
def forward(self, queries, keys, values, valid_lens):
queries, keys = self.W_q(queries), self.W_k(keys)
# 在维度扩展后,
# `queries` 的形状:(`batch_size`,查询的个数,1,`num_hidden`)
# `key` 的形状:(`batch_size`,1,“键-值”对的个数,`num_hiddens`)
# `queries`的形状:(`batch_size`,查询的个数,1,`num_hidden`)
# `key`的形状:(`batch_size`,1,“键-值”对的个数,`num_hiddens`)
# 使用广播方式进行求和
features = queries.unsqueeze(2) + keys.unsqueeze(1)
features = torch.tanh(features)
# `self.w_v` 仅有一个输出,因此从形状中移除最后那个维度。
# `scores` 的形状:(`batch_size`,查询的个数,“键-值”对的个数)
# `self.w_v`仅有一个输出,因此从形状中移除最后那个维度。
# `scores`的形状:(`batch_size`,查询的个数,“键-值”对的个数)
scores = self.w_v(features).squeeze(-1)
self.attention_weights = masked_softmax(scores, valid_lens)
# `values` 的形状:(`batch_size`,“键-值”对的个数,值的维度)
# `values`的形状:(`batch_size`,“键-值”对的个数,值的维度)
return torch.bmm(self.dropout(self.attention_weights), values)
```

```{.python .input}
#@tab tensorflow
#@save
class AdditiveAttention(tf.keras.layers.Layer):
"""Additive attention."""
"""Additiveattention."""
def __init__(self, key_size, query_size, num_hiddens, dropout, **kwargs):
super().__init__(**kwargs)
self.W_k = tf.keras.layers.Dense(num_hiddens, use_bias=False)
@@ -267,17 +267,17 @@ class AdditiveAttention(tf.keras.layers.Layer):
def call(self, queries, keys, values, valid_lens, **kwargs):
queries, keys = self.W_q(queries), self.W_k(keys)
# 在维度扩展后,
# `queries` 的形状:(`batch_size`,查询的个数,1,`num_hidden`)
# `key` 的形状:(`batch_size`,1,“键-值”对的个数,`num_hiddens`)
# `queries`的形状:(`batch_size`,查询的个数,1,`num_hidden`)
# `key`的形状:(`batch_size`,1,“键-值”对的个数,`num_hiddens`)
# 使用广播方式进行求和
features = tf.expand_dims(queries, axis=2) + tf.expand_dims(
keys, axis=1)
features = tf.nn.tanh(features)
# `self.w_v` 仅有一个输出,因此从形状中移除最后那个维度。
# `scores` 的形状:(`batch_size`,查询的个数,“键-值”对的个数)
# `self.w_v`仅有一个输出,因此从形状中移除最后那个维度。
# `scores`的形状:(`batch_size`,查询的个数,“键-值”对的个数)
scores = tf.squeeze(self.w_v(features), axis=-1)
self.attention_weights = masked_softmax(scores, valid_lens)
# `values` 的形状:(`batch_size`,“键-值”对的个数,值的维度)
# `values`的形状:(`batch_size`,“键-值”对的个数,值的维度)
return tf.matmul(self.dropout(
self.attention_weights, **kwargs), values)
```
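
For reference, every framework variant in these hunks computes the additive scores with the same broadcast-and-squeeze move. A minimal sketch of that move, assuming PyTorch, with random placeholder tensors standing in for the d2l layers (none of the names below come from the changed files):

```python
import torch

# Additive scoring as a pure tensor operation:
#   scores[b, i, j] = w_v . tanh(W_q(q_i) + W_k(k_j))
# The projected queries and keys are faked with random tensors here.
batch_size, num_queries, num_kv, num_hiddens = 2, 1, 10, 8
Wq_out = torch.randn(batch_size, num_queries, num_hiddens)  # stands in for W_q(queries)
Wk_out = torch.randn(batch_size, num_kv, num_hiddens)       # stands in for W_k(keys)
w_v = torch.randn(num_hiddens)                              # stands in for the w_v projection

# (batch_size, num_queries, 1, num_hiddens) + (batch_size, 1, num_kv, num_hiddens)
# broadcasts to (batch_size, num_queries, num_kv, num_hiddens)
features = torch.tanh(Wq_out.unsqueeze(2) + Wk_out.unsqueeze(1))
# collapse the hidden axis, leaving (batch_size, num_queries, num_kv)
scores = (features * w_v).sum(-1)
print(scores.shape)  # torch.Size([2, 1, 10])
```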
@@ -289,7 +289,7 @@ class AdditiveAttention(tf.keras.layers.Layer):

```{.python .input}
queries, keys = d2l.normal(0, 1, (2, 1, 20)), d2l.ones((2, 10, 2))
# `values` 的小批量数据集中,两个值矩阵是相同的
# `values`的小批量数据集中,两个值矩阵是相同的
values = np.arange(40).reshape(1, 10, 4).repeat(2, axis=0)
valid_lens = d2l.tensor([2, 6])
@@ -301,7 +301,7 @@ attention(queries, keys, values, valid_lens)
```{.python .input}
#@tab pytorch
queries, keys = d2l.normal(0, 1, (2, 1, 20)), d2l.ones((2, 10, 2))
# `values` 的小批量,两个值矩阵是相同的
# `values`的小批量,两个值矩阵是相同的
values = torch.arange(40, dtype=torch.float32).reshape(1, 10, 4).repeat(
2, 1, 1)
valid_lens = d2l.tensor([2, 6])
@@ -315,7 +315,7 @@ attention(queries, keys, values, valid_lens)
```{.python .input}
#@tab tensorflow
queries, keys = tf.random.normal(shape=(2, 1, 20)), tf.ones((2, 10, 2))
# `values` 的小批量,两个值矩阵是相同的
# `values`的小批量,两个值矩阵是相同的
values = tf.repeat(tf.reshape(
tf.range(40, dtype=tf.float32), shape=(1, 10, 4)), repeats=2, axis=0)
valid_lens = tf.constant([2, 6])
@@ -369,13 +369,13 @@ class DotProductAttention(nn.Block):
super(DotProductAttention, self).__init__(**kwargs)
self.dropout = nn.Dropout(dropout)
# `queries` 的形状:(`batch_size`,查询的个数,`d`)
# `keys` 的形状:(`batch_size`,“键-值”对的个数,`d`)
# `values` 的形状:(`batch_size`,“键-值”对的个数,值的维度)
# `valid_lens` 的形状: (`batch_size`,) 或者 (`batch_size`,查询的个数)
# `queries`的形状:(`batch_size`,查询的个数,`d`)
# `keys`的形状:(`batch_size`,“键-值”对的个数,`d`)
# `values`的形状:(`batch_size`,“键-值”对的个数,值的维度)
# `valid_lens`的形状:(`batch_size`,)或者(`batch_size`,查询的个数)
def forward(self, queries, keys, values, valid_lens=None):
d = queries.shape[-1]
# 设置 `transpose_b=True` 为了交换 `keys` 的最后两个维度
# 设置`transpose_b=True`为了交换`keys`的最后两个维度
scores = npx.batch_dot(queries, keys, transpose_b=True) / math.sqrt(d)
self.attention_weights = masked_softmax(scores, valid_lens)
return npx.batch_dot(self.dropout(self.attention_weights), values)
@@ -390,13 +390,13 @@ class DotProductAttention(nn.Module):
super(DotProductAttention, self).__init__(**kwargs)
self.dropout = nn.Dropout(dropout)
# `queries` 的形状:(`batch_size`,查询的个数,`d`)
# `keys` 的形状:(`batch_size`,“键-值”对的个数,`d`)
# `values` 的形状:(`batch_size`,“键-值”对的个数,值的维度)
# `valid_lens` 的形状: (`batch_size`,) 或者 (`batch_size`,查询的个数)
# `queries`的形状:(`batch_size`,查询的个数,`d`)
# `keys`的形状:(`batch_size`,“键-值”对的个数,`d`)
# `values`的形状:(`batch_size`,“键-值”对的个数,值的维度)
# `valid_lens`的形状:(`batch_size`,)或者(`batch_size`,查询的个数)
def forward(self, queries, keys, values, valid_lens=None):
d = queries.shape[-1]
# 设置 `transpose_b=True` 为了交换 `keys` 的最后两个维度
# 设置`transpose_b=True`为了交换`keys`的最后两个维度
scores = torch.bmm(queries, keys.transpose(1,2)) / math.sqrt(d)
self.attention_weights = masked_softmax(scores, valid_lens)
return torch.bmm(self.dropout(self.attention_weights), values)
@@ -406,15 +406,15 @@ class DotProductAttention(tf.keras.layers.Layer):
#@tab tensorflow
#@save
class DotProductAttention(tf.keras.layers.Layer):
"""Scaled dot product attention."""
"""Scaleddotproductattention."""
def __init__(self, dropout, **kwargs):
super().__init__(**kwargs)
self.dropout = tf.keras.layers.Dropout(dropout)
# `queries` 的形状:(`batch_size`,查询的个数,`d`)
# `keys` 的形状:(`batch_size`,“键-值”对的个数,`d`)
# `values` 的形状:(`batch_size`,“键-值”对的个数,值的维度)
# `valid_lens` 的形状: (`batch_size`,) 或者 (`batch_size`,查询的个数)
# `queries`的形状:(`batch_size`,查询的个数,`d`)
# `keys`的形状:(`batch_size`,“键-值”对的个数,`d`)
# `values`的形状:(`batch_size`,“键-值”对的个数,值的维度)
# `valid_lens`的形状:(`batch_size`,)或者(`batch_size`,查询的个数)
def call(self, queries, keys, values, valid_lens, **kwargs):
d = queries.shape[-1]
scores = tf.matmul(queries, keys, transpose_b=True)/tf.math.sqrt(
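For context on what this file implements, the masked softmax and the scaled dot-product scoring shown above compose into one short function. A minimal sketch, assuming PyTorch; the inline masking is a simplified stand-in for the d2l `masked_softmax` (it only handles a 1D `valid_lens`), and none of these names come from the changed files:

```python
import math
import torch
from torch import nn

def scaled_dot_product_attention(queries, keys, values, valid_lens=None):
    """Sketch of softmax(Q K^T / sqrt(d)) V with per-batch valid lengths."""
    d = queries.shape[-1]
    # scores: (batch_size, num_queries, num_kv)
    scores = torch.bmm(queries, keys.transpose(1, 2)) / math.sqrt(d)
    if valid_lens is not None:
        num_kv = scores.shape[-1]
        # positions at or beyond the valid length get a large negative score
        mask = torch.arange(num_kv)[None, None, :] < valid_lens[:, None, None]
        scores = scores.masked_fill(~mask, -1e6)
    weights = nn.functional.softmax(scores, dim=-1)
    return torch.bmm(weights, values), weights

# Toy shapes similar to the demos above: 2 batches, 1 query, 10 key-value pairs.
queries = torch.randn(2, 1, 2)
keys = torch.ones(2, 10, 2)
values = torch.arange(40, dtype=torch.float32).reshape(1, 10, 4).repeat(2, 1, 1)
out, w = scaled_dot_product_attention(queries, keys, values, torch.tensor([2, 6]))
print(out.shape, w.shape)  # torch.Size([2, 1, 4]) torch.Size([2, 1, 10])
```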
64 changes: 32 additions & 32 deletions chapter_attention-mechanisms/bahdanau-attention.md
@@ -105,33 +105,33 @@ class Seq2SeqAttentionDecoder(AttentionDecoder):
self.dense = nn.Dense(vocab_size, flatten=False)
def init_state(self, enc_outputs, enc_valid_lens, *args):
# `outputs`的形状为 (`num_steps`,`batch_size`,`num_hiddens`)
# `hidden_state[0]`的形状为 (`num_layers`,`batch_size`,`num_hiddens`)
# `outputs`的形状为(`num_steps`,`batch_size`,`num_hiddens`)
# `hidden_state[0]`的形状为(`num_layers`,`batch_size`,`num_hiddens`)
outputs, hidden_state = enc_outputs
return (outputs.swapaxes(0, 1), hidden_state, enc_valid_lens)
def forward(self, X, state):
# `enc_outputs`的形状为 (`batch_size`, `num_steps`, `num_hiddens`).
# `hidden_state[0]`的形状为 (`num_layers`, `batch_size`,
# `enc_outputs`的形状为(`batch_size`,`num_steps`,`num_hiddens`).
# `hidden_state[0]`的形状为(`num_layers`,`batch_size`,
# `num_hiddens`)
enc_outputs, hidden_state, enc_valid_lens = state
# 输出 `X`的形状为 (`num_steps`, `batch_size`, `embed_size`)
# 输出`X`的形状为(`num_steps`,`batch_size`,`embed_size`)
X = self.embedding(X).swapaxes(0, 1)
outputs, self._attention_weights = [], []
for x in X:
# `query`的形状为 (`batch_size`, 1, `num_hiddens`)
# `query`的形状为(`batch_size`,1,`num_hiddens`)
query = np.expand_dims(hidden_state[0][-1], axis=1)
# `context`的形状为 (`batch_size`, 1, `num_hiddens`)
# `context`的形状为(`batch_size`,1,`num_hiddens`)
context = self.attention(
query, enc_outputs, enc_outputs, enc_valid_lens)
# 在特征维度上连结
x = np.concatenate((context, np.expand_dims(x, axis=1)), axis=-1)
# 将 `x` 变形为 (1, `batch_size`, `embed_size` + `num_hiddens`)
# 将`x`变形为(1,`batch_size`,`embed_size`+`num_hiddens`)
out, hidden_state = self.rnn(x.swapaxes(0, 1), hidden_state)
outputs.append(out)
self._attention_weights.append(self.attention.attention_weights)
# 全连接层变换后, `outputs`的形状为
# (`num_steps`, `batch_size`, `vocab_size`)
# 全连接层变换后,`outputs`的形状为
# (`num_steps`,`batch_size`,`vocab_size`)
outputs = self.dense(np.concatenate(outputs, axis=0))
return outputs.swapaxes(0, 1), [enc_outputs, hidden_state,
enc_valid_lens]
@@ -156,33 +156,33 @@ class Seq2SeqAttentionDecoder(AttentionDecoder):
self.dense = nn.Linear(num_hiddens, vocab_size)
def init_state(self, enc_outputs, enc_valid_lens, *args):
# `outputs`的形状为 (`batch_size`,`num_steps`,`num_hiddens`).
# `hidden_state`的形状为 (`num_layers`,`batch_size`,`num_hiddens`)
# `outputs`的形状为(`batch_size`,`num_steps`,`num_hiddens`).
# `hidden_state`的形状为(`num_layers`,`batch_size`,`num_hiddens`)
outputs, hidden_state = enc_outputs
return (outputs.permute(1, 0, 2), hidden_state, enc_valid_lens)
def forward(self, X, state):
# `enc_outputs`的形状为 (`batch_size`, `num_steps`, `num_hiddens`).
# `hidden_state`的形状为 (`num_layers`, `batch_size`,
# `enc_outputs`的形状为(`batch_size`,`num_steps`,`num_hiddens`).
# `hidden_state`的形状为(`num_layers`,`batch_size`,
# `num_hiddens`)
enc_outputs, hidden_state, enc_valid_lens = state
# 输出 `X`的形状为 (`num_steps`, `batch_size`, `embed_size`)
# 输出`X`的形状为(`num_steps`,`batch_size`,`embed_size`)
X = self.embedding(X).permute(1, 0, 2)
outputs, self._attention_weights = [], []
for x in X:
# `query`的形状为 (`batch_size`, 1, `num_hiddens`)
# `query`的形状为(`batch_size`,1,`num_hiddens`)
query = torch.unsqueeze(hidden_state[-1], dim=1)
# `context`的形状为 (`batch_size`, 1, `num_hiddens`)
# `context`的形状为(`batch_size`,1,`num_hiddens`)
context = self.attention(
query, enc_outputs, enc_outputs, enc_valid_lens)
# 在特征维度上连结
x = torch.cat((context, torch.unsqueeze(x, dim=1)), dim=-1)
# 将 `x` 变形为 (1, `batch_size`, `embed_size` + `num_hiddens`)
# 将`x`变形为(1,`batch_size`,`embed_size`+`num_hiddens`)
out, hidden_state = self.rnn(x.permute(1, 0, 2), hidden_state)
outputs.append(out)
self._attention_weights.append(self.attention.attention_weights)
# 全连接层变换后, `outputs`的形状为
# (`num_steps`, `batch_size`, `vocab_size`)
# 全连接层变换后,`outputs`的形状为
# (`num_steps`,`batch_size`,`vocab_size`)
outputs = self.dense(torch.cat(outputs, dim=0))
return outputs.permute(1, 0, 2), [enc_outputs, hidden_state,
enc_valid_lens]
@@ -209,25 +209,25 @@ class Seq2SeqAttentionDecoder(AttentionDecoder):
self.dense = tf.keras.layers.Dense(vocab_size)
def init_state(self, enc_outputs, enc_valid_lens, *args):
# `outputs`的形状为 (`num_steps`,`batch_size`,`num_hiddens`)
# `hidden_state[0]`的形状为 (`num_layers`,`batch_size`,`num_hiddens`)
# `outputs`的形状为(`num_steps`,`batch_size`,`num_hiddens`)
# `hidden_state[0]`的形状为(`num_layers`,`batch_size`,`num_hiddens`)
outputs, hidden_state = enc_outputs
return (outputs, hidden_state, enc_valid_lens)
def call(self, X, state, **kwargs):
# `enc_outputs`的形状为 (`batch_size`, `num_steps`, `num_hiddens`).
# `hidden_state[0]`的形状为 (`num_layers`, `batch_size`,
# `enc_outputs`的形状为(`batch_size`,`num_steps`,`num_hiddens`).
# `hidden_state[0]`的形状为(`num_layers`,`batch_size`,
# `num_hiddens`)
enc_outputs, hidden_state, enc_valid_lens = state
# 输出 `X`的形状为 (`num_steps`, `batch_size`, `embed_size`)
X = self.embedding(X) # 输入 `X`的形状为 (`batch_size`, `num_steps`)
# 输出`X`的形状为(`num_steps`,`batch_size`,`embed_size`)
X = self.embedding(X) # 输入`X`的形状为(`batch_size`,`num_steps`)
X = tf.transpose(X, perm=(1, 0, 2))
outputs, self._attention_weights = [], []
for x in X:
# `query`的形状为 (`batch_size`, 1, `num_hiddens`)
# `query`的形状为(`batch_size`,1,`num_hiddens`)
query = tf.expand_dims(hidden_state[-1], axis=1)
# `context`的形状为 (`batch_size`, 1, `num_hiddens`)
# `context`的形状为(`batch_size`,1,`num_hiddens`)
context = self.attention(query, enc_outputs, enc_outputs,
enc_valid_lens, **kwargs)
# 在特征维度上连结
@@ -236,8 +236,8 @@ class Seq2SeqAttentionDecoder(AttentionDecoder):
hidden_state = out[1:]
outputs.append(out[0])
self._attention_weights.append(self.attention.attention_weights)
# 全连接层变换后, `outputs`的形状为
# (`num_steps`, `batch_size`, `vocab_size`)
# 全连接层变换后,`outputs`的形状为
# (`num_steps`,`batch_size`,`vocab_size`)
outputs = self.dense(tf.concat(outputs, axis=1))
return outputs, [enc_outputs, hidden_state, enc_valid_lens]
@@ -255,7 +255,7 @@ encoder.initialize()
decoder = Seq2SeqAttentionDecoder(vocab_size=10, embed_size=8, num_hiddens=16,
num_layers=2)
decoder.initialize()
X = d2l.zeros((4, 7)) # (`batch_size`, `num_steps`)
X = d2l.zeros((4, 7)) # (`batch_size`,`num_steps`)
state = decoder.init_state(encoder(X), None)
output, state = decoder(X, state)
output.shape, len(state), state[0].shape, len(state[1]), state[1][0].shape
@@ -269,7 +269,7 @@ encoder.eval()
decoder = Seq2SeqAttentionDecoder(vocab_size=10, embed_size=8, num_hiddens=16,
num_layers=2)
decoder.eval()
X = d2l.zeros((4, 7), dtype=torch.long) # (`batch_size`, `num_steps`)
X = d2l.zeros((4, 7), dtype=torch.long) # (`batch_size`,`num_steps`)
state = decoder.init_state(encoder(X), None)
output, state = decoder(X, state)
output.shape, len(state), state[0].shape, len(state[1]), state[1][0].shape
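And for the decoder hunks above, a rough shape walk-through of a single decoding step, assuming PyTorch. The tensors are random placeholders and uniform weights stand in for the additive-attention output, so this only exercises the shapes described in the comments (none of it comes from the changed files):

```python
import torch

batch_size, num_steps, num_hiddens, embed_size, num_layers = 4, 7, 16, 8, 2

enc_outputs = torch.randn(batch_size, num_steps, num_hiddens)
hidden_state = torch.randn(num_layers, batch_size, num_hiddens)
x = torch.randn(batch_size, embed_size)  # embedding of one decoder time step

# query: (batch_size, 1, num_hiddens), built from the top-layer hidden state
query = hidden_state[-1].unsqueeze(1)
# uniform stand-in for the attention weights: (batch_size, 1, num_steps)
weights = torch.full((batch_size, 1, num_steps), 1.0 / num_steps)
# context: (batch_size, 1, num_hiddens)
context = torch.bmm(weights, enc_outputs)
# concatenate context and the embedded input on the feature dimension
rnn_input = torch.cat((context, x.unsqueeze(1)), dim=-1)
print(rnn_input.shape)  # torch.Size([4, 1, 24]), i.e. embed_size + num_hiddens
```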