This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

Word embeddings update #159

Merged

38 commits merged on Jul 1, 2018

Changes from 1 commit

Commits (38)
3ab1d2c
Mask accidental hits
leezu Jun 15, 2018
b69382e
Simplify frequent token subsampling
leezu Jun 15, 2018
a74ac4a
Remove tqdm dependency
leezu Jun 15, 2018
28528fa
Simplifications
leezu Jun 18, 2018
68185ac
Support read from vec format
leezu Jun 18, 2018
c9dc46a
Add back DeduplicatedFasttext
leezu Jun 20, 2018
d85af54
Average the subword embeddings for FastText
leezu Jun 21, 2018
73bde08
Fix Fasttext hash function for ngrams containing non-ASCII data
leezu Jun 21, 2018
517a37e
Merge train_word2vec and train_fasttext
leezu Jun 21, 2018
880acb9
Clean up fasttext evaluation binary script
leezu Jun 21, 2018
d4a3874
Remove waitall
leezu Jun 21, 2018
32a1839
Only evaluate at end of training by default
leezu Jun 22, 2018
68ad3f5
Set mxnet env variables
leezu Jun 22, 2018
0007f67
Increase number of subword units considered by default
leezu Jun 22, 2018
9fe1ca6
Update hyperparameters
leezu Jun 22, 2018
9700e11
Fix cbow
leezu Jun 22, 2018
61f9f5f
Use separate batch-size for evaluation
leezu Jun 22, 2018
b338e8d
Fix lint
leezu Jun 22, 2018
09fb6df
Rerun extended_results.ipynb and commit dependant results/*tvs files …
leezu Jun 25, 2018
e215118
Clean up TokenEmbedding API docs
leezu Jun 25, 2018
ab1b5ed
Refactor TokenEmbedding OOV inference
leezu Jun 25, 2018
4d02b7a
Use GluonNLP load_fasttext_model for word embeddings evaluation script
leezu Jun 25, 2018
f3b257b
Add tests
leezu Jun 25, 2018
6eb685f
Remove deprecated to_token_embedding method from train/embedding.py
leezu Jun 26, 2018
35bcb7b
Merge TokenEmbedding.extend in TokenEmbedding.__setitem__
leezu Jun 26, 2018
7da4c6f
Use full link to #11314
leezu Jun 26, 2018
08858d7
Improve test coverage
leezu Jun 26, 2018
5e960fa
Update notebook
leezu Jun 27, 2018
348c46f
Fix doc
leezu Jun 27, 2018
f5cfc84
Cache word ngram hashes
leezu Jun 27, 2018
7e531d4
Move results to dmlc/web-data
leezu Jun 29, 2018
897b000
Move candidate_sampler to scripts
leezu Jun 29, 2018
1637ef7
Update --negative doc
leezu Jun 29, 2018
546a9af
Match old default behavior of TokenEmbedding and add warnings
leezu Jun 29, 2018
4a32070
Match weight context in UnigramCandidateSampler
leezu Jun 29, 2018
c307061
Add Pad test case with empty ndarray input
leezu Jun 29, 2018
e90bd33
Address review comments
leezu Jun 29, 2018
9c79163
Fix doc and superfluous inheritance
leezu Jul 1, 2018
Merge TokenEmbedding.extend in TokenEmbedding.__setitem__
Previously __setitem__ was only allowed to update known tokens.
leezu committed Jul 1, 2018
commit 35bcb7bc0a2852b052b705ac945366275313a96d
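For orientation, here is a minimal usage sketch of the change (assuming GluonNLP with this patch and MXNet installed; the tokens and vectors below are made up for illustration):

```python
from mxnet import nd
import gluonnlp as nlp

# A fresh TokenEmbedding without a special unknown token; after this change
# allow_extend defaults to True.
embedding = nlp.embedding.TokenEmbedding(unknown_token=None)

# Before this commit, brand-new tokens required a separate method:
#     embedding.extend(['hello', 'world'], nd.array([[1, 2, 3], [4, 5, 6]]))
# Now item assignment covers both cases:
embedding[['hello', 'world']] = nd.array([[1, 2, 3], [4, 5, 6]])  # add unknown tokens
embedding['hello'] = nd.array([7, 8, 9])                          # update a known token
```

With allow_extend=False the previous behavior is kept: assigning a vector for an unseen token raises KeyError.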
86 changes: 47 additions & 39 deletions gluonnlp/embedding/token_embedding.py
@@ -158,20 +158,26 @@ class TokenEmbedding(object):
init_unknown_vec : callback
The callback used to initialize the embedding vector for the unknown
token. Only used if `unknown_token` is not None.
allow_extend : bool, default True
Review comment (Member): default True (1 space)

Reply (Contributor Author): Thanks

If True, embedding vectors for previously unknown words can be added
via token_embedding[tokens] = vecs. If False, only vectors for known
tokens can be updated.
unknown_lookup : object subscriptable with list of tokens returning nd.NDArray, default None
If not None, unknown_lookup[tokens] is called for any unknown tokens.
The result is cached if unknown_autoextend is True.
unknown_autoextend : bool, default True
If True, any unknown token for which a vector was looked up in
unknown_lookup together with the resulting vector will be added to
token_to_idx, idx_to_token and idx_to_vec, adding a new index.
token_to_idx, idx_to_token and idx_to_vec, adding a new index. This
option is ignored if allow_extend is False.

"""

def __init__(self, unknown_token='<unk>', init_unknown_vec=nd.zeros,
def __init__(self, unknown_token='<unk>', init_unknown_vec=nd.zeros, allow_extend=True,
unknown_lookup=None, unknown_autoextend=True):
Review comment (Member): set default values so that it's consistent with existing behavior

Reply (Contributor Author): Done

self._unknown_token = unknown_token
self._init_unknown_vec = init_unknown_vec
self._allow_extend = allow_extend
self._unknown_lookup = unknown_lookup
self._unknown_autoextend = unknown_autoextend
self._idx_to_token = [unknown_token] if unknown_token else []
@@ -424,6 +430,23 @@ def unknown_token(self):
"""
return self._unknown_token

@property
def allow_extend(self):
"""Allow extension of the TokenEmbedding with new tokens.

If True, `TokenEmbedding[tokens] = vec` can introduce new tokens that
were previously unknown. New indices will be assigned to the newly
introduced tokens. If False, only known tokens can be updated.

Returns
-------
bool:
Extension of the TokenEmbedding is allowed.

"""
return self._allow_extend


@property
def unknown_lookup(self):
"""Vector lookup for unknown tokens.
@@ -505,7 +528,7 @@ def __getitem__(self, tokens):
else:
if self.unknown_lookup is not None and self.unknown_autoextend:
new_tokens = [t for t in tokens if t not in self.token_to_idx]
self.extend(new_tokens, self.unknown_lookup[new_tokens])
self[new_tokens] = self.unknown_lookup[new_tokens]

indices = [self._token_to_idx[token] for token in tokens]
vecs = nd.Embedding(
@@ -515,7 +538,7 @@ def __getitem__(self, tokens):
return vecs[0] if to_reduce else vecs

def _check_vector_update(self, tokens, new_embedding):
"""Check that tokens and embedding are in the format for __setitem__ and extend."""
"""Check that tokens and embedding are in the format for __setitem__."""
assert self._idx_to_vec is not None, '`idx_to_vec` has not been initialized.'

if not isinstance(tokens, (list, tuple)) or len(tokens) == 1:
@@ -539,6 +562,8 @@ def _check_vector_update(self, tokens, new_embedding):
def __setitem__(self, tokens, new_embedding):
"""Updates embedding vectors for tokens.

If self.allow_extend is True, vectors for previously unknown tokens can be introduced.

Parameters
----------
tokens : hashable object or a list or tuple of hashable objects
@@ -549,8 +574,26 @@ def _check_vector_update(self, tokens, new_embedding):
the glossary. If `tokens` is a singleton, it must be 1-D or 2-D. If `tokens` is a list
of multiple strings, it must be 2-D.
"""
if self.allow_extend and self._idx_to_vec is None:
# Initialize self._idx_to_vec
assert C.UNK_IDX == 0
self._idx_to_vec = self._init_unknown_vec(shape=(1, new_embedding.shape[-1]))

tokens = self._check_vector_update(tokens, new_embedding)

if self.allow_extend:
# Add new / previously unknown tokens
for token in filter(lambda t: t not in self._token_to_idx, tokens):
idx = len(self._token_to_idx)
self._token_to_idx[token] = idx
self._idx_to_token.append(token)

# Extend shape of idx_to_vec
idx_to_vec = nd.zeros(shape=(len(self._token_to_idx),
self.idx_to_vec.shape[1]))
idx_to_vec[:self.idx_to_vec.shape[0]] = self._idx_to_vec
self._idx_to_vec = idx_to_vec

indices = []
for token in tokens:
if token in self._token_to_idx:
@@ -568,41 +611,6 @@ def __setitem__(self, tokens, new_embedding):

self._idx_to_vec[nd.array(indices)] = new_embedding

def extend(self, tokens, embedding):
"""Adds tokens using the vectors in embedding.

Parameters
----------
tokens : hashable object or a list or tuple of hashable objects
A token or a list of tokens whose embedding vector are to be updated.
embedding : mxnet.ndarray.NDArray
An NDArray to be assigned to the embedding vectors of `tokens`. Its length must be equal
to the number of `tokens` and its width must be equal to the dimension of embedding of
the glossary. If `tokens` is a singleton, it must be 1-D or 2-D. If `tokens` is a list
of multiple strings, it must be 2-D.
"""
if self._idx_to_vec is None:
assert C.UNK_IDX == 0
self._idx_to_vec = self._init_unknown_vec(shape=(1, embedding.shape[-1]))

tokens = self._check_vector_update(tokens, embedding)

for token in tokens:
if token in self._token_to_idx:
raise KeyError('Token "{token}" is known. '
'Use `token_embedding["{token}"] = embedding` '
' to update its embedding '.format(token=token))

idx_to_vec = nd.empty(shape=(self.idx_to_vec.shape[0] + len(tokens),
self.idx_to_vec.shape[1]))
idx_to_vec[:self.idx_to_vec.shape[0]] = self._idx_to_vec
idx_to_vec[self.idx_to_vec.shape[0]:] = embedding

self._token_to_idx.update(
(token, i) for i, token in enumerate(tokens, self.idx_to_vec.shape[0]))
self._idx_to_vec = idx_to_vec
self._idx_to_token += tokens

@classmethod
def _check_source(cls, source):
"""Checks if a pre-trained token embedding source name is valid.
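To tie the pieces above together, here is a sketch of the auto-extension path: __getitem__ consults unknown_lookup for missing tokens and routes the result through the new __setitem__, which assigns fresh indices and grows idx_to_vec. The ZeroLookup class below is a toy stand-in; any object subscriptable with a list of tokens and returning an nd.NDArray (for example a trained fastText model) would do:

```python
from mxnet import nd
import gluonnlp as nlp

class ZeroLookup(object):
    """Toy stand-in for `unknown_lookup`: subscriptable with a list of tokens,
    returns an NDArray with one row per token."""
    def __init__(self, dim):
        self.dim = dim

    def __getitem__(self, tokens):
        return nd.zeros((len(tokens), self.dim))

embedding = nlp.embedding.TokenEmbedding(
    unknown_token=None, allow_extend=True,
    unknown_lookup=ZeroLookup(5), unknown_autoextend=True)

# Seed two known tokens; this also initializes idx_to_vec.
embedding[['hello', 'world']] = nd.ones((2, 5))

# 'rare' is unknown: __getitem__ queries unknown_lookup and, because both
# unknown_autoextend and allow_extend are True, caches the result through
# __setitem__, assigning a new index and growing idx_to_vec by one row.
vecs = embedding[['hello', 'rare']]
print('rare' in embedding.token_to_idx)  # expected: True
```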
2 changes: 1 addition & 1 deletion scripts/word_embeddings/evaluate_pretrained.py
@@ -144,7 +144,7 @@ def load_embedding_from_path(args):
'for {} words.'.format(len(token_set))):
embedding = nlp.embedding.TokenEmbedding(unknown_token=None)
idx_to_tokens = list(token_set)
embedding.extend(idx_to_tokens, model[idx_to_tokens])
embedding[idx_to_tokens] = model[idx_to_tokens]

else:
embedding = nlp.embedding.TokenEmbedding.from_file(args.embedding_path)
2 changes: 1 addition & 1 deletion scripts/word_embeddings/train_fasttext.py
@@ -463,7 +463,7 @@ def evaluate(args, embedding, vocab, global_step, eval_analogy=False):
mx.nd.waitall()

token_embedding = nlp.embedding.TokenEmbedding(unknown_token=None)
token_embedding.extend(eval_tokens, embedding[eval_tokens])
token_embedding[eval_tokens] = embedding[eval_tokens]

results = evaluation.evaluate_similarity(
args, token_embedding, context[0], logfile=os.path.join(
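Both script changes above are instances of the same pattern: build a throwaway TokenEmbedding with unknown_token=None and fill it by item assignment. A condensed, self-contained sketch (the vector source and token list below are hypothetical stand-ins for the model and evaluation vocabulary the scripts construct elsewhere):

```python
from mxnet import nd
import gluonnlp as nlp

class ToyVectorSource(object):
    """Placeholder for the trained model the scripts load elsewhere; any object
    subscriptable with a list of tokens and returning an NDArray works."""
    def __getitem__(self, tokens):
        return nd.ones((len(tokens), 3))

model = ToyVectorSource()             # hypothetical stand-in
eval_tokens = ['cat', 'dog', 'fish']  # hypothetical evaluation vocabulary

token_embedding = nlp.embedding.TokenEmbedding(unknown_token=None)
token_embedding[eval_tokens] = model[eval_tokens]

# Only the assigned tokens can be looked up afterwards: with unknown_token=None
# and no unknown_lookup, querying any other token raises a KeyError.
print(token_embedding['cat'].shape)   # expected: (3,)
```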
27 changes: 16 additions & 11 deletions tests/unittest/test_vocab_embed.py
@@ -23,6 +23,7 @@
import re
import os
import sys
import functools

import pytest

@@ -370,17 +371,20 @@ def _mk_my_invalid_pretrain_file2(path, token_delim, pretrain_file):
fout.write(seqs)


def test_token_embedding_from_file(tmpdir):
@pytest.mark.parametrize('allow_extend', [True, False])
def test_token_embedding_from_file(tmpdir, allow_extend):
embed_root = str(tmpdir)
embed_name = 'my_embed'
elem_delim = '\t'
pretrain_file = 'my_pretrain_file.txt'

from_file = functools.partial(nlp.embedding.TokenEmbedding.from_file, allow_extend=allow_extend)

_mk_my_pretrain_file(os.path.join(embed_root, embed_name), elem_delim, pretrain_file)

pretrain_file_path = os.path.join(embed_root, embed_name, pretrain_file)

my_embed = nlp.embedding.TokenEmbedding.from_file(pretrain_file_path, elem_delim)
my_embed = from_file(pretrain_file_path, elem_delim)

assert 'a' in my_embed
assert my_embed.unknown_token == '<unk>'
@@ -406,11 +410,16 @@ def test_token_embedding_from_file(tmpdir):
a_vec = my_embed['a']
assert_almost_equal(a_vec.asnumpy(), np.array([0.1, 0.2, 0.3, 0.4, 0.5]))

my_embed = from_file(pretrain_file_path, elem_delim)
# Test __setitem__.
my_embed['a'] = nd.array([1, 2, 3, 4, 5])
assert_almost_equal(my_embed['a'].asnumpy(), np.array([1, 2, 3, 4, 5]))
with pytest.raises(KeyError):
if allow_extend:
my_embed['unknown$$$'] = nd.array([0, 0, 0, 0, 0])
assert_almost_equal(my_embed['unknown$$$'].asnumpy(), np.array([0, 0, 0, 0, 0]))
else:
with pytest.raises(KeyError):
my_embed['unknown$$$'] = nd.array([0, 0, 0, 0, 0])
with pytest.raises(AssertionError):
my_embed['<unk>'] = nd.array([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]])
with pytest.raises(AssertionError):
@@ -423,17 +432,13 @@ def test_token_embedding_from_file(tmpdir):
pretrain_file2 = 'my_pretrain_file2.txt'
_mk_my_pretrain_file3(os.path.join(embed_root, embed_name), elem_delim, pretrain_file2)
pretrain_file_path = os.path.join(embed_root, embed_name, pretrain_file2)
my_embed2 = nlp.embedding.TokenEmbedding.from_file(pretrain_file_path, elem_delim,
init_unknown_vec=nd.ones,
unknown_token='<unk>')
my_embed2 = from_file(pretrain_file_path, elem_delim, init_unknown_vec=nd.ones, unknown_token='<unk>')
unk_vec2 = my_embed2['<unk>']
assert_almost_equal(unk_vec2.asnumpy(), np.array([1, 1, 1, 1, 1]))
unk_vec2 = my_embed2['<unk$unk@unk>']
assert_almost_equal(unk_vec2.asnumpy(), np.array([1, 1, 1, 1, 1]))

my_embed3 = nlp.embedding.TokenEmbedding.from_file(pretrain_file_path, elem_delim,
init_unknown_vec=nd.ones,
unknown_token='<unk1>')
my_embed3 = from_file(pretrain_file_path, elem_delim, init_unknown_vec=nd.ones, unknown_token='<unk1>')
unk_vec3 = my_embed3['<unk1>']
assert_almost_equal(unk_vec3.asnumpy(), np.array([1.1, 1.2, 1.3, 1.4, 1.5]))
unk_vec3 = my_embed3['<unk$unk@unk>']
@@ -445,14 +450,14 @@ def test_token_embedding_from_file(tmpdir):
invalid_pretrain_file)
pretrain_file_path = os.path.join(embed_root, embed_name, invalid_pretrain_file)
with pytest.raises(AssertionError):
nlp.embedding.TokenEmbedding.from_file(pretrain_file_path, elem_delim)
from_file(pretrain_file_path, elem_delim)

invalid_pretrain_file2 = 'invalid_pretrain_file2.txt'
_mk_my_invalid_pretrain_file2(os.path.join(embed_root, embed_name), elem_delim,
invalid_pretrain_file2)
pretrain_file_path = os.path.join(embed_root, embed_name, invalid_pretrain_file2)
with pytest.raises(AssertionError):
nlp.embedding.TokenEmbedding.from_file(pretrain_file_path, elem_delim)
from_file(pretrain_file_path, elem_delim)


def test_embedding_get_and_pretrain_file_names():
Expand Down