This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

Word embeddings update #159

Merged
merged 38 commits on Jul 1, 2018
Changes from 1 commit (commit 9700e11 "Fix cbow", shown below)

Commits (38)
3ab1d2c  Mask accidental hits (leezu, Jun 15, 2018)
b69382e  Simplify frequent token subsampling (leezu, Jun 15, 2018)
a74ac4a  Remove tqdm dependency (leezu, Jun 15, 2018)
28528fa  Simplifications (leezu, Jun 18, 2018)
68185ac  Support read from vec format (leezu, Jun 18, 2018)
c9dc46a  Add back DeduplicatedFasttext (leezu, Jun 20, 2018)
d85af54  Average the subword embeddings for FastText (leezu, Jun 21, 2018)
73bde08  Fix Fasttext hash function for ngrams containing non-ASCII data (leezu, Jun 21, 2018)
517a37e  Merge train_word2vec and train_fasttext (leezu, Jun 21, 2018)
880acb9  Clean up fasttext evaluation binary script (leezu, Jun 21, 2018)
d4a3874  Remove waitall (leezu, Jun 21, 2018)
32a1839  Only evaluate at end of training by default (leezu, Jun 22, 2018)
68ad3f5  Set mxnet env variables (leezu, Jun 22, 2018)
0007f67  Increase number of subword units considered by default (leezu, Jun 22, 2018)
9fe1ca6  Update hyperparameters (leezu, Jun 22, 2018)
9700e11  Fix cbow (leezu, Jun 22, 2018)
61f9f5f  Use separate batch-size for evaluation (leezu, Jun 22, 2018)
b338e8d  Fix lint (leezu, Jun 22, 2018)
09fb6df  Rerun extended_results.ipynb and commit dependant results/*tvs files … (leezu, Jun 25, 2018)
e215118  Clean up TokenEmbedding API docs (leezu, Jun 25, 2018)
ab1b5ed  Refactor TokenEmbedding OOV inference (leezu, Jun 25, 2018)
4d02b7a  Use GluonNLP load_fasttext_model for word embeddings evaluation script (leezu, Jun 25, 2018)
f3b257b  Add tests (leezu, Jun 25, 2018)
6eb685f  Remove deprecated to_token_embedding method from train/embedding.py (leezu, Jun 26, 2018)
35bcb7b  Merge TokenEmbedding.extend in TokenEmbedding.__setitem__ (leezu, Jun 26, 2018)
7da4c6f  Use full link to #11314 (leezu, Jun 26, 2018)
08858d7  Improve test coverage (leezu, Jun 26, 2018)
5e960fa  Update notebook (leezu, Jun 27, 2018)
348c46f  Fix doc (leezu, Jun 27, 2018)
f5cfc84  Cache word ngram hashes (leezu, Jun 27, 2018)
7e531d4  Move results to dmlc/web-data (leezu, Jun 29, 2018)
897b000  Move candidate_sampler to scripts (leezu, Jun 29, 2018)
1637ef7  Update --negative doc (leezu, Jun 29, 2018)
546a9af  Match old default behavior of TokenEmbedding and add warnings (leezu, Jun 29, 2018)
4a32070  Match weight context in UnigramCandidateSampler (leezu, Jun 29, 2018)
c307061  Add Pad test case with empty ndarray input (leezu, Jun 29, 2018)
e90bd33  Address review comments (leezu, Jun 29, 2018)
9c79163  Fix doc and superfluous inheritance (leezu, Jul 1, 2018)
Fix cbow
leezu committed Jul 1, 2018
commit 9700e1195289432f74d687f20f69c38f93711548
scripts/word_embeddings/train_fasttext.py (42 changes: 31 additions & 11 deletions)
@@ -327,9 +327,18 @@ def train(args):
             subwords, subwords_mask = \
                 indices_to_subwordindices_mask(unique, idx_to_subwordidxs)
         elif args.model.lower() == 'cbow':
-            subwords, subwords_mask = \
-                indices_to_subwordindices_mask(word_context,
-                                               idx_to_subwordidxs)
+            if args.no_deduplicate_words:
+                subwords, subwords_mask = \
+                    indices_to_subwordindices_mask(word_context,
+                                                   idx_to_subwordidxs)
+            else:
+                unique, inverse_unique_indices = np.unique(
+                    word_context.asnumpy(), return_inverse=True)
+                unique = mx.nd.array(unique)
+                inverse_unique_indices = mx.nd.array(
+                    inverse_unique_indices, ctx=context[0])
+                subwords, subwords_mask = \
+                    indices_to_subwordindices_mask(unique, idx_to_subwordidxs)
         else:
             logging.error('Unsupported model %s.', args.model)
             sys.exit(1)
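The deduplication branch added in this hunk is easier to see on toy data. Below is a minimal NumPy sketch (hypothetical subword_table standing in for indices_to_subwordindices_mask; not the PR's code): look up subword indices once per distinct context word, then map the results back through the inverse indices.

import numpy as np

# Toy batch of context-word ids; real batches contain many repeated ids.
word_context = np.array([[3, 7, 3],
                         [7, 2, 2]])

# Deduplicate: `unique` holds each id once, `inverse` maps every flattened
# position back to a row of `unique` (the role of inverse_unique_indices above).
unique, inverse = np.unique(word_context.ravel(), return_inverse=True)
# unique  == [2, 3, 7]
# inverse == [1, 2, 1, 2, 0, 0]

# Hypothetical subword lookup table, standing in for indices_to_subwordindices_mask.
subword_table = {2: [11, 12], 3: [13], 7: [14, 15, 16]}
unique_subwords = [subword_table[int(i)] for i in unique]  # one lookup per distinct id

# Scatter the deduplicated results back to the original flattened positions.
per_position_subwords = [unique_subwords[i] for i in inverse]
assert per_position_subwords[0] == subword_table[3]

The training script does the same with mx.nd arrays, so the subword computation scales with the number of distinct words per batch rather than with the batch size.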
@@ -377,25 +386,36 @@ def train(args):
                 mx.nd.zeros_like(negatives), dim=1)

         elif args.model.lower() == 'cbow':
-            emb_in = embedding(word_context, word_context_mask,
-                               subwords, subwords_mask).sum(
-                                   axis=-2, keepdims=True)
+            word_context = word_context.reshape((-3, 1))
+            word_context_mask = word_context_mask.reshape((-3, 1))
+            if args.ngram_buckets and args.no_deduplicate_words:
+                emb_in = embedding(word_context, word_context_mask,
+                                   subwords, subwords_mask).sum(
+                                       axis=-2, keepdims=True)
+            elif args.ngram_buckets:
+                emb_in = embedding(word_context, word_context_mask,
+                                   subwords, subwords_mask,
+                                   inverse_unique_indices)
+            else:
+                emb_in = embedding(word_context, word_context_mask)
+
             with mx.autograd.pause():
+                center = center.tile(args.window * 2).reshape((-1, 1))
+                negatives = negatives.reshape((-1, args.negative))

                 center_negatives = mx.nd.concat(
-                    center.expand_dims(1), negatives, dim=1)
-                center_negatives_mask = mx.nd.concat(
-                    center_mask.expand_dims(1),
-                    mx.nd.ones_like(negatives), dim=1)
+                    center, negatives, dim=1)
+                center_negatives_mask = mx.nd.ones_like(
+                    center_negatives)

             emb_out = embedding_out(center_negatives,
                                     center_negatives_mask)

             # Compute loss
             pred = mx.nd.batch_dot(emb_in, emb_out.swapaxes(1, 2))
-            pred = pred.reshape((-1, 1 + args.negative))
+            pred = pred.squeeze() * word_context_mask
Review comment (Contributor):

I understand we want to minimize l(s(w_t, w_c)) + l(-s(w_t, w_c)), but what does word_context_mask mean here?
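For background on the objective the comment refers to: with w_t the center word, w_c an observed context word, w_{n_1}, ..., w_{n_K} the sampled negatives, sigma the logistic sigmoid, and s a dot-product score, the standard negative-sampling loss is (textbook form, not a claim about how this script handles masked positions):

    \mathcal{L}(w_t, w_c) = -\log \sigma\big(s(w_t, w_c)\big) - \sum_{k=1}^{K} \log \sigma\big(-s(w_t, w_{n_k})\big)

A plausible reading of the diff is that word_context_mask marks the valid (non-padded) context positions of variable-length windows, so padded positions should not contribute to this sum; whether multiplying pred by the mask achieves that is exactly what this comment questions.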

             label = mx.nd.concat(
-                mx.nd.ones_like(center).expand_dims(1),
+                mx.nd.ones_like(word_context),
                 mx.nd.zeros_like(negatives), dim=1)

         loss = loss_function(pred, label)
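A note on the reshape/tile pattern introduced in this commit: MXNet's reshape treats -3 as "merge the next two input dimensions", so a (batch, 2*window) block of context ids becomes one row per context word, and tiling the centers lines each center up with its 2*window flattened contexts. A minimal sketch with toy shapes (assuming center arrives with shape (batch, 1); not the PR's data pipeline, and reps is passed as a tuple here rather than the int used in the script):

import mxnet as mx

window = 2
word_context = mx.nd.array([[11, 12, 13, 14],
                            [21, 22, 23, 24]])   # (batch, 2*window) context ids
center = mx.nd.array([[1], [2]])                 # assumed shape (batch, 1)

# -3 merges the batch and window axes: one row per (center, context) pair.
flat_context = word_context.reshape((-3, 1))                      # (batch * 2*window, 1)
# Repeat each center 2*window times so it lines up with its flattened contexts.
flat_center = center.tile(reps=(1, window * 2)).reshape((-1, 1))  # (batch * 2*window, 1)

print(flat_context.asnumpy().ravel())  # [11. 12. 13. 14. 21. 22. 23. 24.]
print(flat_center.asnumpy().ravel())   # [ 1.  1.  1.  1.  2.  2.  2.  2.]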