This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

Word embeddings update #159

Merged · 38 commits · Jul 1, 2018
Changes from 1 commit
Commits (38):
3ab1d2c
Mask accidental hits
leezu Jun 15, 2018
b69382e
Simplify frequent token subsampling
leezu Jun 15, 2018
a74ac4a
Remove tqdm dependency
leezu Jun 15, 2018
28528fa
Simplifications
leezu Jun 18, 2018
68185ac
Support read from vec format
leezu Jun 18, 2018
c9dc46a
Add back DeduplicatedFasttext
leezu Jun 20, 2018
d85af54
Average the subword embeddings for FastText
leezu Jun 21, 2018
73bde08
Fix Fasttext hash function for ngrams containing non-ASCII data
leezu Jun 21, 2018
517a37e
Merge train_word2vec and train_fasttext
leezu Jun 21, 2018
880acb9
Clean up fasttext evaluation binary script
leezu Jun 21, 2018
d4a3874
Remove waitall
leezu Jun 21, 2018
32a1839
Only evaluate at end of training by default
leezu Jun 22, 2018
68ad3f5
Set mxnet env variables
leezu Jun 22, 2018
0007f67
Increase number of subword units considered by default
leezu Jun 22, 2018
9fe1ca6
Update hyperparameters
leezu Jun 22, 2018
9700e11
Fix cbow
leezu Jun 22, 2018
61f9f5f
Use separate batch-size for evaluation
leezu Jun 22, 2018
b338e8d
Fix lint
leezu Jun 22, 2018
09fb6df
Rerun extended_results.ipynb and commit dependent results/*tvs files …
leezu Jun 25, 2018
e215118
Clean up TokenEmbedding API docs
leezu Jun 25, 2018
ab1b5ed
Refactor TokenEmbedding OOV inference
leezu Jun 25, 2018
4d02b7a
Use GluonNLP load_fasttext_model for word embeddings evaluation script
leezu Jun 25, 2018
f3b257b
Add tests
leezu Jun 25, 2018
6eb685f
Remove deprecated to_token_embedding method from train/embedding.py
leezu Jun 26, 2018
35bcb7b
Merge TokenEmbedding.extend in TokenEmbedding.__setitem__
leezu Jun 26, 2018
7da4c6f
Use full link to #11314
leezu Jun 26, 2018
08858d7
Improve test coverage
leezu Jun 26, 2018
5e960fa
Update notebook
leezu Jun 27, 2018
348c46f
Fix doc
leezu Jun 27, 2018
f5cfc84
Cache word ngram hashes
leezu Jun 27, 2018
7e531d4
Move results to dmlc/web-data
leezu Jun 29, 2018
897b000
Move candidate_sampler to scripts
leezu Jun 29, 2018
1637ef7
Update --negative doc
leezu Jun 29, 2018
546a9af
Match old default behavior of TokenEmbedding and add warnings
leezu Jun 29, 2018
4a32070
Match weight context in UnigramCandidateSampler
leezu Jun 29, 2018
c307061
Add Pad test case with empty ndarray input
leezu Jun 29, 2018
e90bd33
Address review comments
leezu Jun 29, 2018
9c79163
Fix doc and superfluous inheritance
leezu Jul 1, 2018
Address review comments
leezu committed Jul 1, 2018
commit e90bd33c3ebd6e196e39ecbe1a64e45c50c44c4f
30 changes: 30 additions & 0 deletions gluonnlp/base.py
@@ -0,0 +1,30 @@
# coding: utf-8

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# pylint: disable=abstract-method
"""Helper functions."""

__all__ = ['_str_types']

import sys

if sys.version_info[0] == 3:
    _str_types = (str, )
else:
    _str_types = (str, unicode)
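For orientation, a minimal sketch of how this compatibility tuple is meant to be used by callers (`check_token` is a hypothetical helper, not part of this PR):

```python
from gluonnlp.base import _str_types

def check_token(token):
    # True for str on Python 3, and for str or unicode on Python 2.
    return isinstance(token, _str_types)
```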
1 change: 0 additions & 1 deletion gluonnlp/data/candidate_sampler.py
@@ -21,7 +21,6 @@
__all__ = ['CandidateSampler', 'UnigramCandidateSampler']

import mxnet as mx
-import numpy as np


class CandidateSampler(object):
54 changes: 26 additions & 28 deletions gluonnlp/model/train/embedding.py
@@ -24,18 +24,13 @@

import logging
import struct
-import sys

import numpy as np
from mxnet import cpu, nd
from mxnet.gluon import Block, HybridBlock, nn

from ...vocab import create_subword_function

-if sys.version_info[0] == 3:
-    _str_types = (str, )
-else:
-    _str_types = (str, unicode)
+from ...base import _str_types


class EmbeddingModel(Block):
@@ -110,23 +105,27 @@ class SimpleEmbeddingModel(EmbeddingModel, Block):
        Initializer for the embeddings matrix.
    sparse_grad : bool, default True
        Specifies mxnet.gluon.nn.Embedding sparse_grad argument.
+    dtype : str, default 'float32'
+        dtype argument passed to gluon.nn.Embedding

    """

    def __init__(self, token_to_idx, embedding_size, weight_initializer=None,
-                 sparse_grad=True, **kwargs):
+                 sparse_grad=True, dtype='float32', **kwargs):
        assert isinstance(token_to_idx, dict)

        super(SimpleEmbeddingModel,
              self).__init__(embedding_size=embedding_size, **kwargs)
        self.token_to_idx = token_to_idx
        self.weight_initializer = weight_initializer
        self.sparse_grad = sparse_grad
+        self.dtype = dtype

        with self.name_scope():
            self.embedding = nn.Embedding(
                len(token_to_idx), embedding_size,
-                weight_initializer=weight_initializer, sparse_grad=sparse_grad)
+                weight_initializer=weight_initializer, sparse_grad=sparse_grad,
+                dtype=dtype)

    def __call__(self, words, wordsmask=None):
        return super(SimpleEmbeddingModel, self).__call__(words, wordsmask)
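To illustrate the new `dtype` argument, a hedged usage sketch (token mapping and sizes are made up; the import path is assumed to be `gluonnlp.model.train` as in this branch):

```python
import mxnet as mx
from gluonnlp.model.train import SimpleEmbeddingModel

token_to_idx = {'hello': 0, 'world': 1}
model = SimpleEmbeddingModel(token_to_idx=token_to_idx, embedding_size=8,
                             dtype='float16')  # dtype is forwarded to nn.Embedding
model.initialize()
vecs = model(mx.nd.array([0, 1]))  # (2, 8) embeddings, stored in float16
```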
@@ -139,7 +138,7 @@ def forward(self, words, wordsmask=None):
        words : mx.nd.NDArray
            Array of token indices.
        wordsmask : mx.nd.NDArray
-            Mask for embeddings returend by the word level embedding operator.
+            Mask for embeddings returned by the word level embedding operator.

Review comment (Member): returend -> returned

"""
#pylint: disable=arguments-differ
@@ -186,20 +185,19 @@ def __getitem__(self, tokens):

class _MaskedSumEmbedding(HybridBlock):
    def __init__(self, num_tokens, embedding_size, weight_initializer=None,
-                 sparse_grad=True, **kwargs):
+                 sparse_grad=True, dtype='float32', **kwargs):
        super(_MaskedSumEmbedding, self).__init__(**kwargs)
        self.num_tokens = num_tokens
        self.embedding_size = embedding_size
        self.weight_initializer = weight_initializer
        self.sparse_grad = sparse_grad
+        self.dtype = dtype

        with self.name_scope():
            self.embedding = nn.Embedding(
-                num_tokens,
-                embedding_size,
-                weight_initializer=weight_initializer,
-                sparse_grad=sparse_grad,
-            )
+                num_tokens, embedding_size,
+                weight_initializer=weight_initializer, sparse_grad=sparse_grad,
+                dtype=dtype)

    def hybrid_forward(self, F, x, mask):
        #pylint: disable=arguments-differ
@@ -208,7 +206,7 @@ def hybrid_forward(self, F, x, mask):
        return F.sum(masked_embeddings, axis=-2)
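The reduction above is a plain masked sum; a standalone sketch of the same operation with `mxnet.ndarray` (shapes are illustrative, not from the PR):

```python
import mxnet as mx

# Two words, each padded to 3 subword slots, embedding size 4.
emb = mx.nd.ones((2, 3, 4))                 # subword embeddings
mask = mx.nd.array([[1, 1, 0], [1, 0, 0]])  # 1 = real subword, 0 = padding
masked = emb * mask.expand_dims(-1)         # zero out padded slots
summed = masked.sum(axis=-2)                # (2, 4), as in hybrid_forward
```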


-class FasttextEmbeddingModel(EmbeddingModel, Block):
+class FasttextEmbeddingModel(EmbeddingModel):
"""FastText embedding model.

The FasttextEmbeddingModel combines a word level embedding matrix and a
@@ -234,32 +232,32 @@ class FasttextEmbeddingModel(EmbeddingModel, Block):
        Initializer for the embeddings and subword embeddings matrix.
    sparse_grad : bool, default True
        Specifies mxnet.gluon.nn.Embedding sparse_grad argument.
+    dtype : str, default 'float32'
+        dtype argument passed to gluon.nn.Embedding

    """
    FASTTEXT_FILEFORMAT_MAGIC = 793712314

    def __init__(self, token_to_idx, subword_function, embedding_size,
-                 weight_initializer=None, sparse_grad=True, **kwargs):
+                 weight_initializer=None, sparse_grad=True, dtype='float32',
+                 **kwargs):
        super(FasttextEmbeddingModel,
              self).__init__(embedding_size=embedding_size, **kwargs)
        self.token_to_idx = token_to_idx
        self.subword_function = subword_function
        self.weight_initializer = weight_initializer
        self.sparse_grad = sparse_grad
+        self.dtype = dtype

        with self.name_scope():
            self.embedding = nn.Embedding(
-                len(token_to_idx),
-                embedding_size,
-                weight_initializer=weight_initializer,
-                sparse_grad=sparse_grad,
-            )
+                len(token_to_idx), embedding_size,
+                weight_initializer=weight_initializer, sparse_grad=sparse_grad,
+                dtype=dtype)
            self.subword_embedding = _MaskedSumEmbedding(
-                len(subword_function),
-                embedding_size,
-                weight_initializer=weight_initializer,
-                sparse_grad=sparse_grad,
-            )
+                len(subword_function), embedding_size,
+                weight_initializer=weight_initializer, sparse_grad=sparse_grad,
+                dtype=dtype)

    @classmethod
    def load_fasttext_format(cls, path, ctx=cpu(), **kwargs):
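A hedged sketch of the intended use of this classmethod (the path is a placeholder; the `__getitem__`-based lookup follows the EmbeddingModel API shown earlier):

```python
from mxnet import cpu
from gluonnlp.model.train import FasttextEmbeddingModel

# Load a fastText binary model (.bin) that includes subword vectors.
model = FasttextEmbeddingModel.load_fasttext_format('wiki.simple.bin', ctx=cpu())
# Subword units allow embedding words never seen at training time.
vec = model['uncopyrightable']
```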
@@ -451,7 +449,7 @@ def forward(self, words, subwords, wordsmask=None, subwordsmask=None,
            subwords of the unique tokens in `words` with
            `words_to_unique_subwords_indices` containing the reverse mapping.
        wordsmask : mx.nd.NDArray, optional
-            Mask for embeddings returend by the word level embedding operator.
+            Mask for embeddings returned by the word level embedding operator.
        subwordsmask : mx.nd.NDArray, optional
            A mask for the subword embeddings looked up from `subwords`.
            Applied before sum reducing the subword embeddings.
6 changes: 1 addition & 5 deletions scripts/word_embeddings/candidate_sampler.py
@@ -18,13 +18,11 @@
# under the License.
"""Candidate samplers"""

-__all__ = ['UnigramCandidateSampler']
+__all__ = ['remove_accidental_hits']

import mxnet as mx
import numpy as np

-import gluonnlp as nlp
-
try:
    from numba import njit
    numba_njit = njit(nogil=True)
@@ -70,5 +68,3 @@ def remove_accidental_hits(candidates, true_samples, true_samples_mask=None):
        _candidates_mask(candidates_np, true_samples_np, true_samples_mask_np))

    return candidates, candidates_mask.as_in_context(candidates.context)
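To make the accidental-hit masking concrete, a hedged sketch (shapes, values, and the import path are made up): negatives drawn by the sampler can collide with the actual target, and the returned mask lets the training loss ignore those candidates.

```python
import mxnet as mx
from candidate_sampler import remove_accidental_hits  # scripts/word_embeddings

candidates = mx.nd.array([[3, 7, 2], [1, 5, 9]])  # sampled negatives, (batch, k)
true_samples = mx.nd.array([[7], [4]])            # true targets, (batch, 1)
candidates, mask = remove_accidental_hits(candidates, true_samples)
# Expected: mask[0] zeroes out the 7 (an accidental hit); row 1 stays all ones.
```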