This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Data cleanup and additions from the wikitables branch (#1084)
* Data cleanup and additions from the wikitables branch

* Added docs
matt-gardner authored Apr 13, 2018
1 parent 6829edb commit 78dc1df
Showing 10 changed files with 147 additions and 25 deletions.
9 changes: 5 additions & 4 deletions allennlp/data/dataset_readers/__init__.py
@@ -6,15 +6,16 @@
and produce datasets in the formats required by specific models.
"""

# pylint: disable=line-too-long
from allennlp.data.dataset_readers.conll2003 import Conll2003DatasetReader
from allennlp.data.dataset_readers.coreference_resolution import ConllCorefReader, WinobiasReader
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.dataset_readers.language_modeling import LanguageModelingReader
from allennlp.data.dataset_readers.penn_tree_bank import PennTreeBankConstituencySpanDatasetReader
from allennlp.data.dataset_readers.reading_comprehension import SquadReader, TriviaQaReader
from allennlp.data.dataset_readers.sequence_tagging import SequenceTaggingDatasetReader
from allennlp.data.dataset_readers.snli import SnliReader
from allennlp.data.dataset_readers.semantic_role_labeling import SrlReader
from allennlp.data.dataset_readers.seq2seq import Seq2SeqDatasetReader
from allennlp.data.dataset_readers.coreference_resolution import ConllCorefReader, WinobiasReader
from allennlp.data.dataset_readers.penn_tree_bank import PennTreeBankConstituencySpanDatasetReader
from allennlp.data.dataset_readers.sequence_tagging import SequenceTaggingDatasetReader
from allennlp.data.dataset_readers.snli import SnliReader
from allennlp.data.dataset_readers.stanford_sentiment_tree_bank import (
StanfordSentimentTreeBankDatasetReader)
9 changes: 5 additions & 4 deletions allennlp/data/fields/label_field.py
@@ -36,10 +36,11 @@ class LabelField(Field[torch.Tensor]):
If your labels are 0-indexed integers, you can pass in this flag, and we'll skip the indexing
step. If this is ``False`` and your labels are not strings, this throws a ``ConfigurationError``.
"""
# It is possible that users want to use this field with a namespace which uses OOV/PAD tokens.
# This warning will be repeated for every instantiation of this class (i.e for every data
# instance), spewing a lot of warnings so this class variable is used to only log a single
# warning per namespace.
# Most often, you probably don't want to have OOV/PAD tokens with a LabelField, so we warn you
# about it when you pick a namespace that will get these tokens by default. It is
# possible, however, that you _do_ actually want OOV/PAD tokens with this Field. This class
# variable is used to make sure that we only log a single warning for this per namespace, and
# not every time you create one of these Field objects.
_already_warned_namespaces: Set[str] = set()

def __init__(self,
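For reference, a minimal sketch (not part of this diff) of how the namespace choice plays out. The ``label_namespace`` argument name and the "*labels"/"*tags" non-padded convention are assumptions about the library defaults, not something shown in this diff:

from allennlp.data.fields import LabelField

# The default namespace "labels" is non-padded by convention, so no warning is logged.
label = LabelField("positive")

# A namespace that does not end in "labels" or "tags" gets OOV/PAD tokens by default,
# which triggers the warning described above (once per namespace, not per instance).
other_label = LabelField("positive", label_namespace="sentiment")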
3 changes: 2 additions & 1 deletion allennlp/data/iterators/__init__.py
@@ -4,6 +4,7 @@
"""

from allennlp.data.iterators.data_iterator import DataIterator
from allennlp.data.iterators.adaptive_iterator import AdaptiveIterator
from allennlp.data.iterators.basic_iterator import BasicIterator
from allennlp.data.iterators.bucket_iterator import BucketIterator
from allennlp.data.iterators.adaptive_iterator import AdaptiveIterator
from allennlp.data.iterators.epoch_tracking_bucket_iterator import EpochTrackingBucketIterator
77 changes: 77 additions & 0 deletions allennlp/data/iterators/epoch_tracking_bucket_iterator.py
@@ -0,0 +1,77 @@
import logging
from typing import List, Tuple, Dict, Iterable, Generator, Union
from collections import defaultdict

from overrides import overrides
import numpy

from allennlp.data.fields import MetadataField
from allennlp.data.instance import Instance
from allennlp.data.iterators.data_iterator import DataIterator
from allennlp.data.iterators.bucket_iterator import BucketIterator

logger = logging.getLogger(__name__) # pylint: disable=invalid-name


@DataIterator.register("epoch_tracking_bucket")
class EpochTrackingBucketIterator(BucketIterator):
"""
This is essentially a :class:`allennlp.data.iterators.BucketIterator` with just one difference.
It keeps track of the epoch number, and adds that as an additional meta field to each instance.
That way, ``Model.forward`` will have access to this information. We do this by keeping track of
epochs globally, and incrementing them whenever the iterator is called. However, the iterator is
called both for training and validation sets. So, we keep a dict of epoch numbers, one key per
dataset.
Parameters
----------
See :class:`BucketIterator`.
"""
def __init__(self,
sorting_keys: List[Tuple[str, str]],
padding_noise: float = 0.1,
biggest_batch_first: bool = False,
batch_size: int = 32,
instances_per_epoch: int = None,
max_instances_in_memory: int = None) -> None:
super(EpochTrackingBucketIterator, self).__init__(sorting_keys=sorting_keys,
padding_noise=padding_noise,
biggest_batch_first=biggest_batch_first,
batch_size=batch_size,
instances_per_epoch=instances_per_epoch,
max_instances_in_memory=max_instances_in_memory)
# Epoch number value per dataset.
self._global_epoch_nums: Dict[int, int] = defaultdict(int)

@overrides
def __call__(self,
instances: Iterable[Instance],
num_epochs: int = None,
shuffle: bool = True,
cuda_device: int = -1,
for_training: bool = True) -> Generator[Dict[str, Union[numpy.ndarray,
Dict[str, numpy.ndarray]]],
None, None]:
"""
See ``DataIterator.__call__`` for parameters.
"""
dataset_id = id(instances)
if num_epochs is None:
while True:
self._add_epoch_num_to_instances(instances, dataset_id)
yield from self._yield_one_epoch(instances, shuffle, cuda_device, for_training)
self._global_epoch_nums[dataset_id] += 1
else:
for _ in range(num_epochs):
self._add_epoch_num_to_instances(instances, dataset_id)
yield from self._yield_one_epoch(instances, shuffle, cuda_device, for_training)
self._global_epoch_nums[dataset_id] += 1

def _add_epoch_num_to_instances(self,
instances: Iterable[Instance],
dataset_id: int) -> None:
for instance in instances:
# TODO(pradeep): Mypy complains here most probably because ``fields`` is typed as a
# ``Mapping``, and not a ``Dict``. Ignoring this for now, but the type of fields
# probably needs to be changed.
instance.fields["epoch_num"] = MetadataField(self._global_epoch_nums[dataset_id]) #type: ignore
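As a usage sketch (not part of this commit; the "text" field name and the batch layout follow the test added at the bottom of this diff), the epoch number comes back as an ordinary key in each generated batch:

from allennlp.data.iterators import EpochTrackingBucketIterator

iterator = EpochTrackingBucketIterator(sorting_keys=[("text", "num_tokens")])
# ``train_instances`` is assumed to be an already-indexed collection of Instances.
for batch in iterator(train_instances, num_epochs=2):
    epoch_nums = batch["epoch_num"]  # one entry per instance in the batch, e.g. [0, 0, 0]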
8 changes: 6 additions & 2 deletions allennlp/data/tokenizers/token.py
@@ -1,15 +1,17 @@
class Token:
"""
A simple token representation, keeping track of the token's text, offset in the passage it was
taken from, POS tag, and dependency relation. These fields match spacy's exactly, so we can
just use a spacy token for this.
taken from, POS tag, dependency relation, and similar information. These fields match spacy's
exactly, so we can just use a spacy token for this.
Parameters
----------
text : ``str``, optional
The original text represented by this token.
idx : ``int``, optional
The character offset of this token into the tokenized passage.
lemma : ``str``, optional
The lemma of this token.
pos : ``str``, optional
The coarse-grained part of speech of this token.
tag : ``str``, optional
@@ -31,13 +33,15 @@ class Token:
def __init__(self,
text: str = None,
idx: int = None,
lemma: str = None,
pos: str = None,
tag: str = None,
dep: str = None,
ent_type: str = None,
text_id: int = None) -> None:
self.text = text
self.idx = idx
self.lemma_ = lemma
self.pos_ = pos
self.tag_ = tag
self.dep_ = dep
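A quick sketch (not in the diff) of constructing a ``Token`` with the new ``lemma`` argument; the example values are made up:

from allennlp.data.tokenizers.token import Token

token = Token(text="running", lemma="run", pos="VERB", tag="VBG")
assert token.lemma_ == "run"  # stored with the spacy-style trailing underscore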
9 changes: 8 additions & 1 deletion allennlp/data/tokenizers/word_stemmer.py
@@ -51,4 +51,11 @@ def __init__(self):
@overrides
def stem_word(self, word: Token) -> Token:
new_text = self.stemmer.stem(word.text)
return Token(new_text, word.idx, word.pos_, word.tag_, word.dep_, getattr(word, 'text_id', None))
return Token(text=new_text,
idx=word.idx,
lemma=word.lemma_,
pos=word.pos_,
tag=word.tag_,
dep=word.dep_,
ent_type=word.ent_type_,
text_id=getattr(word, 'text_id', None))
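Beyond threading through the new ``lemma`` field, switching to keyword arguments also fixes a subtle slip: with the old positional call, ``getattr(word, 'text_id', None)`` appears to have landed in the ``ent_type`` parameter. A small behavioral sketch (not part of the commit; it assumes the Porter stemmer class defined in this module and NLTK's usual "running" -> "run" output):

from allennlp.data.tokenizers.token import Token
from allennlp.data.tokenizers.word_stemmer import PorterStemmer  # assumed class name

stemmer = PorterStemmer()
stemmed = stemmer.stem_word(Token(text="running", lemma="run", pos="VERB"))
# Only the text changes; lemma_, pos_, tag_, dep_, ent_type_ and text_id carry over.
assert stemmed.text == "run" and stemmed.pos_ == "VERB"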
13 changes: 0 additions & 13 deletions allennlp/data/tokenizers/word_tokenizer.py
@@ -35,19 +35,6 @@ class WordTokenizer(Tokenizer):
If given, these tokens will be added to the beginning of every string we tokenize.
end_tokens : ``List[str]``, optional
If given, these tokens will be added to the end of every string we tokenize.
language : ``str``, optional
We use spacy to tokenize strings; this option specifies which language to use. By default
we use English.
pos_tags : ``bool``, optional
By default we do not load spacy's tagging model, to save loading time and memory. Set this
to ``True`` if you want to have access to spacy's POS tags in the returned tokens.
parse : ``bool``, optional
By default we do not load spacy's parsing model, to save loading time and memory. Set this
to ``True`` if you want to have access to spacy's dependency parse tags in the returned
tokens.
ner : ``bool``, optional
By default we do not load spacy's parsing model, to save loading time and memory. Set this
to ``True`` if you want to have access to spacy's NER tags in the returned tokens.
"""
def __init__(self,
word_splitter: WordSplitter = None,
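The removed parameters presumably now belong to the word splitter rather than to ``WordTokenizer`` itself. A hedged sketch under that assumption (``SpacyWordSplitter`` and its ``pos_tags`` flag are taken from the library, not from this diff):

from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

# spacy options (POS tagging, parsing, NER) are configured on the splitter.
tokenizer = WordTokenizer(word_splitter=SpacyWordSplitter(pos_tags=True))
tokens = tokenizer.tokenize("The dog barked.")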
10 changes: 10 additions & 0 deletions allennlp/data/vocabulary.py
@@ -100,6 +100,7 @@ def _read_pretrained_words(embeddings_filename: str)-> Set[str]:
words.add(word)
return words


class Vocabulary:
"""
A Vocabulary maps strings to integers, allowing for strings to be mapped to an
@@ -382,6 +383,12 @@ def from_params(cls, params: Params, instances: Iterable['adi.Instance'] = None)
pretrained_files=pretrained_files,
only_include_pretrained_words=only_include_pretrained_words)

def is_padded(self, namespace: str) -> bool:
"""
Returns whether or not there are padding and OOV tokens added to the given namespace.
"""
return self._index_to_token[namespace][0] == self._padding_token

def add_token_to_namespace(self, token: str, namespace: str = 'tokens') -> int:
"""
Adds ``token`` to the index, if it is not already present. Either way, we return the index of
@@ -401,6 +408,9 @@ def add_token_to_namespace(self, token: str, namespace: str = 'tokens') -> int:
def get_index_to_token_vocabulary(self, namespace: str = 'tokens') -> Dict[int, str]:
return self._index_to_token[namespace]

def get_token_to_index_vocabulary(self, namespace: str = 'tokens') -> Dict[str, int]:
return self._token_to_index[namespace]

def get_token_index(self, token: str, namespace: str = 'tokens') -> int:
if token in self._token_to_index[namespace]:
return self._token_to_index[namespace][token]
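A short sketch of the two new accessors (not part of the diff; the default padding/OOV token strings and the non-padded "*tags"/"*labels" convention are assumed library defaults):

from allennlp.data import Vocabulary

vocab = Vocabulary()
vocab.add_token_to_namespace("cat", namespace="tokens")
vocab.add_token_to_namespace("NOUN", namespace="pos_tags")

vocab.is_padded("tokens")     # True: index 0 holds the padding token
vocab.is_padded("pos_tags")   # False: "*tags" namespaces are non-padded by default
vocab.get_token_to_index_vocabulary("tokens")
# e.g. {'@@PADDING@@': 0, '@@UNKNOWN@@': 1, 'cat': 2}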
7 changes: 7 additions & 0 deletions doc/api/allennlp.data.iterators.rst
@@ -10,6 +10,7 @@ allennlp.data.iterators
* :ref:`AdaptiveIterator<adaptive-iterator>`
* :ref:`BasicIterator<basic-iterator>`
* :ref:`BucketIterator<bucket-iterator>`
* :ref:`EpochTrackingBucketIterator<epoch-tracking-bucket-iterator>`

.. _data-iterator:
.. automodule:: allennlp.data.iterators.data_iterator
@@ -34,3 +35,9 @@ allennlp.data.iterators
:members:
:undoc-members:
:show-inheritance:

.. _epoch-tracking-bucket-iterator:
.. automodule:: allennlp.data.iterators.epoch_tracking_bucket_iterator
:members:
:undoc-members:
:show-inheritance:
27 changes: 27 additions & 0 deletions tests/data/iterators/epoch_tracking_bucket_iterator_test.py
@@ -0,0 +1,27 @@
from allennlp.data.iterators import EpochTrackingBucketIterator
from tests.data.iterators.basic_iterator_test import IteratorTest


class EpochTrackingBucketIteratorTest(IteratorTest):
def setUp(self):
# The super class creates a self.instances field and populates it with some instances with
# TextFields.
super(EpochTrackingBucketIteratorTest, self).setUp()
self.iterator = EpochTrackingBucketIterator(sorting_keys=[["text", "num_tokens"]])
# We'll add more to create a second dataset.
self.more_instances = [
self.create_instance(["this", "is", "a", "sentence"]),
self.create_instance(["this", "is", "in", "the", "second", "dataset"]),
self.create_instance(["so", "is", "this", "one"])
]

def test_iterator_tracks_epochs_per_dataset(self):
generated_dataset1 = list(self.iterator(self.instances, num_epochs=2))
generated_dataset2 = list(self.iterator(self.more_instances, num_epochs=2))

# First dataset has five sentences. See ``IteratorTest.setUp``
assert generated_dataset1[0]["epoch_num"] == [0, 0, 0, 0, 0]
assert generated_dataset1[1]["epoch_num"] == [1, 1, 1, 1, 1]
# Second dataset has three sentences.
assert generated_dataset2[0]["epoch_num"] == [0, 0, 0]
assert generated_dataset2[1]["epoch_num"] == [1, 1, 1]
