This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Data cleanup and additions from the wikitables branch (#1084)
* Data cleanup and additions from the wikitables branch

* Added docs
matt-gardner authored Apr 13, 2018
1 parent 6829edb commit 78dc1df
Showing 10 changed files with 147 additions and 25 deletions.
9 changes: 5 additions & 4 deletions allennlp/data/dataset_readers/__init__.py
@@ -6,15 +6,16 @@
and produce datasets in the formats required by specific models.
"""

# pylint: disable=line-too-long
from allennlp.data.dataset_readers.conll2003 import Conll2003DatasetReader
from allennlp.data.dataset_readers.coreference_resolution import ConllCorefReader, WinobiasReader
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.dataset_readers.language_modeling import LanguageModelingReader
from allennlp.data.dataset_readers.penn_tree_bank import PennTreeBankConstituencySpanDatasetReader
from allennlp.data.dataset_readers.reading_comprehension import SquadReader, TriviaQaReader
from allennlp.data.dataset_readers.sequence_tagging import SequenceTaggingDatasetReader
from allennlp.data.dataset_readers.snli import SnliReader
from allennlp.data.dataset_readers.semantic_role_labeling import SrlReader
from allennlp.data.dataset_readers.seq2seq import Seq2SeqDatasetReader
from allennlp.data.dataset_readers.coreference_resolution import ConllCorefReader, WinobiasReader
from allennlp.data.dataset_readers.penn_tree_bank import PennTreeBankConstituencySpanDatasetReader
from allennlp.data.dataset_readers.sequence_tagging import SequenceTaggingDatasetReader
from allennlp.data.dataset_readers.snli import SnliReader
from allennlp.data.dataset_readers.stanford_sentiment_tree_bank import (
StanfordSentimentTreeBankDatasetReader)
9 changes: 5 additions & 4 deletions allennlp/data/fields/label_field.py
@@ -36,10 +36,11 @@ class LabelField(Field[torch.Tensor]):
If your labels are 0-indexed integers, you can pass in this flag, and we'll skip the indexing
step. If this is ``False`` and your labels are not strings, this throws a ``ConfigurationError``.
"""
# It is possible that users want to use this field with a namespace which uses OOV/PAD tokens.
# This warning will be repeated for every instantiation of this class (i.e for every data
# instance), spewing a lot of warnings so this class variable is used to only log a single
# warning per namespace.
# Most often, you probably don't want to have OOV/PAD tokens with a LabelField, so we warn you
# about it when you pick a namespace that will get these tokens by default. It is
# possible, however, that you _do_ actually want OOV/PAD tokens with this Field. This class
# variable is used to make sure that we only log a single warning for this per namespace, and
# not every time you create one of these Field objects.
_already_warned_namespaces: Set[str] = set()

def __init__(self,
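For reference, a minimal sketch (not part of this diff) of how the namespace choice plays out. The ``label_namespace`` argument name and the "*labels"/"*tags" non-padded convention are assumptions about the library defaults, not something shown in this diff:

from allennlp.data.fields import LabelField

# The default namespace "labels" is non-padded by convention, so no warning is logged.
label = LabelField("positive")

# A namespace that does not end in "labels" or "tags" gets OOV/PAD tokens by default,
# which triggers the warning described above (once per namespace, not per instance).
other_label = LabelField("positive", label_namespace="sentiment")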
3 changes: 2 additions & 1 deletion allennlp/data/iterators/__init__.py
@@ -4,6 +4,7 @@
"""

from allennlp.data.iterators.data_iterator import DataIterator
from allennlp.data.iterators.adaptive_iterator import AdaptiveIterator
from allennlp.data.iterators.basic_iterator import BasicIterator
from allennlp.data.iterators.bucket_iterator import BucketIterator
from allennlp.data.iterators.adaptive_iterator import AdaptiveIterator
from allennlp.data.iterators.epoch_tracking_bucket_iterator import EpochTrackingBucketIterator
77 changes: 77 additions & 0 deletions allennlp/data/iterators/epoch_tracking_bucket_iterator.py
@@ -0,0 +1,77 @@
import logging
from typing import List, Tuple, Dict, Iterable, Generator, Union
from collections import defaultdict

from overrides import overrides
import numpy

from allennlp.data.fields import MetadataField
from allennlp.data.instance import Instance
from allennlp.data.iterators.data_iterator import DataIterator
from allennlp.data.iterators.bucket_iterator import BucketIterator

logger = logging.getLogger(__name__) # pylint: disable=invalid-name


@DataIterator.register("epoch_tracking_bucket")
class EpochTrackingBucketIterator(BucketIterator):
"""
This is essentially a :class:`allennlp.data.iterators.BucketIterator` with just one difference.
It keeps track of the epoch number, and adds that as an additional meta field to each instance.
That way, ``Model.forward`` will have access to this information. We do this by keeping track of
epochs globally, and incrementing them whenever the iterator is called. However, the iterator is
called both for training and validation sets. So, we keep a dict of epoch numbers, one key per
dataset.
Parameters
----------
See :class:`BucketIterator`.
"""
def __init__(self,
sorting_keys: List[Tuple[str, str]],
padding_noise: float = 0.1,
biggest_batch_first: bool = False,
batch_size: int = 32,
instances_per_epoch: int = None,
max_instances_in_memory: int = None) -> None:
super(EpochTrackingBucketIterator, self).__init__(sorting_keys=sorting_keys,
padding_noise=padding_noise,
biggest_batch_first=biggest_batch_first,
batch_size=batch_size,
instances_per_epoch=instances_per_epoch,
max_instances_in_memory=max_instances_in_memory)
# Epoch number value per dataset.
self._global_epoch_nums: Dict[int, int] = defaultdict(int)

@overrides
def __call__(self,
instances: Iterable[Instance],
num_epochs: int = None,
shuffle: bool = True,
cuda_device: int = -1,
for_training: bool = True) -> Generator[Dict[str, Union[numpy.ndarray,
Dict[str, numpy.ndarray]]],
None, None]:
"""
See ``DataIterator.__call__`` for parameters.
"""
dataset_id = id(instances)
if num_epochs is None:
while True:
self._add_epoch_num_to_instances(instances, dataset_id)
yield from self._yield_one_epoch(instances, shuffle, cuda_device, for_training)
self._global_epoch_nums[dataset_id] += 1
else:
for _ in range(num_epochs):
self._add_epoch_num_to_instances(instances, dataset_id)
yield from self._yield_one_epoch(instances, shuffle, cuda_device, for_training)
self._global_epoch_nums[dataset_id] += 1

def _add_epoch_num_to_instances(self,
instances: Iterable[Instance],
dataset_id: int) -> None:
for instance in instances:
# TODO(pradeep): Mypy complains here most probably because ``fields`` is typed as a
# ``Mapping``, and not a ``Dict``. Ignoring this for now, but the type of fields
# probably needs to be changed.
instance.fields["epoch_num"] = MetadataField(self._global_epoch_nums[dataset_id]) #type: ignore
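As a usage sketch (not part of this commit; the "text" field name and the batch layout follow the test added at the bottom of this diff), the epoch number comes back as an ordinary key in each generated batch:

from allennlp.data.iterators import EpochTrackingBucketIterator

iterator = EpochTrackingBucketIterator(sorting_keys=[("text", "num_tokens")])
# ``train_instances`` is assumed to be an already-indexed collection of Instances.
for batch in iterator(train_instances, num_epochs=2):
    epoch_nums = batch["epoch_num"]  # one entry per instance in the batch, e.g. [0, 0, 0]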
8 changes: 6 additions & 2 deletions allennlp/data/tokenizers/token.py
@@ -1,15 +1,17 @@
class Token:
"""
A simple token representation, keeping track of the token's text, offset in the passage it was
taken from, POS tag, and dependency relation. These fields match spacy's exactly, so we can
just use a spacy token for this.
taken from, POS tag, dependency relation, and similar information. These fields match spacy's
exactly, so we can just use a spacy token for this.
Parameters
----------
text : ``str``, optional
The original text represented by this token.
idx : ``int``, optional
The character offset of this token into the tokenized passage.
lemma : ``str``, optional
The lemma of this token.
pos : ``str``, optional
The coarse-grained part of speech of this token.
tag : ``str``, optional
@@ -31,13 +33,15 @@ class Token:
def __init__(self,
text: str = None,
idx: int = None,
lemma: str = None,
pos: str = None,
tag: str = None,
dep: str = None,
ent_type: str = None,
text_id: int = None) -> None:
self.text = text
self.idx = idx
self.lemma_ = lemma
self.pos_ = pos
self.tag_ = tag
self.dep_ = dep
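A quick sketch (not in the diff) of constructing a ``Token`` with the new ``lemma`` argument; the example values are made up:

from allennlp.data.tokenizers.token import Token

token = Token(text="running", lemma="run", pos="VERB", tag="VBG")
assert token.lemma_ == "run"  # stored with the spacy-style trailing underscore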
9 changes: 8 additions & 1 deletion allennlp/data/tokenizers/word_stemmer.py
@@ -51,4 +51,11 @@ def __init__(self):
@overrides
def stem_word(self, word: Token) -> Token:
new_text = self.stemmer.stem(word.text)
return Token(new_text, word.idx, word.pos_, word.tag_, word.dep_, getattr(word, 'text_id', None))
return Token(text=new_text,
idx=word.idx,
lemma=word.lemma_,
pos=word.pos_,
tag=word.tag_,
dep=word.dep_,
ent_type=word.ent_type_,
text_id=getattr(word, 'text_id', None))
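Beyond threading through the new ``lemma`` field, switching to keyword arguments also fixes a subtle slip: with the old positional call, ``getattr(word, 'text_id', None)`` appears to have landed in the ``ent_type`` parameter. A small behavioral sketch (not part of the commit; it assumes the Porter stemmer class defined in this module and NLTK's usual "running" -> "run" output):

from allennlp.data.tokenizers.token import Token
from allennlp.data.tokenizers.word_stemmer import PorterStemmer  # assumed class name

stemmer = PorterStemmer()
stemmed = stemmer.stem_word(Token(text="running", lemma="run", pos="VERB"))
# Only the text changes; lemma_, pos_, tag_, dep_, ent_type_ and text_id carry over.
assert stemmed.text == "run" and stemmed.pos_ == "VERB"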
13 changes: 0 additions & 13 deletions allennlp/data/tokenizers/word_tokenizer.py
@@ -35,19 +35,6 @@ class WordTokenizer(Tokenizer):
If given, these tokens will be added to the beginning of every string we tokenize.
end_tokens : ``List[str]``, optional
If given, these tokens will be added to the end of every string we tokenize.
language : ``str``, optional
We use spacy to tokenize strings; this option specifies which language to use. By default
we use English.
pos_tags : ``bool``, optional
By default we do not load spacy's tagging model, to save loading time and memory. Set this
to ``True`` if you want to have access to spacy's POS tags in the returned tokens.
parse : ``bool``, optional
By default we do not load spacy's parsing model, to save loading time and memory. Set this
to ``True`` if you want to have access to spacy's dependency parse tags in the returned
tokens.
ner : ``bool``, optional
By default we do not load spacy's parsing model, to save loading time and memory. Set this
to ``True`` if you want to have access to spacy's NER tags in the returned tokens.
"""
def __init__(self,
word_splitter: WordSplitter = None,
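The removed parameters presumably now belong to the word splitter rather than to ``WordTokenizer`` itself. A hedged sketch under that assumption (``SpacyWordSplitter`` and its ``pos_tags`` flag are taken from the library, not from this diff):

from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

# spacy options (POS tagging, parsing, NER) are configured on the splitter.
tokenizer = WordTokenizer(word_splitter=SpacyWordSplitter(pos_tags=True))
tokens = tokenizer.tokenize("The dog barked.")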
10 changes: 10 additions & 0 deletions allennlp/data/vocabulary.py
@@ -100,6 +100,7 @@ def _read_pretrained_words(embeddings_filename: str)-> Set[str]:
words.add(word)
return words


class Vocabulary:
"""
A Vocabulary maps strings to integers, allowing for strings to be mapped to an
@@ -382,6 +383,12 @@ def from_params(cls, params: Params, instances: Iterable['adi.Instance'] = None)
pretrained_files=pretrained_files,
only_include_pretrained_words=only_include_pretrained_words)

def is_padded(self, namespace: str) -> bool:
"""
Returns whether or not there are padding and OOV tokens added to the given namespace.
"""
return self._index_to_token[namespace][0] == self._padding_token

def add_token_to_namespace(self, token: str, namespace: str = 'tokens') -> int:
"""
Adds ``token`` to the index, if it is not already present. Either way, we return the index of
@@ -401,6 +408,9 @@ def add_token_to_namespace(self, token: str, namespace: str = 'tokens') -> int:
def get_index_to_token_vocabulary(self, namespace: str = 'tokens') -> Dict[int, str]:
return self._index_to_token[namespace]

def get_token_to_index_vocabulary(self, namespace: str = 'tokens') -> Dict[str, int]:
return self._token_to_index[namespace]

def get_token_index(self, token: str, namespace: str = 'tokens') -> int:
if token in self._token_to_index[namespace]:
return self._token_to_index[namespace][token]
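A short sketch of the two new accessors (not part of the diff; the default padding/OOV token strings and the non-padded "*tags"/"*labels" convention are assumed library defaults):

from allennlp.data import Vocabulary

vocab = Vocabulary()
vocab.add_token_to_namespace("cat", namespace="tokens")
vocab.add_token_to_namespace("NOUN", namespace="pos_tags")

vocab.is_padded("tokens")     # True: index 0 holds the padding token
vocab.is_padded("pos_tags")   # False: "*tags" namespaces are non-padded by default
vocab.get_token_to_index_vocabulary("tokens")
# e.g. {'@@PADDING@@': 0, '@@UNKNOWN@@': 1, 'cat': 2}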
7 changes: 7 additions & 0 deletions doc/api/allennlp.data.iterators.rst
@@ -10,6 +10,7 @@ allennlp.data.iterators
* :ref:`AdaptiveIterator<adaptive-iterator>`
* :ref:`BasicIterator<basic-iterator>`
* :ref:`BucketIterator<bucket-iterator>`
* :ref:`EpochTrackingBucketIterator<epoch-tracking-bucket-iterator>`

.. _data-iterator:
.. automodule:: allennlp.data.iterators.data_iterator
@@ -34,3 +35,9 @@ allennlp.data.iterators
:members:
:undoc-members:
:show-inheritance:

.. _epoch-tracking-bucket-iterator:
.. automodule:: allennlp.data.iterators.epoch_tracking_bucket_iterator
:members:
:undoc-members:
:show-inheritance:
27 changes: 27 additions & 0 deletions tests/data/iterators/epoch_tracking_bucket_iterator_test.py
@@ -0,0 +1,27 @@
from allennlp.data.iterators import EpochTrackingBucketIterator
from tests.data.iterators.basic_iterator_test import IteratorTest


class EpochTrackingBucketIteratorTest(IteratorTest):
def setUp(self):
# The super class creates a self.instances field and populates it with some instances with
# TextFields.
super(EpochTrackingBucketIteratorTest, self).setUp()
self.iterator = EpochTrackingBucketIterator(sorting_keys=[["text", "num_tokens"]])
# We'll add more to create a second dataset.
self.more_instances = [
self.create_instance(["this", "is", "a", "sentence"]),
self.create_instance(["this", "is", "in", "the", "second", "dataset"]),
self.create_instance(["so", "is", "this", "one"])
]

def test_iterator_tracks_epochs_per_dataset(self):
generated_dataset1 = list(self.iterator(self.instances, num_epochs=2))
generated_dataset2 = list(self.iterator(self.more_instances, num_epochs=2))

# First dataset has five sentences. See ``IteratorTest.setUp``
assert generated_dataset1[0]["epoch_num"] == [0, 0, 0, 0, 0]
assert generated_dataset1[1]["epoch_num"] == [1, 1, 1, 1, 1]
# Second dataset has three sentences.
assert generated_dataset2[0]["epoch_num"] == [0, 0, 0]
assert generated_dataset2[1]["epoch_num"] == [1, 1, 1]
