
Commit
remove spaces in SpacyWordSplitter.batch_tokenize (#1156)
joelgrus authored May 1, 2018
1 parent d4b286c commit 60e03e3
Showing 2 changed files with 19 additions and 2 deletions.
8 changes: 6 additions & 2 deletions allennlp/data/tokenizers/word_splitter.py
@@ -2,6 +2,7 @@
 from typing import List
 
 from overrides import overrides
+import spacy
 
 from allennlp.common import Params, Registrable
 from allennlp.common.util import get_spacy_model
@@ -165,6 +166,8 @@ def from_params(cls, params: Params) -> 'WordSplitter':
         params.assert_empty(cls.__name__)
         return cls()
 
+def _remove_spaces(tokens: List[spacy.tokens.Token]) -> List[spacy.tokens.Token]:
+    return [token for token in tokens if not token.is_space]
 
 @WordSplitter.register('spacy')
 class SpacyWordSplitter(WordSplitter):
@@ -181,12 +184,13 @@ def __init__(self,

     @overrides
     def batch_split_words(self, sentences: List[str]) -> List[List[Token]]:
-        return self.spacy.pipe(sentences, n_threads=-1)
+        return [_remove_spaces(tokens)
+                for tokens in self.spacy.pipe(sentences, n_threads=-1)]
 
     @overrides
     def split_words(self, sentence: str) -> List[Token]:
         # This works because our Token class matches spacy's.
-        return [t for t in self.spacy(sentence) if not t.is_space]
+        return _remove_spaces(self.spacy(sentence))
 
     @classmethod
     def from_params(cls, params: Params) -> 'WordSplitter':
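
Not part of the commit: a minimal usage sketch of the new behavior, assuming an AllenNLP checkout from around this change with spaCy and its default English model installed. Both splitting paths now drop spaCy's whitespace-only tokens, so single and batch splitting agree.

from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

splitter = SpacyWordSplitter()
sentence = "This  sentence  has  double  spaces"  # spaCy emits is_space tokens for the extra whitespace

single = [t.text for t in splitter.split_words(sentence)]
batch = [[t.text for t in tokens] for tokens in splitter.batch_split_words([sentence])]

assert batch[0] == single  # both code paths now filter out the whitespace tokens
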
13 changes: 13 additions & 0 deletions tests/data/tokenizers/word_splitter_test.py
@@ -136,3 +136,16 @@ def test_tokenize_handles_special_cases(self):
                            "e.g.", ",", "the", "store"]
         tokens = [t.text for t in self.word_splitter.split_words(sentence)]
         assert tokens == expected_tokens
+
+    def test_batch_tokenization(self):
+        sentences = ["This is a sentence",
+                     "This isn't a sentence.",
+                     "This is the 3rd sentence.",
+                     "Here's the 'fourth' sentence."]
+        batch_split = self.word_splitter.batch_split_words(sentences)
+        separately_split = [self.word_splitter.split_words(sentence) for sentence in sentences]
+        assert len(batch_split) == len(separately_split)
+        for batch_sentence, separate_sentence in zip(batch_split, separately_split):
+            assert len(batch_sentence) == len(separate_sentence)
+            for batch_word, separate_word in zip(batch_sentence, separate_sentence):
+                assert batch_word.text == separate_word.text
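
Also not part of the commit: the commit title mentions batch_tokenize, and a rough sketch of that call path, assuming the WordTokenizer API of this era (batch_tokenize delegates to the splitter's batch_split_words), looks like this:

from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

# WordTokenizer.batch_tokenize hands the raw strings to its word splitter's
# batch_split_words, which is the method patched above.
tokenizer = WordTokenizer(word_splitter=SpacyWordSplitter())
token_lists = tokenizer.batch_tokenize(["This is a sentence", "This isn't a sentence."])
print([[token.text for token in tokens] for tokens in token_lists])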
