
Commit
remove spaces in SpacyWordSplitter.batch_tokenize (#1156)
joelgrus authored May 1, 2018
1 parent d4b286c commit 60e03e3
Showing 2 changed files with 19 additions and 2 deletions.
8 changes: 6 additions & 2 deletions allennlp/data/tokenizers/word_splitter.py
@@ -2,6 +2,7 @@
 from typing import List
 
 from overrides import overrides
+import spacy
 
 from allennlp.common import Params, Registrable
 from allennlp.common.util import get_spacy_model
@@ -165,6 +166,8 @@ def from_params(cls, params: Params) -> 'WordSplitter':
         params.assert_empty(cls.__name__)
         return cls()
 
+def _remove_spaces(tokens: List[spacy.tokens.Token]) -> List[spacy.tokens.Token]:
+    return [token for token in tokens if not token.is_space]
 
 @WordSplitter.register('spacy')
 class SpacyWordSplitter(WordSplitter):
@@ -181,12 +184,13 @@ def __init__(self,

     @overrides
     def batch_split_words(self, sentences: List[str]) -> List[List[Token]]:
-        return self.spacy.pipe(sentences, n_threads=-1)
+        return [_remove_spaces(tokens)
+                for tokens in self.spacy.pipe(sentences, n_threads=-1)]
 
     @overrides
     def split_words(self, sentence: str) -> List[Token]:
         # This works because our Token class matches spacy's.
-        return [t for t in self.spacy(sentence) if not t.is_space]
+        return _remove_spaces(self.spacy(sentence))
 
     @classmethod
     def from_params(cls, params: Params) -> 'WordSplitter':
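
Not part of the commit: a minimal usage sketch of the new behavior, assuming an AllenNLP checkout from around this change with spaCy and its default English model installed. Both splitting paths now drop spaCy's whitespace-only tokens, so single and batch splitting agree.

from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

splitter = SpacyWordSplitter()
sentence = "This  sentence  has  double  spaces"  # spaCy emits is_space tokens for the extra whitespace

single = [t.text for t in splitter.split_words(sentence)]
batch = [[t.text for t in tokens] for tokens in splitter.batch_split_words([sentence])]

assert batch[0] == single  # both code paths now filter out the whitespace tokens
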
13 changes: 13 additions & 0 deletions tests/data/tokenizers/word_splitter_test.py
@@ -136,3 +136,16 @@ def test_tokenize_handles_special_cases(self):
                            "e.g.", ",", "the", "store"]
         tokens = [t.text for t in self.word_splitter.split_words(sentence)]
         assert tokens == expected_tokens
+
+    def test_batch_tokenization(self):
+        sentences = ["This is a sentence",
+                     "This isn't a sentence.",
+                     "This is the 3rd sentence.",
+                     "Here's the 'fourth' sentence."]
+        batch_split = self.word_splitter.batch_split_words(sentences)
+        separately_split = [self.word_splitter.split_words(sentence) for sentence in sentences]
+        assert len(batch_split) == len(separately_split)
+        for batch_sentence, separate_sentence in zip(batch_split, separately_split):
+            assert len(batch_sentence) == len(separate_sentence)
+            for batch_word, separate_word in zip(batch_sentence, separate_sentence):
+                assert batch_word.text == separate_word.text
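
Also not part of the commit: the commit title mentions batch_tokenize, and a rough sketch of that call path, assuming the WordTokenizer API of this era (batch_tokenize delegates to the splitter's batch_split_words), looks like this:

from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

# WordTokenizer.batch_tokenize hands the raw strings to its word splitter's
# batch_split_words, which is the method patched above.
tokenizer = WordTokenizer(word_splitter=SpacyWordSplitter())
token_lists = tokenizer.batch_tokenize(["This is a sentence", "This isn't a sentence."])
print([[token.text for token in tokens] for tokens in token_lists])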
