This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Add custom_dummy_tokens to PretrainedTransformerTokenizer #5608

Merged · 7 commits · Apr 8, 2022

2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

- Added `custom_dummy_tokens` argument to `PretrainedTransformerTokenizer`.

## [v2.9.2](https://github.com/allenai/allennlp/releases/tag/v2.9.2) - 2022-03-21

### Fixed
19 changes: 13 additions & 6 deletions allennlp/data/tokenizers/pretrained_transformer_tokenizer.py
@@ -45,6 +45,8 @@ class PretrainedTransformerTokenizer(Tokenizer):
Dictionary with
[additional arguments](https://github.com/huggingface/transformers/blob/155c782a2ccd103cf63ad48a2becd7c76a7d2115/transformers/tokenization_utils.py#L691)
for `AutoTokenizer.from_pretrained`.
custom_dummy_tokens: `Tuple[str, str]`, optional (default = `None`)
A pair of tokens with different token IDs, used to reverse-engineer the tokenizer's special tokens.
""" # noqa: E501

def __init__(
@@ -53,6 +55,7 @@
add_special_tokens: bool = True,
max_length: Optional[int] = None,
tokenizer_kwargs: Optional[Dict[str, Any]] = None,
custom_dummy_tokens: Optional[Tuple[str, str]] = None,
) -> None:
if tokenizer_kwargs is None:
tokenizer_kwargs = {}
@@ -75,12 +78,16 @@

self._tokenizer_lowercases = self.tokenizer_lowercases(self.tokenizer)

try:
self._reverse_engineer_special_tokens("a", "b", model_name, tokenizer_kwargs)
except AssertionError:
# For most transformer models, "a" and "b" work just fine as dummy tokens. For a few,
# they don't, and so we use "1" and "2" instead.
self._reverse_engineer_special_tokens("1", "2", model_name, tokenizer_kwargs)
if custom_dummy_tokens is None:
try:
self._reverse_engineer_special_tokens("a", "b", model_name, tokenizer_kwargs)
except AssertionError:
# For most transformer models, "a" and "b" work just fine as dummy tokens. For a few,
# they don't, and so we use "1" and "2" instead.
self._reverse_engineer_special_tokens("1", "2", model_name, tokenizer_kwargs)
else:
token_a, token_b = custom_dummy_tokens
self._reverse_engineer_special_tokens(token_a, token_b, model_name, tokenizer_kwargs)

def _reverse_engineer_special_tokens(
self,
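For context, a minimal usage sketch of the new argument, mirroring the test added below. It assumes an allennlp build that includes this change; `roberta-base` is used purely as an example model:

```python
from allennlp.data.tokenizers import PretrainedTransformerTokenizer

# Override the default dummy tokens ("a"/"b", falling back to "1"/"2")
# that the tokenizer uses to reverse-engineer its special tokens.
tokenizer = PretrainedTransformerTokenizer(
    "roberta-base",
    custom_dummy_tokens=("cat", "dog"),
)
tokens = tokenizer.tokenize("AllenNLP is great")
```
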
19 changes: 19 additions & 0 deletions tests/data/tokenizers/pretrained_transformer_tokenizer_test.py
@@ -1,5 +1,7 @@
from typing import Iterable, List

import pytest

from allennlp.common import Params
from allennlp.common.testing import AllenNlpTestCase
from allennlp.data import Token
@@ -341,3 +343,20 @@ def test_to_params(self):
"max_length": None,
"tokenizer_kwargs": {"max_len": 10, "use_fast": True},
}

def test_initialize_tokenizer_with_custom_dummy_tokens(self):
model_name = "roberta-base"
PretrainedTransformerTokenizer(
model_name,
custom_dummy_tokens=("cat", "dog"),
)
with pytest.raises(AssertionError):
PretrainedTransformerTokenizer(
model_name,
custom_dummy_tokens=("unknowntoken", "dog"),
)
with pytest.raises(AssertionError):
PretrainedTransformerTokenizer(
model_name,
custom_dummy_tokens=("cat", "cat"),
)
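As an aside on picking the dummy pair: the two tokens need distinct token IDs (per the docstring above), and a string that the model splits into several subword pieces can also trip the assertions, as the `unknowntoken` case in the test shows. A quick way to inspect candidates with the underlying Hugging Face tokenizer, as a sketch assuming `transformers` is installed:

```python
from transformers import AutoTokenizer

# Inspect how candidate dummy tokens are split by the underlying
# Hugging Face tokenizer for roberta-base.
tok = AutoTokenizer.from_pretrained("roberta-base")

# "cat" and "dog" are distinct, ordinary vocabulary words, so they make
# a workable dummy pair in the test above.
print(tok.tokenize("cat"), tok.tokenize("dog"))

# "unknowntoken" likely splits into several subword pieces, which is the
# kind of input that triggers the AssertionError exercised above.
print(tok.tokenize("unknowntoken"))
```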