
[Patch-t5-tokenizer] Patches the changes on T5 to make sure previous behaviour is still valid for beginning of words #24622

Merged
59 commits merged on Jul 11, 2023

Changes from all commits (59 commits)
e59ec38
patch `_tokenize` function
ArthurZucker Jul 2, 2023
e31ac3a
more tests
ArthurZucker Jul 2, 2023
0384cd8
properly fix
ArthurZucker Jul 2, 2023
d360308
fixup
ArthurZucker Jul 2, 2023
5cc0c9d
Update src/transformers/models/t5/tokenization_t5.py
ArthurZucker Jul 3, 2023
adfd99d
Merge branch 'main' of https://github.com/huggingface/transformers in…
ArthurZucker Jul 3, 2023
7df5038
pouMerge branch 'patch-t5-tokenizer' of https://github.com/ArthurZuck…
ArthurZucker Jul 3, 2023
57995f9
fix without ifs
ArthurZucker Jul 4, 2023
aa9455f
update
ArthurZucker Jul 4, 2023
b26d29e
protect import
ArthurZucker Jul 4, 2023
6ed0617
add python processing
ArthurZucker Jul 4, 2023
b558acd
is first needed
ArthurZucker Jul 4, 2023
df58420
add doc and update with lefacy
ArthurZucker Jul 4, 2023
c20b6ba
updaate
ArthurZucker Jul 4, 2023
b7a7776
fix T5 SPM converter
ArthurZucker Jul 4, 2023
347236e
styling
ArthurZucker Jul 4, 2023
85c5894
fix T5 warning
ArthurZucker Jul 4, 2023
4845533
add is_seqio_available
ArthurZucker Jul 4, 2023
6ac18bc
remove is_first
ArthurZucker Jul 4, 2023
26ff9f1
revert some changes
ArthurZucker Jul 4, 2023
ffbc8a6
more tests and update
ArthurZucker Jul 4, 2023
3bdc83e
update llama test batterie
ArthurZucker Jul 4, 2023
5edfb39
fixup
ArthurZucker Jul 4, 2023
a55dabf
refactor T5 spm common tests
ArthurZucker Jul 4, 2023
de889cb
draft the llama tests
ArthurZucker Jul 4, 2023
aaf8610
update
ArthurZucker Jul 5, 2023
bfcb85b
uopdate test
ArthurZucker Jul 5, 2023
181981c
nits
ArthurZucker Jul 5, 2023
c75104b
refine
ArthurZucker Jul 6, 2023
c745713
name nit
ArthurZucker Jul 6, 2023
7c71391
fix t5 tests
ArthurZucker Jul 6, 2023
1578640
fix T5
ArthurZucker Jul 6, 2023
38892b6
update
ArthurZucker Jul 6, 2023
7aaaec1
revert convert slow to fast changes that fail lots of tests
ArthurZucker Jul 6, 2023
c4d71df
legacy support
ArthurZucker Jul 6, 2023
0c5e48f
fixup
ArthurZucker Jul 6, 2023
77c9191
nits is first not defined
ArthurZucker Jul 6, 2023
09f351e
don't use legacy behaviour for switch transformers
ArthurZucker Jul 6, 2023
0609ee7
style
ArthurZucker Jul 6, 2023
10fa693
My attempt to check.
Narsil Jul 6, 2023
1158298
nits
ArthurZucker Jul 7, 2023
557db8e
fixes
ArthurZucker Jul 7, 2023
4761837
update
ArthurZucker Jul 7, 2023
cce91ee
Merge branch 'main' into patch-t5-tokenizer
ArthurZucker Jul 7, 2023
dad0c66
fixup
ArthurZucker Jul 7, 2023
4217843
Apply suggestions from code review
ArthurZucker Jul 10, 2023
62a6eb5
updates
ArthurZucker Jul 10, 2023
1c209a7
fixup
ArthurZucker Jul 10, 2023
d540e2e
add legacy warning
ArthurZucker Jul 11, 2023
5d17a46
fixup
ArthurZucker Jul 11, 2023
ea057ca
warning_once nit
ArthurZucker Jul 11, 2023
071b87a
update t5 documentation test
ArthurZucker Jul 11, 2023
2fdcf09
update llama tok documentation
ArthurZucker Jul 11, 2023
3cd1f29
add space to warning
ArthurZucker Jul 11, 2023
ae31daa
nits
ArthurZucker Jul 11, 2023
9510908
nit
ArthurZucker Jul 11, 2023
23ab157
Apply suggestions from code review
ArthurZucker Jul 11, 2023
898c057
last nits
ArthurZucker Jul 11, 2023
d6170ba
Merge branch 'patch-t5-tokenizer' of https://github.com/ArthurZucker/…
ArthurZucker Jul 11, 2023
23 changes: 18 additions & 5 deletions src/transformers/convert_slow_tokenizer.py
@@ -22,10 +22,22 @@
import warnings
from typing import Dict, List, Tuple

from packaging import version
from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
from tokenizers.models import BPE, Unigram, WordPiece

from .utils import requires_backends
from .utils import is_protobuf_available, requires_backends


def import_protobuf():
if is_protobuf_available():
import google.protobuf

if version.parse(google.protobuf.__version__) < version.parse("4.0.0"):
from transformers.utils import sentencepiece_model_pb2
else:
from transformers.utils import sentencepiece_model_pb2_new as sentencepiece_model_pb2
return sentencepiece_model_pb2


class SentencePieceExtractor:
@@ -445,7 +457,8 @@ def __init__(self, *args):

super().__init__(*args)

from .utils import sentencepiece_model_pb2 as model_pb2
# from .utils import sentencepiece_model_pb2 as model_pb2
model_pb2 = import_protobuf()

m = model_pb2.ModelProto()
with open(self.original_tokenizer.vocab_file, "rb") as f:
@@ -1146,9 +1159,9 @@ def tokenizer(self, proto):
)
tokenizer.add_special_tokens(
[
AddedToken("<unk>", normalized=False),
AddedToken("<s>", normalized=False),
AddedToken("</s>", normalized=False),
AddedToken("<unk>"),
AddedToken("<s>"),
AddedToken("</s>"),
]
)
else:
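For context, a minimal usage sketch of the new `import_protobuf` helper (not part of the diff; the `spiece.model` path is a placeholder for any SentencePiece model file):

from transformers.convert_slow_tokenizer import import_protobuf

# import_protobuf returns the protobuf-generated module matching the installed
# protobuf major version, so callers never pick the wrong sentencepiece_model_pb2.
model_pb2 = import_protobuf()

m = model_pb2.ModelProto()
with open("spiece.model", "rb") as f:  # placeholder path
    m.ParseFromString(f.read())
print(len(m.pieces))  # number of pieces in the SentencePiece vocabulary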
1 change: 1 addition & 0 deletions src/transformers/file_utils.py
@@ -101,6 +101,7 @@
is_sagemaker_mp_enabled,
is_scipy_available,
is_sentencepiece_available,
is_seqio_available,
is_sklearn_available,
is_soundfile_availble,
is_spacy_available,
62 changes: 60 additions & 2 deletions src/transformers/models/llama/tokenization_llama.py
@@ -44,6 +44,7 @@
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"hf-internal-testing/llama-tokenizer": 2048,
}
SPIECE_UNDERLINE = "▁"


class LlamaTokenizer(PreTrainedTokenizer):
@@ -53,6 +54,29 @@ class LlamaTokenizer(PreTrainedTokenizer):
Args:
vocab_file (`str`):
Path to the vocabulary file.
legacy (`bool`, *optional*, defaults to `True`):
Whether or not the `legacy` behaviour of the tokenizer should be used. Legacy is before the merge of #24622
which includes fixes to properly handle tokens that appear after special tokens. A simple example:
- `legacy=True`:
```python
>>> from transformers import T5Tokenizer
>>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=True)
>>> tokenizer.encode("Hello <extra_id_0>.")
[8774, 32099, 3, 5, 1]
```
- `legacy=False`:
```python
>>> from transformers import T5Tokenizer
>>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
>>> tokenizer.encode("Hello <extra_id_0>.") # the extra space `[3]` is no longer here
[8774, 32099, 5, 1]
```
Check out the pull request and the issue [here](https://github.com/huggingface/transformers/pull/24565) for
more details.
"""

vocab_files_names = VOCAB_FILES_NAMES
@@ -71,6 +95,7 @@ def __init__(
add_bos_token=True,
add_eos_token=False,
clean_up_tokenization_spaces=False,
legacy=True,
**kwargs,
):
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
@@ -87,8 +112,15 @@
add_eos_token=add_eos_token,
sp_model_kwargs=self.sp_model_kwargs,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
legacy=legacy,
**kwargs,
)
if legacy:
logger.warning_once(
f"You are using the legacy behaviour of the {self.__class__}. This means that tokens that come after special tokens will not be properly handled. We recommend you to"
" read the related pull request available at https://github.com/huggingface/transformers/pull/24565"
)
self.legacy = legacy
self.vocab_file = vocab_file
self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token
@@ -117,9 +149,35 @@ def get_vocab(self):
vocab.update(self.added_tokens_encoder)
return vocab

# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
def tokenize(self, text, **kwargs) -> List[str]:
# Replace the SPIECE_UNDERLINE with a space to make sure SPIECE_UNDERLINE is only used at
# the beginning of the text
if not self.legacy:
text = SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " ")
return super().tokenize(text, **kwargs)

# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
def _tokenize(self, text):
"""Returns a tokenized string."""
return self.sp_model.encode(text, out_type=str)
"""
Returns a tokenized string.
Since the sentencepiece internal model always adds a SPIECE_UNDERLINE at the beginning of the provided text,
we need to remove it by hand when the current text is a subsequence. This happens whenever the `self.tokenize`
function is called with special tokens: the input is split on the special tokens, and each subsequence is
passed to `_tokenize`. Thus if a subsequence did not start with a `" "` or SPIECE_UNDERLINE, we have to remove
the extra `SPIECE_UNDERLINE` prepended.
"""
if not self.legacy:
is_first = text.startswith(SPIECE_UNDERLINE)
if is_first:
text = text[1:]

tokens = self.sp_model.encode(text, out_type=str)

if not self.legacy and not is_first and not text.startswith(" ") and tokens[0].startswith(SPIECE_UNDERLINE):
tokens = ([tokens[0][1:]] if len(tokens[0]) > 1 else []) + tokens[1:]
return tokens

def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
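To illustrate what the non-legacy `_tokenize` post-processing above is doing, here is a small self-contained sketch (the function name and example outputs are illustrative, not part of the patch):

SPIECE_UNDERLINE = "▁"

def strip_dummy_prefix(tokens, chunk_started_with_space):
    # sentencepiece always prepends a dummy "▁" to whatever it encodes. When the
    # chunk is an inner subsequence (it did not really start with a space), that
    # extra prefix has to be removed again, as in the patched `_tokenize`.
    if chunk_started_with_space or not tokens or not tokens[0].startswith(SPIECE_UNDERLINE):
        return tokens
    first = tokens[0][1:]
    return ([first] if first else []) + tokens[1:]

# Hypothetical sp_model.encode(".") output for the chunk that follows "<extra_id_0>":
print(strip_dummy_prefix(["▁", "."], chunk_started_with_space=False))  # ['.']
print(strip_dummy_prefix(["▁Hello"], chunk_started_with_space=True))   # ['▁Hello']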
58 changes: 52 additions & 6 deletions src/transformers/models/t5/tokenization_t5.py
@@ -106,6 +106,28 @@ class T5Tokenizer(PreTrainedTokenizer):
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
legacy (`bool`, *optional*, defaults to `True`):
Whether or not the `legacy` behaviour of the tokenizer should be used. Legacy is before the merge of #24622
which includes fixes to properly handle tokens that appear after special tokens. A simple example:
- `legacy=True`:
```python
>>> from transformers import T5Tokenizer
>>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=True)
>>> tokenizer.encode("Hello <extra_id_0>.")
[8774, 32099, 3, 5, 1]
```
- `legacy=False`:
```python
>>> from transformers import T5Tokenizer
>>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
>>> tokenizer.encode("Hello <extra_id_0>.") # the extra space `[3]` is no longer here
[8774, 32099, 5, 1]
```
Check out the pull request and the issue [here](https://github.com/huggingface/transformers/pull/24565) for
more details.
Attributes:
sp_model (`SentencePieceProcessor`):
@@ -126,6 +148,7 @@ def __init__(
extra_ids=100,
additional_special_tokens=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
legacy=True,
**kwargs,
) -> None:
# Add extra_ids to the special token list
@@ -140,7 +163,13 @@
" provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids"
" tokens"
)
if legacy:
logger.warning_once(
f"You are using the legacy behaviour of the {self.__class__}. This means that tokens that come after special tokens will not be properly handled. We recommend you to"
" read the related pull request available at https://github.com/huggingface/transformers/pull/24565"
)

self.legacy = legacy
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

super().__init__(
@@ -150,6 +179,7 @@
extra_ids=extra_ids,
additional_special_tokens=additional_special_tokens,
sp_model_kwargs=self.sp_model_kwargs,
legacy=legacy,
**kwargs,
)

@@ -301,15 +331,31 @@ def __setstate__(self, d):
self.sp_model.Load(self.vocab_file)

def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
if not text.startswith(" "):
text = " " + text
# Replace the SPIECE_UNDERLINE with a space to make sure SPIECE_UNDERLINE is only used at
# the beginning of the text
if not self.legacy:
text = SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " ")
return super().tokenize(text, **kwargs)

def _tokenize(self, text: str) -> List[str]:
"""Take as input a string and return a list of strings (tokens) for words/sub-words"""
def _tokenize(self, text, **kwargs):
"""
Returns a tokenized string.
Since the sentencepiece internal model always adds a SPIECE_UNDERLINE at the beginning of the provided text,
we need to remove it by hand when the current text is a subsequence. This happens whenever the `self.tokenize`
function is called with special tokens: the input is split on the special tokens, and each subsequence is
passed to `_tokenize`. Thus if a subsequence did not start with a `" "` or SPIECE_UNDERLINE, we have to remove
the extra `SPIECE_UNDERLINE` prepended.
"""
if not self.legacy:
is_first = text.startswith(SPIECE_UNDERLINE)
if is_first:
text = text[1:]

tokens = self.sp_model.encode(text, out_type=str)
if not text.startswith(" ") and tokens[0] == SPIECE_UNDERLINE:
tokens = tokens[1:]

if not self.legacy and not is_first and not text.startswith(" ") and tokens[0].startswith(SPIECE_UNDERLINE):
tokens = ([tokens[0][1:]] if len(tokens[0]) > 1 else []) + tokens[1:]
return tokens

def _convert_token_to_id(self, token):
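A quick sanity check of the behaviour difference documented in the docstring above, as a sketch (it assumes the `t5-base` checkpoint can be downloaded; the expected IDs are the ones from the docstring examples):

from transformers import T5Tokenizer

tok_legacy = T5Tokenizer.from_pretrained("t5-base", legacy=True)
tok_fixed = T5Tokenizer.from_pretrained("t5-base", legacy=False)

# Legacy inserts an extra "▁" (id 3) after the special token; the fix removes it.
assert tok_legacy.encode("Hello <extra_id_0>.") == [8774, 32099, 3, 5, 1]
assert tok_fixed.encode("Hello <extra_id_0>.") == [8774, 32099, 5, 1]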
8 changes: 8 additions & 0 deletions src/transformers/testing_utils.py
@@ -77,6 +77,7 @@
is_safetensors_available,
is_scipy_available,
is_sentencepiece_available,
is_seqio_available,
is_soundfile_availble,
is_spacy_available,
is_sudachi_available,
@@ -442,6 +443,13 @@ def require_sentencepiece(test_case):
return unittest.skipUnless(is_sentencepiece_available(), "test requires SentencePiece")(test_case)


def require_seqio(test_case):
"""
Decorator marking a test that requires seqio. These tests are skipped when seqio isn't installed.
"""
return unittest.skipUnless(is_seqio_available(), "test requires Seqio")(test_case)


def require_scipy(test_case):
"""
Decorator marking a test that requires Scipy. These tests are skipped when SentencePiece isn't installed.
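A hypothetical test showing how the new `require_seqio` decorator is meant to be used (the class and method names are made up for this sketch):

import unittest

from transformers.testing_utils import require_seqio

class SeqioParityTest(unittest.TestCase):  # hypothetical test case
    @require_seqio
    def test_needs_seqio(self):
        # Skipped automatically by unittest when seqio is not installed.
        import seqio  # noqa: F401
        self.assertTrue(True)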
10 changes: 1 addition & 9 deletions src/transformers/utils/__init__.py
@@ -142,6 +142,7 @@
is_sagemaker_mp_enabled,
is_scipy_available,
is_sentencepiece_available,
is_seqio_available,
is_sklearn_available,
is_soundfile_availble,
is_spacy_available,
@@ -177,15 +178,6 @@
)


if is_protobuf_available():
Collaborator: Why is this deleted?

ArthurZucker (author): Because it's messing up the import. By deleting it here, we can at least run the test individually.

ydshieh (Collaborator), Jul 11, 2023: I am not sure what's messed up. With the fix in #24689, everything should be fine.

Also, moving this to another place might count as a breaking change, as

from transformers.utils import sentencepiece_model_pb2

won't work anymore (despite the direct such usage might be low). See here.

Could we bring it back quickly, and you open an issue about the import error you got? Thanks!

ArthurZucker (author): Having a look.

ArthurZucker (author): Also `from transformers.utils import sentencepiece_model_pb2` works for me.

ArthurZucker (author): Comes from #24690.

ArthurZucker (author): The idea is that we only import it once, where we actually use it.

ArthurZucker (author), Jul 11, 2023: This does not fix the version error, but it fixes the issue with protobuf 3.20.3, where we cannot use seqio or anything else that imports protobuf:

! pip install protobuf==3.20.3
from transformers import AutoTokenizer
from seqio import SentencePieceVocabulary

In [3]: from seqio import SentencePieceVocabulary
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[3], line 1
----> 1 from seqio import SentencePieceVocabulary

File /opt/conda/envs/py39/lib/python3.9/site-packages/seqio/__init__.py:18
     15 """Import to top-level API."""
     16 # pylint:disable=wildcard-import,g-bad-import-order
---> 18 from seqio.dataset_providers import *
     19 from seqio import evaluation
     20 from seqio import experimental

File /opt/conda/envs/py39/lib/python3.9/site-packages/seqio/dataset_providers.py:39
     37 from packaging import version as version_lib
     38 import pyglove as pg
---> 39 from seqio import metrics as metrics_lib
     40 from seqio import preprocessors as seqio_preprocessors
     41 from seqio import task_registry_provenance_tracking

File /opt/conda/envs/py39/lib/python3.9/site-packages/seqio/metrics.py:27
     25 import jax.numpy as jnp
     26 import numpy as np
---> 27 from seqio import utils
     28 import tensorflow.compat.v2 as tf
     31 @dataclasses.dataclass
     32 class MetricValue:

File /opt/conda/envs/py39/lib/python3.9/site-packages/seqio/utils.py:29
     27 from absl import logging
     28 import numpy as np
---> 29 from seqio.vocabularies import Vocabulary
     30 import tensorflow.compat.v2 as tf
     31 import tensorflow_datasets as tfds

File /opt/conda/envs/py39/lib/python3.9/site-packages/seqio/vocabularies.py:27
     24 import tensorflow.compat.v2 as tf
     25 import tensorflow_text as tf_text
---> 27 from sentencepiece import sentencepiece_model_pb2
     28 import sentencepiece as sentencepiece_processor
     30 PAD_ID = 0

File /opt/conda/envs/py39/lib/python3.9/site-packages/sentencepiece/sentencepiece_model_pb2.py:16
      9 # @@protoc_insertion_point(imports)
     11 _sym_db = _symbol_database.Default()
---> 16 DESCRIPTOR = _descriptor.FileDescriptor(
     17   name='sentencepiece_model.proto',
     18   package='sentencepiece',
     19   syntax='proto2',
     20   serialized_options=b'H\003',
     21   create_key=_descriptor._internal_create_key,
     22   serialized_pb=b'\n\x19sentencepiece_model.proto\x12\rsentencepiece\"\x80\x0c\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12*\n\x1b\x65nable_differential_privacy\x18\x32 \x01(\x08:\x05\x66\x61lse\x12+\n differential_privacy_noise_level\x18\x33 \x01(\x02:\x01\x30\x12\x32\n\'differential_privacy_clipping_threshold\x18\x34 \x01(\x04:\x01\x30\x12\"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12\"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12+\n\x1c\x61llow_whitespace_only_pieces\x18\x1a \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12#\n\x19pretokenization_delimiter\x18\x35 \x01(\t:\x00\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18  \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18\" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05<unk>\x12\x16\n\tbos_piece\x18. 
\x01(\t:\x03<s>\x12\x17\n\teos_piece\x18/ \x01(\t:\x04</s>\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05<pad>\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse\"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32\".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL\"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03'
     23 )
     27 _TRAINERSPEC_MODELTYPE = _descriptor.EnumDescriptor(
     28   name='ModelType',
     29   full_name='sentencepiece.TrainerSpec.ModelType',
   (...)
     58   serialized_end=1570,
     59 )
     60 _sym_db.RegisterEnumDescriptor(_TRAINERSPEC_MODELTYPE)

File /opt/conda/envs/py39/lib/python3.9/site-packages/google/protobuf/descriptor.py:1024, in FileDescriptor.__new__(cls, name, package, options, serialized_options, serialized_pb, dependencies, public_dependencies, syntax, pool, create_key)
   1022     raise RuntimeError('Please link in cpp generated lib for %s' % (name))
   1023 elif serialized_pb:
-> 1024   return _message.default_pool.AddSerializedFile(serialized_pb)
   1025 else:
   1026   return super(FileDescriptor, cls).__new__(cls)

TypeError: Couldn't build proto file into descriptor pool!
Invalid proto descriptor for file "sentencepiece_model.proto":
  sentencepiece_model.proto: A file with this name is already in the pool.

Collaborator: Hi, thanks for the information. Could you provide the full trace 🙏. It might be useful.

ArthurZucker (author): Just added it!

import google.protobuf

if version.parse(google.protobuf.__version__) < version.parse("4.0.0"):
from . import sentencepiece_model_pb2
else:
from . import sentencepiece_model_pb2_new as sentencepiece_model_pb2


WEIGHTS_NAME = "pytorch_model.bin"
WEIGHTS_INDEX_NAME = "pytorch_model.bin.index.json"
ADAPTER_CONFIG_NAME = "adapter_config.json"
5 changes: 5 additions & 0 deletions src/transformers/utils/import_utils.py
@@ -112,6 +112,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[
_safetensors_available = _is_package_available("safetensors")
_scipy_available = _is_package_available("scipy")
_sentencepiece_available = _is_package_available("sentencepiece")
_is_seqio_available = _is_package_available("seqio")
_sklearn_available = importlib.util.find_spec("sklearn") is not None
if _sklearn_available:
try:
@@ -507,6 +508,10 @@ def is_sentencepiece_available():
return _sentencepiece_available


def is_seqio_available():
return _is_seqio_available


def is_protobuf_available():
if importlib.util.find_spec("google") is None:
return False
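A minimal sketch of guarding an optional seqio import with the new availability check (mirroring how the other `is_*_available` helpers are used; not part of the diff):

from transformers.utils import is_seqio_available

if is_seqio_available():
    # Only import seqio when it is actually installed.
    from seqio import SentencePieceVocabulary
else:
    SentencePieceVocabulary = None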