Add BigBirdPegasus #10991

Merged

Changes from 1 commit
56 commits
84b8ab3
init bigbird pegasus
thevasudevgupta Mar 30, 2021
5c0307b
add debugging nb ; update config
thevasudevgupta Apr 2, 2021
ee47797
init conversion
thevasudevgupta Apr 2, 2021
677b72a
update conversion script
thevasudevgupta Apr 4, 2021
94861d6
complete conversion script
thevasudevgupta Apr 5, 2021
7529516
init forward()
thevasudevgupta Apr 7, 2021
2ce502b
complete forward()
thevasudevgupta Apr 10, 2021
72807a7
add tokenizer
thevasudevgupta Apr 10, 2021
0f00e43
add some slow tests
thevasudevgupta Apr 12, 2021
3bee313
commit current
thevasudevgupta Apr 13, 2021
2f72512
fix copies
thevasudevgupta Apr 13, 2021
af47e3c
add docs
thevasudevgupta Apr 13, 2021
bbe710f
add conversion script for bigbird-roberta-summarization
thevasudevgupta Apr 13, 2021
7b1ecce
remove TODO
thevasudevgupta Apr 16, 2021
1de6954
small fixups
thevasudevgupta Apr 17, 2021
fb10be0
correct tokenizer
patrickvonplaten Apr 19, 2021
c78bf6f
add bigbird core for now
thevasudevgupta Apr 22, 2021
38c7638
fix config
thevasudevgupta Apr 23, 2021
0b87a70
fix more
thevasudevgupta Apr 30, 2021
c403796
revert pegasus-tokenizer back
thevasudevgupta Apr 30, 2021
30a586a
make style
thevasudevgupta Apr 30, 2021
9618f8b
everything working for pubmed; yayygit status
thevasudevgupta Apr 30, 2021
53d99b5
complete tests finally
thevasudevgupta Apr 30, 2021
a83e681
remove bigbird pegasus tok
patrickvonplaten May 4, 2021
789d170
correct tokenizer
patrickvonplaten May 4, 2021
324cb89
correct tests
patrickvonplaten May 4, 2021
24f7931
add tokenizer files
patrickvonplaten May 4, 2021
94326e1
merge conflicts
patrickvonplaten May 4, 2021
0657b81
finish make style
patrickvonplaten May 4, 2021
0aaddf4
fix test
patrickvonplaten May 4, 2021
0c11023
update
patrickvonplaten May 4, 2021
2b23619
make style
patrickvonplaten May 4, 2021
53d3a45
fix tok utils base file
patrickvonplaten May 4, 2021
2d64dda
make fix-copies
patrickvonplaten May 4, 2021
5157815
clean a bit
thevasudevgupta May 4, 2021
cd2a5b6
small update
thevasudevgupta May 4, 2021
237c118
fix some suggestions
thevasudevgupta May 4, 2021
2d217e3
add to readme
thevasudevgupta May 4, 2021
11d9c2e
fix a bit, clean tests
thevasudevgupta May 5, 2021
b556fc9
fix more tests
thevasudevgupta May 5, 2021
a066db7
Update src/transformers/__init__.py
patrickvonplaten May 6, 2021
9c0b312
Update src/transformers/__init__.py
patrickvonplaten May 6, 2021
4d706f7
make fix-copies
patrickvonplaten May 6, 2021
9a52751
complete attn switching, auto-padding left
thevasudevgupta May 6, 2021
b13e76a
make style
patrickvonplaten May 6, 2021
97e1f71
fix auto-padding test
thevasudevgupta May 6, 2021
0ba203a
make style
thevasudevgupta May 6, 2021
0a85bb2
fix batched attention tests
patrickvonplaten May 6, 2021
89e9d5c
Merge branch 'add_bigbird_pegasus' of https://github.com/vasudevgupta…
patrickvonplaten May 6, 2021
7d2cae9
put tolerance at 1e-1 for stand-alone decoder test
patrickvonplaten May 6, 2021
34c3a63
fix docs
patrickvonplaten May 6, 2021
c229de1
fix tests
patrickvonplaten May 6, 2021
1d8d60d
correct slow tokenizer conversion
patrickvonplaten May 6, 2021
dfb74ea
Apply suggestions from code review
thevasudevgupta May 6, 2021
ca77ad3
complete remaining suggestions
thevasudevgupta May 6, 2021
2eb7563
fix test
patrickvonplaten May 7, 2021
revert pegasus-tokenizer back
thevasudevgupta committed Apr 30, 2021
commit c403796204293bcef9450cc0dd56b038af60365b
17 changes: 4 additions & 13 deletions src/transformers/models/pegasus/tokenization_pegasus.py
@@ -41,10 +41,8 @@
 class PegasusTokenizer(PreTrainedTokenizer):
     r"""
     Construct a PEGASUS tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
-
     This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
-
     Args:
         vocab_file (:obj:`str`):
             `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
@@ -53,9 +51,7 @@ class PegasusTokenizer(PreTrainedTokenizer):
             The token used for padding, for example when batching sequences of different lengths.
         eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
             The end of sequence token.
-
             .. note::
-
                 When building a sequence using special tokens, this is not the token that is used for the end of
                 sequence. The token used is the :obj:`sep_token`.
         unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
@@ -79,6 +75,9 @@ class PegasusTokenizer(PreTrainedTokenizer):
             that uses the tokens 2 - 104 only for pretraining
     """
     vocab_files_names = VOCAB_FILES_NAMES
+
+    offset = 103  # entries 2 - 104 are only used for pretraining
+    vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
     model_input_names = ["input_ids", "attention_mask"]
@@ -92,11 +91,8 @@ def __init__(
         mask_token="<mask_2>",
         mask_token_sent="<mask_1>",
         additional_special_tokens=None,
-        offset=103,
         **kwargs
     ):
-        self.offset = offset  # entries 2 - 104 are only used for pretraining by default
-
         if additional_special_tokens is not None:
             assert isinstance(
                 additional_special_tokens, list
@@ -128,7 +124,6 @@ def __init__(
             pad_token=pad_token,
             mask_token_sent=mask_token_sent,
             additional_special_tokens=additional_special_tokens,
-            offset=self.offset,
             **kwargs,
         )
         self.vocab_file = vocab_file
@@ -228,19 +223,15 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> Lis
         """
         Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
         and adding special tokens. A PEGASUS sequence has the following format, where ``X`` represents the sequence:
-
         - single sequence: ``X </s>``
         - pair of sequences: ``A B </s>`` (not intended use)
-
         BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
         separator.
-
         Args:
             token_ids_0 (:obj:`List[int]`):
                 List of IDs to which the special tokens will be added.
             token_ids_1 (:obj:`List[int]`, `optional`):
                 Optional second list of IDs for sequence pairs.
-
         Returns:
             :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
         """
@@ -260,4 +251,4 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
 
-        return (out_vocab_file,)
\ No newline at end of file
+        return (out_vocab_file,)
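
The net effect of this revert is that `offset` is a fixed class attribute again rather than a configurable `__init__` argument, so it is no longer forwarded to `super().__init__()` or stored per instance. A minimal usage sketch of the resulting behaviour follows; it is an illustration rather than part of this diff, and it assumes a transformers install with sentencepiece plus the public google/pegasus-xsum checkpoint:

# Sketch only: assumes transformers (with sentencepiece) is installed and the
# google/pegasus-xsum checkpoint is reachable; neither is part of this diff.
from transformers import PegasusTokenizer

tok = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

# After this commit, `offset` is a class attribute: vocabulary entries 2 - 104
# are reserved for pretraining, so SentencePiece ids are shifted by 103.
assert PegasusTokenizer.offset == 103

# As the build_inputs_with_special_tokens docstring states, a single sequence
# is encoded as ``X </s>``: EOS is appended and no BOS token is ever added.
ids = tok("hello world")["input_ids"]
assert ids[-1] == tok.eos_token_id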