
[Patch-t5-tokenizer] Patches the changes on T5 to make sure previous behaviour is still valid for beginning of words #24622

Merged
59 commits merged on Jul 11, 2023

Changes from all commits (59 commits)
e59ec38
patch `_tokenize` function
ArthurZucker Jul 2, 2023
e31ac3a
more tests
ArthurZucker Jul 2, 2023
0384cd8
properly fix
ArthurZucker Jul 2, 2023
d360308
fixup
ArthurZucker Jul 2, 2023
5cc0c9d
Update src/transformers/models/t5/tokenization_t5.py
ArthurZucker Jul 3, 2023
adfd99d
Merge branch 'main' of https://github.com/huggingface/transformers in…
ArthurZucker Jul 3, 2023
7df5038
pouMerge branch 'patch-t5-tokenizer' of https://github.com/ArthurZuck…
ArthurZucker Jul 3, 2023
57995f9
fix without ifs
ArthurZucker Jul 4, 2023
aa9455f
update
ArthurZucker Jul 4, 2023
b26d29e
protect import
ArthurZucker Jul 4, 2023
6ed0617
add python processing
ArthurZucker Jul 4, 2023
b558acd
is first needed
ArthurZucker Jul 4, 2023
df58420
add doc and update with lefacy
ArthurZucker Jul 4, 2023
c20b6ba
updaate
ArthurZucker Jul 4, 2023
b7a7776
fix T5 SPM converter
ArthurZucker Jul 4, 2023
347236e
styling
ArthurZucker Jul 4, 2023
85c5894
fix T5 warning
ArthurZucker Jul 4, 2023
4845533
add is_seqio_available
ArthurZucker Jul 4, 2023
6ac18bc
remove is_first
ArthurZucker Jul 4, 2023
26ff9f1
revert some changes
ArthurZucker Jul 4, 2023
ffbc8a6
more tests and update
ArthurZucker Jul 4, 2023
3bdc83e
update llama test batterie
ArthurZucker Jul 4, 2023
5edfb39
fixup
ArthurZucker Jul 4, 2023
a55dabf
refactor T5 spm common tests
ArthurZucker Jul 4, 2023
de889cb
draft the llama tests
ArthurZucker Jul 4, 2023
aaf8610
update
ArthurZucker Jul 5, 2023
bfcb85b
uopdate test
ArthurZucker Jul 5, 2023
181981c
nits
ArthurZucker Jul 5, 2023
c75104b
refine
ArthurZucker Jul 6, 2023
c745713
name nit
ArthurZucker Jul 6, 2023
7c71391
fix t5 tests
ArthurZucker Jul 6, 2023
1578640
fix T5
ArthurZucker Jul 6, 2023
38892b6
update
ArthurZucker Jul 6, 2023
7aaaec1
revert convert slow to fast changes that fail lots of tests
ArthurZucker Jul 6, 2023
c4d71df
legacy support
ArthurZucker Jul 6, 2023
0c5e48f
fixup
ArthurZucker Jul 6, 2023
77c9191
nits is first not defined
ArthurZucker Jul 6, 2023
09f351e
don't use legacy behaviour for switch transformers
ArthurZucker Jul 6, 2023
0609ee7
style
ArthurZucker Jul 6, 2023
10fa693
My attempt to check.
Narsil Jul 6, 2023
1158298
nits
ArthurZucker Jul 7, 2023
557db8e
fixes
ArthurZucker Jul 7, 2023
4761837
update
ArthurZucker Jul 7, 2023
cce91ee
Merge branch 'main' into patch-t5-tokenizer
ArthurZucker Jul 7, 2023
dad0c66
fixup
ArthurZucker Jul 7, 2023
4217843
Apply suggestions from code review
ArthurZucker Jul 10, 2023
62a6eb5
updates
ArthurZucker Jul 10, 2023
1c209a7
fixup
ArthurZucker Jul 10, 2023
d540e2e
add legacy warning
ArthurZucker Jul 11, 2023
5d17a46
fixup
ArthurZucker Jul 11, 2023
ea057ca
warning_once nit
ArthurZucker Jul 11, 2023
071b87a
update t5 documentation test
ArthurZucker Jul 11, 2023
2fdcf09
update llama tok documentation
ArthurZucker Jul 11, 2023
3cd1f29
add space to warning
ArthurZucker Jul 11, 2023
ae31daa
nits
ArthurZucker Jul 11, 2023
9510908
nit
ArthurZucker Jul 11, 2023
23ab157
Apply suggestions from code review
ArthurZucker Jul 11, 2023
898c057
last nits
ArthurZucker Jul 11, 2023
d6170ba
Merge branch 'patch-t5-tokenizer' of https://github.com/ArthurZucker/…
ArthurZucker Jul 11, 2023
23 changes: 18 additions & 5 deletions src/transformers/convert_slow_tokenizer.py
@@ -22,10 +22,22 @@
import warnings
from typing import Dict, List, Tuple

from packaging import version
from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
from tokenizers.models import BPE, Unigram, WordPiece

from .utils import requires_backends
from .utils import is_protobuf_available, requires_backends


def import_protobuf():
if is_protobuf_available():
import google.protobuf

if version.parse(google.protobuf.__version__) < version.parse("4.0.0"):
from transformers.utils import sentencepiece_model_pb2
else:
from transformers.utils import sentencepiece_model_pb2_new as sentencepiece_model_pb2
return sentencepiece_model_pb2


class SentencePieceExtractor:
@@ -445,7 +457,8 @@ def __init__(self, *args):

super().__init__(*args)

from .utils import sentencepiece_model_pb2 as model_pb2
# from .utils import sentencepiece_model_pb2 as model_pb2
model_pb2 = import_protobuf()

m = model_pb2.ModelProto()
with open(self.original_tokenizer.vocab_file, "rb") as f:
@@ -1146,9 +1159,9 @@ def tokenizer(self, proto):
)
tokenizer.add_special_tokens(
[
AddedToken("<unk>", normalized=False),
AddedToken("<s>", normalized=False),
AddedToken("</s>", normalized=False),
AddedToken("<unk>"),
AddedToken("<s>"),
AddedToken("</s>"),
]
)
else:
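For context, a minimal usage sketch of the new `import_protobuf` helper (not part of the diff; the `spiece.model` path is a placeholder for any SentencePiece model file):

from transformers.convert_slow_tokenizer import import_protobuf

# import_protobuf returns the protobuf-generated module matching the installed
# protobuf major version, so callers never pick the wrong sentencepiece_model_pb2.
model_pb2 = import_protobuf()

m = model_pb2.ModelProto()
with open("spiece.model", "rb") as f:  # placeholder path
    m.ParseFromString(f.read())
print(len(m.pieces))  # number of pieces in the SentencePiece vocabulary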
1 change: 1 addition & 0 deletions src/transformers/file_utils.py
@@ -101,6 +101,7 @@
is_sagemaker_mp_enabled,
is_scipy_available,
is_sentencepiece_available,
is_seqio_available,
is_sklearn_available,
is_soundfile_availble,
is_spacy_available,
62 changes: 60 additions & 2 deletions src/transformers/models/llama/tokenization_llama.py
@@ -44,6 +44,7 @@
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"hf-internal-testing/llama-tokenizer": 2048,
}
SPIECE_UNDERLINE = "▁"


class LlamaTokenizer(PreTrainedTokenizer):
@@ -53,6 +54,29 @@ class LlamaTokenizer(PreTrainedTokenizer):
Args:
vocab_file (`str`):
Path to the vocabulary file.
legacy (`bool`, *optional*, defaults to `True`):
Whether or not the `legacy` behaviour of the tokenizer should be used. Legacy is before the merge of #24622
which includes fixes to properly handle tokens that appear after special tokens. A simple example:
- `legacy=True`:
```python
>>> from transformers import T5Tokenizer
>>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=True)
>>> tokenizer.encode("Hello <extra_id_0>.")
[8774, 32099, 3, 5, 1]
```
- `legacy=False`:
```python
>>> from transformers import T5Tokenizer
>>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
>>> tokenizer.encode("Hello <extra_id_0>.") # the extra space `[3]` is no longer here
[8774, 32099, 5, 1]
```
Check out the pull request and the issue [here](https://github.com/huggingface/transformers/pull/24565) for
more details.
"""

vocab_files_names = VOCAB_FILES_NAMES
@@ -71,6 +95,7 @@ def __init__(
add_bos_token=True,
add_eos_token=False,
clean_up_tokenization_spaces=False,
legacy=True,
**kwargs,
):
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
@@ -87,8 +112,15 @@
add_eos_token=add_eos_token,
sp_model_kwargs=self.sp_model_kwargs,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
legacy=legacy,
**kwargs,
)
if legacy:
logger.warning_once(
f"You are using the legacy behaviour of the {self.__class__}. This means that tokens that come after special tokens will not be properly handled. We recommend you to"
" read the related pull request available at https://github.com/huggingface/transformers/pull/24565"
)
self.legacy = legacy
self.vocab_file = vocab_file
self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token
@@ -117,9 +149,35 @@ def get_vocab(self):
vocab.update(self.added_tokens_encoder)
return vocab

# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
def tokenize(self, text, **kwargs) -> List[str]:
# Replace the SPIECE_UNDERLINE with a space to make sure SPIECE_UNDERLINE is only used at
# the beginning of the text
if not self.legacy:
text = SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " ")
return super().tokenize(text, **kwargs)

# Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
def _tokenize(self, text):
"""Returns a tokenized string."""
return self.sp_model.encode(text, out_type=str)
"""
Returns a tokenized string.
Since the sentencepiece internal model always adds a SPIECE_UNDERLINE at the beginning of the provided text,
we need to remove it by hand when the current text is a subsequence. This happens whenever the `self.tokenize`
function is called with special tokens: the input is split on the special tokens, and each subsequence is
passed to `_tokenize`. Thus if a subsequence did not start with a `" "` or SPIECE_UNDERLINE, we have to remove
the extra `SPIECE_UNDERLINE` prepended.
"""
if not self.legacy:
is_first = text.startswith(SPIECE_UNDERLINE)
if is_first:
text = text[1:]

tokens = self.sp_model.encode(text, out_type=str)

if not self.legacy and not is_first and not text.startswith(" ") and tokens[0].startswith(SPIECE_UNDERLINE):
tokens = ([tokens[0][1:]] if len(tokens[0]) > 1 else []) + tokens[1:]
return tokens

def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
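To illustrate what the non-legacy `_tokenize` post-processing above is doing, here is a small self-contained sketch (the function name and example outputs are illustrative, not part of the patch):

SPIECE_UNDERLINE = "▁"

def strip_dummy_prefix(tokens, chunk_started_with_space):
    # sentencepiece always prepends a dummy "▁" to whatever it encodes. When the
    # chunk is an inner subsequence (it did not really start with a space), that
    # extra prefix has to be removed again, as in the patched `_tokenize`.
    if chunk_started_with_space or not tokens or not tokens[0].startswith(SPIECE_UNDERLINE):
        return tokens
    first = tokens[0][1:]
    return ([first] if first else []) + tokens[1:]

# Hypothetical sp_model.encode(".") output for the chunk that follows "<extra_id_0>":
print(strip_dummy_prefix(["▁", "."], chunk_started_with_space=False))  # ['.']
print(strip_dummy_prefix(["▁Hello"], chunk_started_with_space=True))   # ['▁Hello']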
58 changes: 52 additions & 6 deletions src/transformers/models/t5/tokenization_t5.py
@@ -106,6 +106,28 @@ class T5Tokenizer(PreTrainedTokenizer):
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
legacy (`bool`, *optional*, defaults to `True`):
Whether or not the `legacy` behaviour of the tokenizer should be used. Legacy is before the merge of #24622
which includes fixes to properly handle tokens that appear after special tokens. A simple example:
- `legacy=True`:
```python
>>> from transformers import T5Tokenizer
>>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=True)
>>> tokenizer.encode("Hello <extra_id_0>.")
[8774, 32099, 3, 5, 1]
```
- `legacy=False`:
```python
>>> from transformers import T5Tokenizer
>>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
>>> tokenizer.encode("Hello <extra_id_0>.") # the extra space `[3]` is no longer here
[8774, 32099, 5, 1]
```
Check out the pull request and the issue [here](https://github.com/huggingface/transformers/pull/24565) for
more details.
Attributes:
sp_model (`SentencePieceProcessor`):
@@ -126,6 +148,7 @@ def __init__(
extra_ids=100,
additional_special_tokens=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
legacy=True,
**kwargs,
) -> None:
# Add extra_ids to the special token list
@@ -140,7 +163,13 @@
" provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids"
" tokens"
)
if legacy:
logger.warning_once(
f"You are using the legacy behaviour of the {self.__class__}. This means that tokens that come after special tokens will not be properly handled. We recommend you to"
" read the related pull request available at https://github.com/huggingface/transformers/pull/24565"
)

self.legacy = legacy
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

super().__init__(
@@ -150,6 +179,7 @@
extra_ids=extra_ids,
additional_special_tokens=additional_special_tokens,
sp_model_kwargs=self.sp_model_kwargs,
legacy=legacy,
**kwargs,
)

@@ -301,15 +331,31 @@ def __setstate__(self, d):
self.sp_model.Load(self.vocab_file)

def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
if not text.startswith(" "):
text = " " + text
# Replace the SPIECE_UNDERLINE with a space to make sure SPIECE_UNDERLINE is only used at
# the beginning of the text
if not self.legacy:
text = SPIECE_UNDERLINE + text.replace(SPIECE_UNDERLINE, " ")
return super().tokenize(text, **kwargs)

def _tokenize(self, text: str) -> List[str]:
"""Take as input a string and return a list of strings (tokens) for words/sub-words"""
def _tokenize(self, text, **kwargs):
"""
Returns a tokenized string.
Since the sentencepiece internal model always adds a SPIECE_UNDERLINE at the beginning of the provided text,
we need to remove it by hand when the current text is a subsequence. This happens whenever the `self.tokenize`
function is called with special tokens: the input is split on the special tokens, and each subsequence is
passed to `_tokenize`. Thus if a subsequence did not start with a `" "` or SPIECE_UNDERLINE, we have to remove
the extra `SPIECE_UNDERLINE` prepended.
"""
if not self.legacy:
is_first = text.startswith(SPIECE_UNDERLINE)
if is_first:
text = text[1:]

tokens = self.sp_model.encode(text, out_type=str)
if not text.startswith(" ") and tokens[0] == SPIECE_UNDERLINE:
tokens = tokens[1:]

if not self.legacy and not is_first and not text.startswith(" ") and tokens[0].startswith(SPIECE_UNDERLINE):
tokens = ([tokens[0][1:]] if len(tokens[0]) > 1 else []) + tokens[1:]
return tokens

def _convert_token_to_id(self, token):
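A quick sanity check of the behaviour difference documented in the docstring above, as a sketch (it assumes the `t5-base` checkpoint can be downloaded; the expected IDs are the ones from the docstring examples):

from transformers import T5Tokenizer

tok_legacy = T5Tokenizer.from_pretrained("t5-base", legacy=True)
tok_fixed = T5Tokenizer.from_pretrained("t5-base", legacy=False)

# Legacy inserts an extra "▁" (id 3) after the special token; the fix removes it.
assert tok_legacy.encode("Hello <extra_id_0>.") == [8774, 32099, 3, 5, 1]
assert tok_fixed.encode("Hello <extra_id_0>.") == [8774, 32099, 5, 1]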
8 changes: 8 additions & 0 deletions src/transformers/testing_utils.py
@@ -77,6 +77,7 @@
is_safetensors_available,
is_scipy_available,
is_sentencepiece_available,
is_seqio_available,
is_soundfile_availble,
is_spacy_available,
is_sudachi_available,
@@ -442,6 +443,13 @@ def require_sentencepiece(test_case):
return unittest.skipUnless(is_sentencepiece_available(), "test requires SentencePiece")(test_case)


def require_seqio(test_case):
"""
Decorator marking a test that requires seqio. These tests are skipped when seqio isn't installed.
"""
return unittest.skipUnless(is_seqio_available(), "test requires Seqio")(test_case)


def require_scipy(test_case):
"""
Decorator marking a test that requires Scipy. These tests are skipped when SentencePiece isn't installed.
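A hypothetical test showing how the new `require_seqio` decorator is meant to be used (the class and method names are made up for this sketch):

import unittest

from transformers.testing_utils import require_seqio

class SeqioParityTest(unittest.TestCase):  # hypothetical test case
    @require_seqio
    def test_needs_seqio(self):
        # Skipped automatically by unittest when seqio is not installed.
        import seqio  # noqa: F401
        self.assertTrue(True)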
10 changes: 1 addition & 9 deletions src/transformers/utils/__init__.py
@@ -142,6 +142,7 @@
is_sagemaker_mp_enabled,
is_scipy_available,
is_sentencepiece_available,
is_seqio_available,
is_sklearn_available,
is_soundfile_availble,
is_spacy_available,
@@ -177,15 +178,6 @@
)


if is_protobuf_available():
Collaborator: Why is this deleted?

ArthurZucker (author): Because it's messing up the import. By deleting it here, we can at least run the test individually.

ydshieh (Collaborator), Jul 11, 2023: I am not sure what's messed up. With the fix in #24689, everything should be fine.

Also, moving this to another place might count as a breaking change, as

from transformers.utils import sentencepiece_model_pb2

won't work anymore (despite the direct such usage might be low). See here.

Could we bring it back quickly, and you open an issue about the import error you got? Thanks!

ArthurZucker (author): Having a look.

ArthurZucker (author): Also `from transformers.utils import sentencepiece_model_pb2` works for me.

ArthurZucker (author): Comes from #24690.

ArthurZucker (author): The idea is that we only import it once, where we actually use it.

ArthurZucker (author), Jul 11, 2023: This does not fix the version error, but it fixes the issue with protobuf 3.20.3, where we cannot use seqio or anything else that imports protobuf:

! pip install protobuf==3.20.3
from transformers import AutoTokenizer
from seqio import SentencePieceVocabulary

In [3]: from seqio import SentencePieceVocabulary
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[3], line 1
----> 1 from seqio import SentencePieceVocabulary

File /opt/conda/envs/py39/lib/python3.9/site-packages/seqio/__init__.py:18
     15 """Import to top-level API."""
     16 # pylint:disable=wildcard-import,g-bad-import-order
---> 18 from seqio.dataset_providers import *
     19 from seqio import evaluation
     20 from seqio import experimental

File /opt/conda/envs/py39/lib/python3.9/site-packages/seqio/dataset_providers.py:39
     37 from packaging import version as version_lib
     38 import pyglove as pg
---> 39 from seqio import metrics as metrics_lib
     40 from seqio import preprocessors as seqio_preprocessors
     41 from seqio import task_registry_provenance_tracking

File /opt/conda/envs/py39/lib/python3.9/site-packages/seqio/metrics.py:27
     25 import jax.numpy as jnp
     26 import numpy as np
---> 27 from seqio import utils
     28 import tensorflow.compat.v2 as tf
     31 @dataclasses.dataclass
     32 class MetricValue:

File /opt/conda/envs/py39/lib/python3.9/site-packages/seqio/utils.py:29
     27 from absl import logging
     28 import numpy as np
---> 29 from seqio.vocabularies import Vocabulary
     30 import tensorflow.compat.v2 as tf
     31 import tensorflow_datasets as tfds

File /opt/conda/envs/py39/lib/python3.9/site-packages/seqio/vocabularies.py:27
     24 import tensorflow.compat.v2 as tf
     25 import tensorflow_text as tf_text
---> 27 from sentencepiece import sentencepiece_model_pb2
     28 import sentencepiece as sentencepiece_processor
     30 PAD_ID = 0

File /opt/conda/envs/py39/lib/python3.9/site-packages/sentencepiece/sentencepiece_model_pb2.py:16
      9 # @@protoc_insertion_point(imports)
     11 _sym_db = _symbol_database.Default()
---> 16 DESCRIPTOR = _descriptor.FileDescriptor(
     17   name='sentencepiece_model.proto',
     18   package='sentencepiece',
     19   syntax='proto2',
     20   serialized_options=b'H\003',
     21   create_key=_descriptor._internal_create_key,
     22   serialized_pb=b'\n\x19sentencepiece_model.proto\x12\rsentencepiece\"\x80\x0c\n\x0bTrainerSpec\x12\r\n\x05input\x18\x01 \x03(\t\x12\x14\n\x0cinput_format\x18\x07 \x01(\t\x12\x14\n\x0cmodel_prefix\x18\x02 \x01(\t\x12\x41\n\nmodel_type\x18\x03 \x01(\x0e\x32$.sentencepiece.TrainerSpec.ModelType:\x07UNIGRAM\x12\x18\n\nvocab_size\x18\x04 \x01(\x05:\x04\x38\x30\x30\x30\x12\x17\n\x0f\x61\x63\x63\x65pt_language\x18\x05 \x03(\t\x12 \n\x15self_test_sample_size\x18\x06 \x01(\x05:\x01\x30\x12*\n\x1b\x65nable_differential_privacy\x18\x32 \x01(\x08:\x05\x66\x61lse\x12+\n differential_privacy_noise_level\x18\x33 \x01(\x02:\x01\x30\x12\x32\n\'differential_privacy_clipping_threshold\x18\x34 \x01(\x04:\x01\x30\x12\"\n\x12\x63haracter_coverage\x18\n \x01(\x02:\x06\x30.9995\x12\x1e\n\x13input_sentence_size\x18\x0b \x01(\x04:\x01\x30\x12$\n\x16shuffle_input_sentence\x18\x13 \x01(\x08:\x04true\x12 \n\x14mining_sentence_size\x18\x0c \x01(\x05\x42\x02\x18\x01\x12\"\n\x16training_sentence_size\x18\r \x01(\x05\x42\x02\x18\x01\x12(\n\x17seed_sentencepiece_size\x18\x0e \x01(\x05:\x07\x31\x30\x30\x30\x30\x30\x30\x12\x1e\n\x10shrinking_factor\x18\x0f \x01(\x02:\x04\x30.75\x12!\n\x13max_sentence_length\x18\x12 \x01(\x05:\x04\x34\x31\x39\x32\x12\x17\n\x0bnum_threads\x18\x10 \x01(\x05:\x02\x31\x36\x12\x1d\n\x12num_sub_iterations\x18\x11 \x01(\x05:\x01\x32\x12$\n\x18max_sentencepiece_length\x18\x14 \x01(\x05:\x02\x31\x36\x12%\n\x17split_by_unicode_script\x18\x15 \x01(\x08:\x04true\x12\x1d\n\x0fsplit_by_number\x18\x17 \x01(\x08:\x04true\x12!\n\x13split_by_whitespace\x18\x16 \x01(\x08:\x04true\x12)\n\x1atreat_whitespace_as_suffix\x18\x18 \x01(\x08:\x05\x66\x61lse\x12+\n\x1c\x61llow_whitespace_only_pieces\x18\x1a \x01(\x08:\x05\x66\x61lse\x12\x1b\n\x0csplit_digits\x18\x19 \x01(\x08:\x05\x66\x61lse\x12#\n\x19pretokenization_delimiter\x18\x35 \x01(\t:\x00\x12\x17\n\x0f\x63ontrol_symbols\x18\x1e \x03(\t\x12\x1c\n\x14user_defined_symbols\x18\x1f \x03(\t\x12\x16\n\x0erequired_chars\x18$ \x01(\t\x12\x1c\n\rbyte_fallback\x18# \x01(\x08:\x05\x66\x61lse\x12+\n\x1dvocabulary_output_piece_score\x18  \x01(\x08:\x04true\x12\x1e\n\x10hard_vocab_limit\x18! \x01(\x08:\x04true\x12\x1c\n\ruse_all_vocab\x18\" \x01(\x08:\x05\x66\x61lse\x12\x11\n\x06unk_id\x18( \x01(\x05:\x01\x30\x12\x11\n\x06\x62os_id\x18) \x01(\x05:\x01\x31\x12\x11\n\x06\x65os_id\x18* \x01(\x05:\x01\x32\x12\x12\n\x06pad_id\x18+ \x01(\x05:\x02-1\x12\x18\n\tunk_piece\x18- \x01(\t:\x05<unk>\x12\x16\n\tbos_piece\x18. 
\x01(\t:\x03<s>\x12\x17\n\teos_piece\x18/ \x01(\t:\x04</s>\x12\x18\n\tpad_piece\x18\x30 \x01(\t:\x05<pad>\x12\x1a\n\x0bunk_surface\x18, \x01(\t:\x05 \xe2\x81\x87 \x12+\n\x1ctrain_extremely_large_corpus\x18\x31 \x01(\x08:\x05\x66\x61lse\"5\n\tModelType\x12\x0b\n\x07UNIGRAM\x10\x01\x12\x07\n\x03\x42PE\x10\x02\x12\x08\n\x04WORD\x10\x03\x12\x08\n\x04\x43HAR\x10\x04*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xd1\x01\n\x0eNormalizerSpec\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1c\n\x14precompiled_charsmap\x18\x02 \x01(\x0c\x12\x1e\n\x10\x61\x64\x64_dummy_prefix\x18\x03 \x01(\x08:\x04true\x12&\n\x18remove_extra_whitespaces\x18\x04 \x01(\x08:\x04true\x12 \n\x12\x65scape_whitespaces\x18\x05 \x01(\x08:\x04true\x12\x1e\n\x16normalization_rule_tsv\x18\x06 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"y\n\x0cSelfTestData\x12\x33\n\x07samples\x18\x01 \x03(\x0b\x32\".sentencepiece.SelfTestData.Sample\x1a)\n\x06Sample\x12\r\n\x05input\x18\x01 \x01(\t\x12\x10\n\x08\x65xpected\x18\x02 \x01(\t*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\"\xfe\x03\n\nModelProto\x12\x37\n\x06pieces\x18\x01 \x03(\x0b\x32\'.sentencepiece.ModelProto.SentencePiece\x12\x30\n\x0ctrainer_spec\x18\x02 \x01(\x0b\x32\x1a.sentencepiece.TrainerSpec\x12\x36\n\x0fnormalizer_spec\x18\x03 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x12\x33\n\x0eself_test_data\x18\x04 \x01(\x0b\x32\x1b.sentencepiece.SelfTestData\x12\x38\n\x11\x64\x65normalizer_spec\x18\x05 \x01(\x0b\x32\x1d.sentencepiece.NormalizerSpec\x1a\xd2\x01\n\rSentencePiece\x12\r\n\x05piece\x18\x01 \x01(\t\x12\r\n\x05score\x18\x02 \x01(\x02\x12\x42\n\x04type\x18\x03 \x01(\x0e\x32,.sentencepiece.ModelProto.SentencePiece.Type:\x06NORMAL\"T\n\x04Type\x12\n\n\x06NORMAL\x10\x01\x12\x0b\n\x07UNKNOWN\x10\x02\x12\x0b\n\x07\x43ONTROL\x10\x03\x12\x10\n\x0cUSER_DEFINED\x10\x04\x12\x08\n\x04\x42YTE\x10\x06\x12\n\n\x06UNUSED\x10\x05*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02*\t\x08\xc8\x01\x10\x80\x80\x80\x80\x02\x42\x02H\x03'
     23 )
     27 _TRAINERSPEC_MODELTYPE = _descriptor.EnumDescriptor(
     28   name='ModelType',
     29   full_name='sentencepiece.TrainerSpec.ModelType',
   (...)
     58   serialized_end=1570,
     59 )
     60 _sym_db.RegisterEnumDescriptor(_TRAINERSPEC_MODELTYPE)

File /opt/conda/envs/py39/lib/python3.9/site-packages/google/protobuf/descriptor.py:1024, in FileDescriptor.__new__(cls, name, package, options, serialized_options, serialized_pb, dependencies, public_dependencies, syntax, pool, create_key)
   1022     raise RuntimeError('Please link in cpp generated lib for %s' % (name))
   1023 elif serialized_pb:
-> 1024   return _message.default_pool.AddSerializedFile(serialized_pb)
   1025 else:
   1026   return super(FileDescriptor, cls).__new__(cls)

TypeError: Couldn't build proto file into descriptor pool!
Invalid proto descriptor for file "sentencepiece_model.proto":
  sentencepiece_model.proto: A file with this name is already in the pool.

Collaborator: Hi, thanks for the information. Could you provide the full trace 🙏. It might be useful.

ArthurZucker (author): Just added it!

import google.protobuf

if version.parse(google.protobuf.__version__) < version.parse("4.0.0"):
from . import sentencepiece_model_pb2
else:
from . import sentencepiece_model_pb2_new as sentencepiece_model_pb2


WEIGHTS_NAME = "pytorch_model.bin"
WEIGHTS_INDEX_NAME = "pytorch_model.bin.index.json"
ADAPTER_CONFIG_NAME = "adapter_config.json"
5 changes: 5 additions & 0 deletions src/transformers/utils/import_utils.py
@@ -112,6 +112,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[
_safetensors_available = _is_package_available("safetensors")
_scipy_available = _is_package_available("scipy")
_sentencepiece_available = _is_package_available("sentencepiece")
_is_seqio_available = _is_package_available("seqio")
_sklearn_available = importlib.util.find_spec("sklearn") is not None
if _sklearn_available:
try:
@@ -507,6 +508,10 @@ def is_sentencepiece_available():
return _sentencepiece_available


def is_seqio_available():
return _is_seqio_available


def is_protobuf_available():
if importlib.util.find_spec("google") is None:
return False
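A minimal sketch of guarding an optional seqio import with the new availability check (mirroring how the other `is_*_available` helpers are used; not part of the diff):

from transformers.utils import is_seqio_available

if is_seqio_available():
    # Only import seqio when it is actually installed.
    from seqio import SentencePieceVocabulary
else:
    SentencePieceVocabulary = None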