From b7b6ea3b6e89fd98ac4083df37f9bc8f937014bc Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Tue, 2 Aug 2022 18:08:33 +0200
Subject: [PATCH 1/4] [TENTATIVE] Attempt to reduce number of HEAD calls during model warmup.

---
 src/transformers/utils/hub.py | 39 ++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py
index 9e81654cda7e1a..40c55be8246e95 100644
--- a/src/transformers/utils/hub.py
+++ b/src/transformers/utils/hub.py
@@ -16,6 +16,7 @@
 """
 import copy
 import fnmatch
+import functools
 import io
 import json
 import os
@@ -26,6 +27,7 @@
 import tarfile
 import tempfile
 import traceback
+import time
 import warnings
 from contextlib import contextmanager
 from functools import partial
@@ -456,6 +458,35 @@ def http_get(
             progress.close()
 
 
+def timed_cache(ttl):
+    cache = {}
+
+    def true_decorator(f):
+        @functools.wraps(f)
+        def wrapped(url, headers, *args, **kwargs):
+
+            key = (url, headers.get("authorization", None))
+            result_time, result = cache.get(key, (None, None))
+            if result_time is not None and (time.time() - result_time) < ttl:
+                return result
+            result = f(url, headers, *args, **kwargs)
+            cache[key] = (time.time(), result)
+            return result
+
+        return wrapped
+
+    return true_decorator
+
+
+@timed_cache(ttl=10)
+def request_head(url, headers, allow_redirects, proxies, timeout):
+
+    print("----")
+    print(f"Fetching {url}")
+    r = requests.head(url, headers=headers, allow_redirects=allow_redirects, proxies=proxies, timeout=timeout)
+    return r
+
+
 def get_from_cache(
     url: str,
     cache_dir=None,
@@ -497,7 +528,13 @@
     etag = None
     if not local_files_only:
         try:
-            r = requests.head(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=etag_timeout)
+            r = request_head(
+                url,
+                headers=headers,
+                allow_redirects=False,
+                proxies=proxies,
+                timeout=etag_timeout,
+            )
             _raise_for_status(r)
             etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag")
             # We favor a custom header indicating the etag of the linked resource, and
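
The core of [PATCH 1/4] is the `timed_cache` decorator: a small time-based memoization wrapper keyed on the URL and the authorization header, so repeated HEAD requests for the same file within the TTL window collapse into a single network call. A standalone sketch of the same pattern (simplified names, not the exact transformers code):

    import functools
    import time


    def timed_cache(ttl):
        """Cache each call's result per (url, authorization) key for `ttl` seconds."""
        cache = {}

        def decorator(f):
            @functools.wraps(f)
            def wrapped(url, headers, *args, **kwargs):
                # Keying on the auth header keeps responses from leaking
                # between callers that use different tokens.
                key = (url, headers.get("authorization"))
                cached_at, result = cache.get(key, (None, None))
                if cached_at is not None and (time.time() - cached_at) < ttl:
                    return result
                result = f(url, headers, *args, **kwargs)
                cache[key] = (time.time(), result)
                return result

            return wrapped

        return decorator


    @timed_cache(ttl=10)
    def fetch(url, headers):
        print(f"actually fetching {url}")
        return f"response for {url}"


    headers = {"authorization": "Bearer dummy"}
    fetch("https://example.com/config.json", headers)  # performs the call, prints
    fetch("https://example.com/config.json", headers)  # silent: served from cache

Within the TTL window, identical lookups for the same URL and token are served from memory, which is exactly what cuts down the HEAD calls during model warmup.
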
From 0b7794d05ec00ecdb6409a8488775c32e37a44ee Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Fri, 5 Aug 2022 16:04:32 +0200
Subject: [PATCH 2/4] Dirty fix of metadata passing.

---
 src/transformers/models/auto/auto_factory.py  |   4 +-
 .../models/auto/tokenization_auto.py          |  11 +-
 src/transformers/pipelines/__init__.py        |   5 +-
 src/transformers/tokenization_utils_base.py   |   4 +-
 src/transformers/utils/hub.py                 |  19 +-
 tests/utils/test_file_utils.py                | 209 +++++++++++++++++-
 6 files changed, 240 insertions(+), 12 deletions(-)

diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py
index b7d8f66c339dd4..b139e284a28420 100644
--- a/src/transformers/models/auto/auto_factory.py
+++ b/src/transformers/models/auto/auto_factory.py
@@ -420,7 +420,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         trust_remote_code = kwargs.pop("trust_remote_code", False)
         kwargs["_from_auto"] = True
         if not isinstance(config, PretrainedConfig):
-            config, kwargs = AutoConfig.from_pretrained(
+            # TODO this seems to eat up from_auto_class and `_from_pipeline` kwargs
+            # which are necessary for tracking.
+            config, _kwargs = AutoConfig.from_pretrained(
                 pretrained_model_name_or_path, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **kwargs
             )
         if hasattr(config, "auto_map") and cls.__name__ in config.auto_map:
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index 7a2dc2941fdd09..c9c8a7fa3e1fbc 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -389,6 +389,14 @@
     tokenizer.save_pretrained("tokenizer-test")
     tokenizer_config = get_tokenizer_config("tokenizer-test")
     ```"""
+    user_agent = {
+        "file_type": "tokenizer",
+        "from_auto_class": kwargs.get("_from_auto", False),
+        "is_fast": kwargs.get("is_fast", False),
+    }
+    if "_from_pipeline" in kwargs:
+        user_agent["using_pipeline"] = kwargs.get("_from_pipeline")
+
     resolved_config_file = get_file_from_repo(
         pretrained_model_name_or_path,
         TOKENIZER_CONFIG_FILE,
@@ -399,6 +407,7 @@
         use_auth_token=use_auth_token,
         revision=revision,
         local_files_only=local_files_only,
+        user_agent=user_agent,
     )
     if resolved_config_file is None:
         logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.")
@@ -502,7 +511,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
         config = kwargs.pop("config", None)
         kwargs["_from_auto"] = True
 
-        use_fast = kwargs.pop("use_fast", True)
+        use_fast = kwargs.get("use_fast", True)
         tokenizer_type = kwargs.pop("tokenizer_type", None)
         trust_remote_code = kwargs.pop("trust_remote_code", False)
diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
index d2a4b663801d78..5456ff8a31bd13 100755
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -579,10 +579,11 @@ def pipeline(
 
     # Config is the primordial information item.
     # Instantiate config if needed
+    _from_pipeline = "auto" if task is None else task
     if isinstance(config, str):
-        config = AutoConfig.from_pretrained(config, _from_pipeline=task, **hub_kwargs, **model_kwargs)
+        config = AutoConfig.from_pretrained(config, _from_pipeline=_from_pipeline, **hub_kwargs, **model_kwargs)
     elif config is None and isinstance(model, str):
-        config = AutoConfig.from_pretrained(model, _from_pipeline=task, **hub_kwargs, **model_kwargs)
+        config = AutoConfig.from_pretrained(model, _from_pipeline=_from_pipeline, **hub_kwargs, **model_kwargs)
 
     custom_tasks = {}
     if config is not None and len(getattr(config, "custom_pipelines", {})) > 0:
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index fc1c0ff8da3b32..919d01422fbdda 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1701,6 +1701,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike],
                 revision=revision,
                 local_files_only=local_files_only,
                 subfolder=subfolder,
+                user_agent=user_agent,
             )
             if resolved_config_file is not None:
                 with open(resolved_config_file, encoding="utf-8") as reader:
@@ -3196,8 +3197,7 @@ def truncate_sequences(
             )
             if truncation_strategy == TruncationStrategy.ONLY_FIRST:
                 error_msg = (
-                    error_msg
-                    + "Please select another truncation strategy than "
+                    error_msg + "Please select another truncation strategy than "
                     f"{truncation_strategy}, for instance 'longest_first' or 'only_second'."
                 )
                 logger.error(error_msg)
diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py
index 40c55be8246e95..775f7b4ead8d7a 100644
--- a/src/transformers/utils/hub.py
+++ b/src/transformers/utils/hub.py
@@ -473,20 +473,27 @@ def wrapped(url, headers, *args, **kwargs):
             cache[key] = (time.time(), result)
             return result
 
+        def clear_cache():
+            nonlocal cache
+            cache.clear()
+
+        wrapped.clear_cache = clear_cache
+
         return wrapped
 
     return true_decorator
 
 
-@timed_cache(ttl=10)
-def request_head(url, headers, allow_redirects, proxies, timeout):
-
-    print("----")
-    print(f"Fetching {url}")
+def _request_head(url, headers, allow_redirects, proxies, timeout):
     r = requests.head(url, headers=headers, allow_redirects=allow_redirects, proxies=proxies, timeout=timeout)
     return r
 
 
+@timed_cache(ttl=10)
+def request_head(url, headers, allow_redirects, proxies, timeout):
+    return _request_head(url, headers, allow_redirects, proxies, timeout)
+
+
 def get_from_cache(
     url: str,
     cache_dir=None,
@@ -881,6 +888,7 @@ def get_file_from_repo(
     revision: Optional[str] = None,
     local_files_only: bool = False,
     subfolder: str = "",
+    user_agent: Dict[str, str] = None,
 ):
     """
     Tries to locate a file in a local folder and repo, downloads and cache it if necessary.
@@ -948,6 +956,7 @@
         subfolder=subfolder,
         _raise_exceptions_for_missing_entries=False,
         _raise_exceptions_for_connection_errors=False,
+        user_agent=user_agent,
     )
diff --git a/tests/utils/test_file_utils.py b/tests/utils/test_file_utils.py
index 19adfe21dd4bf6..5abcd00dd2e3da 100644
--- a/tests/utils/test_file_utils.py
+++ b/tests/utils/test_file_utils.py
@@ -24,7 +24,8 @@
 # Try to import everything from transformers to ensure every object can be loaded.
 from transformers import *  # noqa F406
-from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER
+from transformers import AutoFeatureExtractor, AutoModel, AutoTokenizer, pipeline
+from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, require_tokenizers, require_torch
 from transformers.utils import (
     CONFIG_NAME,
     FLAX_WEIGHTS_NAME,
@@ -218,3 +219,209 @@ def test_find_labels(self):
         self.assertEqual(find_labels(FlaxBertForSequenceClassification), [])
         self.assertEqual(find_labels(FlaxBertForPreTraining), [])
         self.assertEqual(find_labels(FlaxBertForQuestionAnswering), [])
+
+
+class TelemetryTest(unittest.TestCase):
+    url = "https://huggingface.co/hf-internal-testing/tiny-random-gpt2/resolve/main/"
+
+    def setUp(self):
+        import transformers.utils.hub
+
+        self.real_request_head = transformers.utils.hub._request_head
+
+        self.calls = []
+
+        def counter_request_head(*args, **kwargs):
+            self.calls.append([args, kwargs])
+            return self.real_request_head(*args, **kwargs)
+
+        transformers.utils.hub._request_head = counter_request_head
+
+    def tearDown(self):
+        transformers.utils.hub._request_head = self.real_request_head
+        transformers.utils.hub.request_head.clear_cache()
+
+    @require_torch
+    def test_pipeline(self):
+        pipeline(task="text-generation", model="hf-internal-testing/tiny-random-gpt2")
+
+        files = [args[0][len(self.url) :] for args, _ in self.calls]
+        self.assertEqual(
+            files,
+            [
+                "config.json",
+                "pytorch_model.bin",
+                "tokenizer_config.json",
+                "vocab.json",
+                "merges.txt",
+                "tokenizer.json",
+                "added_tokens.json",
+                "special_tokens_map.json",
+            ],
+        )
+
+        for file, (args, _) in zip(files, self.calls):
+            headers = args[1]
+            user_agent = headers["user-agent"]
+
+            self.assertIn(
+                "using_pipeline/text-generation",
+                user_agent,
+                f"user_agent is incorrect for file {file}",
{file}", + ) + + @require_tokenizers + def test_tokenizer(self): + AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") + + files = [args[0][len(self.url) :] for args, _ in self.calls] + self.assertEqual( + files, + [ + "tokenizer_config.json", + "vocab.json", + "merges.txt", + "tokenizer.json", + "added_tokens.json", + "special_tokens_map.json", + ], + ) + + for file, (args, _) in zip(files, self.calls): + headers = args[1] + user_agent = headers["user-agent"] + + self.assertIn( + "file_type/tokenizer", + user_agent, + f"user_agent is incorrect for file {file}", + ) + self.assertIn( + "is_fast/True", + user_agent, + f"user_agent is incorrect for file {file}", + ) + + def test_slow_tokenizer(self): + AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2", use_fast=False) + + files = [args[0][len(self.url) :] for args, _ in self.calls] + self.assertEqual( + files, + [ + "tokenizer_config.json", + "vocab.json", + "merges.txt", + "added_tokens.json", + "special_tokens_map.json", + ], + ) + + for file, (args, _) in zip(files, self.calls): + headers = args[1] + user_agent = headers["user-agent"] + + self.assertIn( + "file_type/tokenizer", + user_agent, + f"user_agent is incorrect for file {file}", + ) + self.assertIn( + "from_auto_class/True", + user_agent, + f"user_agent is incorrect for file {file}", + ) + self.assertIn( + "is_fast/False", + user_agent, + f"user_agent is incorrect for file {file}", + ) + + @require_torch + def test_model(self): + AutoModel.from_pretrained("hf-internal-testing/tiny-random-gpt2") + + files = [args[0][len(self.url) :] for args, _ in self.calls] + self.assertEqual( + files, + [ + "config.json", + "pytorch_model.bin", + ], + ) + + for file, (args, _) in zip(files, self.calls): + if file == "config.json": + # Skip config + continue + headers = args[1] + user_agent = headers["user-agent"] + + self.assertIn( + "file_type/model", + user_agent, + f"user_agent is incorrect for file {file}", + ) + self.assertIn( + "from_auto_class/True", + user_agent, + f"user_agent is incorrect for file {file}", + ) + + @require_torch + def test_model_raw(self): + from transformers import GPT2Model + + GPT2Model.from_pretrained("hf-internal-testing/tiny-random-gpt2") + + files = [args[0][len(self.url) :] for args, _ in self.calls] + self.assertEqual( + files, + [ + "config.json", + "pytorch_model.bin", + ], + ) + + for file, (args, _) in zip(files, self.calls): + if file == "config.json": + # Skip config + continue + headers = args[1] + user_agent = headers["user-agent"] + + self.assertIn( + "file_type/model", + user_agent, + f"user_agent is incorrect for file {file}", + ) + self.assertIn( + "from_auto_class/False", + user_agent, + f"user_agent is incorrect for file {file}", + ) + + def test_feature_extractor(self): + AutoFeatureExtractor.from_pretrained("hf-internal-testing/tiny-random-wav2vec2") + url = "https://huggingface.co/hf-internal-testing/tiny-random-wav2vec2/resolve/main/" + + files = [args[0][len(url) :] for args, _ in self.calls] + self.assertEqual( + files, + ["preprocessor_config.json"], + ) + + for file, (args, _) in zip(files, self.calls): + headers = args[1] + user_agent = headers["user-agent"] + + self.assertIn( + "file_type/feature extractor", + user_agent, + f"user_agent is incorrect for file {file}", + ) + self.assertIn( + "from_auto_class/True", + user_agent, + f"user_agent is incorrect for file {file}", + ) From 7d3e800ad9b2976375be84874a4a48386df3f117 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Fri, 5 Aug 2022 16:50:34 +0200 
From 7d3e800ad9b2976375be84874a4a48386df3f117 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Fri, 5 Aug 2022 16:50:34 +0200
Subject: [PATCH 3/4] Adding the tests to check number of calls.

---
 tests/utils/test_file_utils.py | 51 +++++++++++++++-------------------
 1 file changed, 22 insertions(+), 29 deletions(-)

diff --git a/tests/utils/test_file_utils.py b/tests/utils/test_file_utils.py
index 5abcd00dd2e3da..dca99e7c72d9b7 100644
--- a/tests/utils/test_file_utils.py
+++ b/tests/utils/test_file_utils.py
@@ -20,6 +20,7 @@
 import unittest
 from pathlib import Path
 
+import requests
 import transformers
 
 # Try to import everything from transformers to ensure every object can be loaded.
@@ -225,27 +226,25 @@ class TelemetryTest(unittest.TestCase):
     url = "https://huggingface.co/hf-internal-testing/tiny-random-gpt2/resolve/main/"
 
     def setUp(self):
-        import transformers.utils.hub
-
-        self.real_request_head = transformers.utils.hub._request_head
+        self.real_request = requests.request
 
         self.calls = []
 
        def counter_request_head(*args, **kwargs):
            self.calls.append([args, kwargs])
-            return self.real_request_head(*args, **kwargs)
+            return self.real_request(*args, **kwargs)
 
-        transformers.utils.hub._request_head = counter_request_head
+        requests.request = counter_request_head
 
     def tearDown(self):
-        transformers.utils.hub._request_head = self.real_request_head
-        transformers.utils.hub.request_head.clear_cache()
+        requests.request = self.real_request
 
     @require_torch
     def test_pipeline(self):
         pipeline(task="text-generation", model="hf-internal-testing/tiny-random-gpt2")
 
-        files = [args[0][len(self.url) :] for args, _ in self.calls]
+        files = [kwargs["url"][len(self.url) :] for _, kwargs in self.calls]
         self.assertEqual(
             files,
             [
@@ -260,9 +259,8 @@
             ],
         )
 
-        for file, (args, _) in zip(files, self.calls):
-            headers = args[1]
-            user_agent = headers["user-agent"]
+        for file, (_, kwargs) in zip(files, self.calls):
+            user_agent = kwargs["headers"]["user-agent"]
 
             self.assertIn(
                 "using_pipeline/text-generation",
@@ -274,7 +272,7 @@
     def test_tokenizer(self):
         AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
 
-        files = [args[0][len(self.url) :] for args, _ in self.calls]
+        files = [kwargs["url"][len(self.url) :] for _, kwargs in self.calls]
         self.assertEqual(
             files,
             [
@@ -287,9 +285,8 @@ def test_tokenizer(self):
             ],
         )
 
-        for file, (args, _) in zip(files, self.calls):
-            headers = args[1]
-            user_agent = headers["user-agent"]
+        for file, (_, kwargs) in zip(files, self.calls):
+            user_agent = kwargs["headers"]["user-agent"]
 
             self.assertIn(
                 "file_type/tokenizer",
@@ -305,7 +302,7 @@ def test_tokenizer(self):
     def test_slow_tokenizer(self):
         AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2", use_fast=False)
 
-        files = [args[0][len(self.url) :] for args, _ in self.calls]
+        files = [kwargs["url"][len(self.url) :] for _, kwargs in self.calls]
         self.assertEqual(
             files,
             [
@@ -317,9 +314,8 @@ def test_slow_tokenizer(self):
             ],
         )
 
-        for file, (args, _) in zip(files, self.calls):
-            headers = args[1]
-            user_agent = headers["user-agent"]
+        for file, (_, kwargs) in zip(files, self.calls):
+            user_agent = kwargs["headers"]["user-agent"]
 
             self.assertIn(
                 "file_type/tokenizer",
@@ -341,7 +337,7 @@ def test_slow_tokenizer(self):
     def test_model(self):
         AutoModel.from_pretrained("hf-internal-testing/tiny-random-gpt2")
 
-        files = [args[0][len(self.url) :] for args, _ in self.calls]
+        files = [kwargs["url"][len(self.url) :] for _, kwargs in self.calls]
         self.assertEqual(
             files,
             [
@@ -350,12 +346,11 @@ def test_model(self):
             ],
         )
 
-        for file, (args, _) in zip(files, self.calls):
+        for file, (_, kwargs) in zip(files, self.calls):
             if file == "config.json":
                 # Skip config
                 continue
-            headers = args[1]
-            user_agent = headers["user-agent"]
+            user_agent = kwargs["headers"]["user-agent"]
 
             self.assertIn(
                 "file_type/model",
@@ -374,7 +369,7 @@ def test_model_raw(self):
 
         GPT2Model.from_pretrained("hf-internal-testing/tiny-random-gpt2")
 
-        files = [args[0][len(self.url) :] for args, _ in self.calls]
+        files = [kwargs["url"][len(self.url) :] for _, kwargs in self.calls]
         self.assertEqual(
             files,
             [
@@ -383,12 +378,11 @@ def test_model_raw(self):
             ],
         )
 
-        for file, (args, _) in zip(files, self.calls):
+        for file, (_, kwargs) in zip(files, self.calls):
             if file == "config.json":
                 # Skip config
                 continue
-            headers = args[1]
-            user_agent = headers["user-agent"]
+            user_agent = kwargs["headers"]["user-agent"]
 
             self.assertIn(
                 "file_type/model",
@@ -411,9 +405,8 @@
             ["preprocessor_config.json"],
         )
 
-        for file, (args, _) in zip(files, self.calls):
-            headers = args[1]
-            user_agent = headers["user-agent"]
+        for file, (_, kwargs) in zip(files, self.calls):
+            user_agent = kwargs["headers"]["user-agent"]
 
             self.assertIn(
                 "file_type/feature extractor",

From d67627fdd21f9a53adeef5b1d2fe68c8a0ff1ccc Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Mon, 8 Aug 2022 10:35:20 +0200
Subject: [PATCH 4/4] Attempt at fixing tests.

---
 src/transformers/models/auto/auto_factory.py | 6 +++++-
 src/transformers/utils/hub.py                | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py
index b139e284a28420..8a1621d3831845 100644
--- a/src/transformers/models/auto/auto_factory.py
+++ b/src/transformers/models/auto/auto_factory.py
@@ -422,9 +422,13 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         if not isinstance(config, PretrainedConfig):
             # TODO this seems to eat up from_auto_class and `_from_pipeline` kwargs
             # which are necessary for tracking.
-            config, _kwargs = AutoConfig.from_pretrained(
+            from_auto_class = kwargs.pop("_from_auto", None)
+            from_pipeline = kwargs.pop("_from_pipeline", None)
+            config, kwargs = AutoConfig.from_pretrained(
                 pretrained_model_name_or_path, return_unused_kwargs=True, trust_remote_code=trust_remote_code, **kwargs
             )
+            kwargs["_from_auto"] = from_auto_class
+            kwargs["_from_pipeline"] = from_pipeline
         if hasattr(config, "auto_map") and cls.__name__ in config.auto_map:
             if not trust_remote_code:
                 raise ValueError(
diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py
index 775f7b4ead8d7a..876f6b9656681b 100644
--- a/src/transformers/utils/hub.py
+++ b/src/transformers/utils/hub.py
@@ -26,8 +26,8 @@
 import sys
 import tarfile
 import tempfile
-import traceback
 import time
+import traceback
 import warnings
 from contextlib import contextmanager
 from functools import partial
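
The final patch works around `AutoConfig.from_pretrained` consuming every kwarg it recognizes, including the `_from_auto`/`_from_pipeline` tracking kwargs: they are popped before the call and reinserted into the returned unused kwargs afterwards. A toy illustration of that pop-and-restore pattern (all names hypothetical):

    def load_config(**kwargs):
        """Stand-in for AutoConfig.from_pretrained: swallows tracking kwargs."""
        kwargs.pop("_from_auto", None)  # consumed here and otherwise lost
        kwargs.pop("_from_pipeline", None)
        return {"model_type": "demo"}, kwargs  # (config, unused kwargs)


    def from_pretrained(**kwargs):
        # Save the tracking kwargs across the consuming call ...
        from_auto_class = kwargs.pop("_from_auto", None)
        from_pipeline = kwargs.pop("_from_pipeline", None)
        config, kwargs = load_config(**kwargs)
        # ... and put them back so downstream loaders still see them.
        kwargs["_from_auto"] = from_auto_class
        kwargs["_from_pipeline"] = from_pipeline
        return config, kwargs


    config, kwargs = from_pretrained(_from_auto=True, _from_pipeline="text-generation")
    assert kwargs == {"_from_auto": True, "_from_pipeline": "text-generation"}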