From 3696f64a225a03ce0ac2ae8dac8e1a1204e865cb Mon Sep 17 00:00:00 2001 From: "Manuel R. Ciosici" Date: Thu, 3 Feb 2022 06:43:30 -0500 Subject: [PATCH 01/17] Add initial BNB integration --- src/transformers/file_utils.py | 3 +++ src/transformers/trainer.py | 8 ++++++ src/transformers/training_args.py | 1 + tests/test_trainer.py | 41 +++++++++++++++++++++++++------ 4 files changed, 46 insertions(+), 7 deletions(-) diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 36d4a005a8d886..8dae6140a7a5c9 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -475,6 +475,9 @@ def is_py3nvml_available(): def is_apex_available(): return importlib.util.find_spec("apex") is not None +def is_bnb_available(): + return importlib.util.find_spec("bnb") is not None + def is_faiss_available(): return _faiss_available diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index fcc37919f3c6a9..f32b869c2ad2cc 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -897,6 +897,14 @@ def get_optimizer_cls_and_kwargs(args: TrainingArguments) -> Tuple[Any, Any]: optimizer_kwargs.update(adam_kwargs) except ImportError: raise ValueError("Trainer tried to instantiate apex FusedAdam but apex is not installed!") + elif args.optim == OptimizerNames.ADAM_BNB_8BIT: + try: + from bnb.optim import Adam8bit + + optimizer_cls = Adam8bit + optimizer_kwargs.update(adam_kwargs) + except ImportError: + raise ValueError("Trainer tried to instantiate bnb Adam8bit but bnb is not installed!") else: raise ValueError(f"Trainer cannot instantiate unsupported optimizer: {args.optim}") return optimizer_cls, optimizer_kwargs diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 0458110e41e57d..9e5a25f774ce46 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -80,6 +80,7 @@ class OptimizerNames(ExplicitEnum): ADAMW_TORCH_XLA = "adamw_torch_xla" ADAMW_APEX_FUSED = "adamw_apex_fused" ADAFACTOR = "adafactor" + ADAM_BNB_8BIT = "adam_bnb_8bit" @dataclass diff --git a/tests/test_trainer.py b/tests/test_trainer.py index cf275f127e6260..cd40aaba8e9bca 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -38,7 +38,7 @@ is_torch_available, logging, ) -from transformers.file_utils import WEIGHTS_NAME, is_apex_available +from transformers.file_utils import WEIGHTS_NAME, is_apex_available, is_bnb_available from transformers.testing_utils import ( ENDPOINT_STAGING, PASS, @@ -1752,13 +1752,13 @@ def hp_name(trial): }, ), ] - if is_apex_available(): - import apex + if is_bnb_available(): + import bnb optim_test_params.append( ( - OptimizerNames.ADAMW_APEX_FUSED, - apex.optimizers.FusedAdam, + OptimizerNames.ADAM_BNB_8BIT, + bnb.optim.Adam8bit, default_adam_kwargs, ) ) @@ -1787,8 +1787,8 @@ def test_optim_supported(self, name: str, expected_cls, mandatory_kwargs): def test_fused_adam(self): # Pretend that apex is installed and mock apex.optimizers.FusedAdam exists. - # Trainer.get_optimizer_cls_and_kwargs does not use FusedAdam, but only has to return a - # class called, so mocking apex.optimizers.FusedAdam should be fine for testing and allow + # Trainer.get_optimizer_cls_and_kwargs does not use FusedAdam. It only has to return the + # class given, so mocking apex.optimizers.FusedAdam should be fine for testing and allow # the test to run without requiring an apex installation. 
mock = Mock() modules = { @@ -1812,6 +1812,33 @@ def test_fused_adam_no_apex(self): with self.assertRaises(ValueError): Trainer.get_optimizer_cls_and_kwargs(args) + def test_bnb_adam8bit(self): + # Pretend that Bits and Bytes is installed and mock bnb.optim.Adam8bit exists. + # Trainer.get_optimizer_cls_and_kwargs does not use Adam8bit. It only has to return the + # class given, so mocking bnb.optim.Adam8bit should be fine for testing and allow + # the test to run without requiring a bnb installation. + mock = Mock() + modules = { + "bnb": mock, + "bnb.optim": mock.optim, + "bnb.optim.Adam8bit": mock.optim.Adam8bit, + } + with patch.dict("sys.modules", modules): + self.check_optim_and_kwargs( + OptimizerNames.ADAM_BNB_8BIT, + default_adam_kwargs, + mock.optim.Adam8bit, + ) + + def test_bnb_adam8bit_no_bnb(self): + args = TrainingArguments(optim=OptimizerNames.ADAM_BNB_8BIT, output_dir="None") + + # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing + # bnb will fail even if bnb is installed. + with patch.dict("sys.modules", {"bnb.optim": None}): + with self.assertRaises(ValueError): + Trainer.get_optimizer_cls_and_kwargs(args) + @require_torch @require_wandb From ba8790ca4d151666b444cec008cca07e9607ffe9 Mon Sep 17 00:00:00 2001 From: "Manuel R. Ciosici" Date: Thu, 3 Feb 2022 06:45:31 -0500 Subject: [PATCH 02/17] fixup! Add initial BNB integration --- tests/test_trainer.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_trainer.py b/tests/test_trainer.py index cd40aaba8e9bca..715e9adeb0ef93 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -1752,6 +1752,16 @@ def hp_name(trial): }, ), ] + if is_apex_available(): + import apex + + optim_test_params.append( + ( + OptimizerNames.ADAMW_APEX_FUSED, + apex.optimizers.FusedAdam, + default_adam_kwargs, + ) + ) if is_bnb_available(): import bnb From 16df6c83cb7b2df5399cb90d015c974323373816 Mon Sep 17 00:00:00 2001 From: "Manuel R. Ciosici" Date: Thu, 3 Feb 2022 06:49:25 -0500 Subject: [PATCH 03/17] Add bnb test decorator --- tests/extended/test_trainer_ext.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py index 3a65f16580fa33..3469d15230e5b4 100644 --- a/tests/extended/test_trainer_ext.py +++ b/tests/extended/test_trainer_ext.py @@ -20,7 +20,7 @@ from unittest.mock import patch from parameterized import parameterized -from transformers.file_utils import is_apex_available +from transformers.file_utils import is_apex_available, is_bnb_available from transformers.integrations import is_fairscale_available from transformers.testing_utils import ( CaptureStderr, @@ -71,6 +71,17 @@ def require_apex(test_case): return test_case +# a candidate for testing_utils +def require_bnb(test_case): + """ + Decorator for bits and bytes (bnb) dependency + """ + if not is_bnb_available(): + return unittest.skip("test requires bnb")(test_case) + else: + return test_case + + @require_torch class TestTrainerExt(TestCasePlus): def run_seq2seq_quick( From 97bd33bbd10744999a164986284009534fda9fb4 Mon Sep 17 00:00:00 2001 From: "Manuel R. 
Ciosici" Date: Thu, 3 Feb 2022 06:53:28 -0500 Subject: [PATCH 04/17] Update Adamw8bit option name --- src/transformers/trainer.py | 2 +- src/transformers/training_args.py | 2 +- tests/test_trainer.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index f32b869c2ad2cc..10f8dee78414a4 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -897,7 +897,7 @@ def get_optimizer_cls_and_kwargs(args: TrainingArguments) -> Tuple[Any, Any]: optimizer_kwargs.update(adam_kwargs) except ImportError: raise ValueError("Trainer tried to instantiate apex FusedAdam but apex is not installed!") - elif args.optim == OptimizerNames.ADAM_BNB_8BIT: + elif args.optim == OptimizerNames.ADAMW_BNB: try: from bnb.optim import Adam8bit diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 9e5a25f774ce46..cca3190ea2e6eb 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -80,7 +80,7 @@ class OptimizerNames(ExplicitEnum): ADAMW_TORCH_XLA = "adamw_torch_xla" ADAMW_APEX_FUSED = "adamw_apex_fused" ADAFACTOR = "adafactor" - ADAM_BNB_8BIT = "adam_bnb_8bit" + ADAMW_BNB = "adamw_bnb" @dataclass diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 715e9adeb0ef93..13cf12be833db8 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -1767,7 +1767,7 @@ def hp_name(trial): optim_test_params.append( ( - OptimizerNames.ADAM_BNB_8BIT, + OptimizerNames.ADAMW_BNB, bnb.optim.Adam8bit, default_adam_kwargs, ) @@ -1835,13 +1835,13 @@ def test_bnb_adam8bit(self): } with patch.dict("sys.modules", modules): self.check_optim_and_kwargs( - OptimizerNames.ADAM_BNB_8BIT, + OptimizerNames.ADAMW_BNB, default_adam_kwargs, mock.optim.Adam8bit, ) def test_bnb_adam8bit_no_bnb(self): - args = TrainingArguments(optim=OptimizerNames.ADAM_BNB_8BIT, output_dir="None") + args = TrainingArguments(optim=OptimizerNames.ADAMW_BNB, output_dir="None") # Pretend that bnb does not exist, even if installed. By setting bnb to None, importing # bnb will fail even if bnb is installed. From 22abb9c130fc4b061c87eeb3f0de642183b31d01 Mon Sep 17 00:00:00 2001 From: "Manuel R. Ciosici" Date: Fri, 11 Feb 2022 06:07:36 -0500 Subject: [PATCH 05/17] Use the full bnb package name --- src/transformers/trainer.py | 2 +- tests/test_trainer.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 10f8dee78414a4..229e39d797d018 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -899,7 +899,7 @@ def get_optimizer_cls_and_kwargs(args: TrainingArguments) -> Tuple[Any, Any]: raise ValueError("Trainer tried to instantiate apex FusedAdam but apex is not installed!") elif args.optim == OptimizerNames.ADAMW_BNB: try: - from bnb.optim import Adam8bit + from bitsandbytes.optim import Adam8bit optimizer_cls = Adam8bit optimizer_kwargs.update(adam_kwargs) diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 13cf12be833db8..16fa649797d8ed 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -1763,7 +1763,7 @@ def hp_name(trial): ) ) if is_bnb_available(): - import bnb + import bitsandbytes as bnb optim_test_params.append( ( @@ -1829,9 +1829,9 @@ def test_bnb_adam8bit(self): # the test to run without requiring a bnb installation. 
mock = Mock() modules = { - "bnb": mock, - "bnb.optim": mock.optim, - "bnb.optim.Adam8bit": mock.optim.Adam8bit, + "bitsandbytes": mock, + "bitsandbytes.optim": mock.optim, + "bitsandbytes.optim.Adam8bit": mock.optim.Adam8bit, } with patch.dict("sys.modules", modules): self.check_optim_and_kwargs( From 226b3ddf01ebf23af7bcc494a7da1986f7c16f1f Mon Sep 17 00:00:00 2001 From: "Manuel R. Ciosici" Date: Fri, 11 Feb 2022 07:26:10 -0500 Subject: [PATCH 06/17] Overide bnb for all embedding layers --- src/transformers/trainer.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 229e39d797d018..d219a39567037a 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -847,6 +847,17 @@ def create_optimizer(self): ) else: self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) + if optimizer_cls.__name__ == "Adam8bit": + import bitsandbytes + from torch.nn import Embedding + + manager = bitsandbytes.optim.GlobalOptimManager.get_instance() + + for module in self.model.modules(): + if isinstance(module, Embedding): + manager.register_module_override(module, "weight", {"optim_bits": 32}) + logger.info(f"Registering bitsandbytes override for {module}") + if is_sagemaker_mp_enabled(): self.optimizer = smp.DistributedOptimizer(self.optimizer) From 1d03d49d3bf3b8ff0d2495e89b6fb21eefb3f4be Mon Sep 17 00:00:00 2001 From: "Manuel R. Ciosici" Date: Fri, 11 Feb 2022 07:27:07 -0500 Subject: [PATCH 07/17] Fix package name --- src/transformers/file_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 8dae6140a7a5c9..b664b90be0c07e 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -476,7 +476,7 @@ def is_apex_available(): return importlib.util.find_spec("apex") is not None def is_bnb_available(): - return importlib.util.find_spec("bnb") is not None + return importlib.util.find_spec("bitsandbytes") is not None def is_faiss_available(): From 22d7112ae0a316e7441f3489dc1c8a0b382195d6 Mon Sep 17 00:00:00 2001 From: "Manuel R. 
Ciosici" Date: Fri, 11 Feb 2022 07:28:51 -0500 Subject: [PATCH 08/17] Formatting --- src/transformers/file_utils.py | 1 + src/transformers/trainer.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index b664b90be0c07e..9d036adf0a8d51 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -475,6 +475,7 @@ def is_py3nvml_available(): def is_apex_available(): return importlib.util.find_spec("apex") is not None + def is_bnb_available(): return importlib.util.find_spec("bitsandbytes") is not None diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index d219a39567037a..bbbd5083ff51d6 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -848,9 +848,10 @@ def create_optimizer(self): else: self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) if optimizer_cls.__name__ == "Adam8bit": - import bitsandbytes from torch.nn import Embedding + import bitsandbytes + manager = bitsandbytes.optim.GlobalOptimManager.get_instance() for module in self.model.modules(): @@ -858,7 +859,6 @@ def create_optimizer(self): manager.register_module_override(module, "weight", {"optim_bits": 32}) logger.info(f"Registering bitsandbytes override for {module}") - if is_sagemaker_mp_enabled(): self.optimizer = smp.DistributedOptimizer(self.optimizer) From b048a34e38b8eebb95b59a2cf10e7e327cc6e62e Mon Sep 17 00:00:00 2001 From: "Manuel R. Ciosici" Date: Fri, 11 Feb 2022 10:23:21 -0500 Subject: [PATCH 09/17] Remove unnecessary import --- src/transformers/trainer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index bbbd5083ff51d6..a81c852e6b8f17 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -848,14 +848,12 @@ def create_optimizer(self): else: self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) if optimizer_cls.__name__ == "Adam8bit": - from torch.nn import Embedding - import bitsandbytes manager = bitsandbytes.optim.GlobalOptimManager.get_instance() for module in self.model.modules(): - if isinstance(module, Embedding): + if isinstance(module, nn.Embedding): manager.register_module_override(module, "weight", {"optim_bits": 32}) logger.info(f"Registering bitsandbytes override for {module}") From 2c04486415f995a1b4f9a2ef3e02785c0e20ea8c Mon Sep 17 00:00:00 2001 From: "Manuel R. Ciosici" Date: Wed, 30 Mar 2022 13:53:23 -0700 Subject: [PATCH 10/17] Update src/transformers/trainer.py Co-authored-by: Stas Bekman --- src/transformers/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index a81c852e6b8f17..2e996e076d13db 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -855,7 +855,7 @@ def create_optimizer(self): for module in self.model.modules(): if isinstance(module, nn.Embedding): manager.register_module_override(module, "weight", {"optim_bits": 32}) - logger.info(f"Registering bitsandbytes override for {module}") + logger.debug(f"bitsandbytes: will optimize {module} in fp32") if is_sagemaker_mp_enabled(): self.optimizer = smp.DistributedOptimizer(self.optimizer) From b7a1c0c3372c7ae6b30157b754ac5cc2fb338d70 Mon Sep 17 00:00:00 2001 From: "Manuel R. 
Ciosici" Date: Wed, 30 Mar 2022 13:55:19 -0700 Subject: [PATCH 11/17] Rename AdamwBNB optimizer option --- src/transformers/training_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index cca3190ea2e6eb..7ad54d68ae253f 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -80,7 +80,7 @@ class OptimizerNames(ExplicitEnum): ADAMW_TORCH_XLA = "adamw_torch_xla" ADAMW_APEX_FUSED = "adamw_apex_fused" ADAFACTOR = "adafactor" - ADAMW_BNB = "adamw_bnb" + ADAMW_BNB = "adamw_bnb_8bit" @dataclass From 8cb259b512835661acb0caf30a29391eb2435b65 Mon Sep 17 00:00:00 2001 From: "Manuel R. Ciosici" Date: Fri, 8 Apr 2022 14:44:18 -0700 Subject: [PATCH 12/17] Add training test checking that bnb memory utilization is lower --- tests/extended/test_trainer_ext.py | 38 ++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py index 3469d15230e5b4..8687875d1a375c 100644 --- a/tests/extended/test_trainer_ext.py +++ b/tests/extended/test_trainer_ext.py @@ -17,6 +17,7 @@ import re import sys import unittest +from typing import Tuple from unittest.mock import patch from parameterized import parameterized @@ -229,6 +230,43 @@ def test_run_seq2seq_slow(self): assert "generated_predictions.txt" in contents assert "predict_results.json" in contents + @slow + @require_bnb + def test_run_seq2seq_bnb_slow(self): + from transformers.training_args import OptimizerNames + + def train_and_return_metrics(optim: str) -> Tuple[int, float]: + from pathlib import Path + + extra_args = ( + "--skip_memory_metrics 0 --optim {optim} --do_eval False --do_predict " + "False --adafactor False --log_level debug" + ) + + output_dir = self.run_trainer( + eval_steps=2, + max_len=128, + model_name=MARIAN_MODEL, + learning_rate=3e-4, + num_train_epochs=1, + distributed=False, + extra_args_str=extra_args.format(optim=optim), + do_eval=False, + do_predict=False, + ) + + # Check metrics + logs = TrainerState.load_from_json(Path(output_dir, "trainer_state.json")).log_history + gpu_peak_memory = logs[0]["train_mem_gpu_peaked_delta"] + loss = logs[0]["train_loss"] + return gpu_peak_memory, loss + + original_gpu_peak_memory, original_loss = train_and_return_metrics(OptimizerNames.ADAMW_TORCH.value) + bnb_gpu_peak_memory, bnb_loss = train_and_return_metrics(OptimizerNames.ADAMW_BNB.value) + + assert original_gpu_peak_memory < bnb_gpu_peak_memory + assert original_loss == bnb_loss + def run_trainer( self, eval_steps: int, From e8bf8d08acc88eebc7d48be6503e8247233f979b Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 14 Apr 2022 19:43:48 -0700 Subject: [PATCH 13/17] fix merge --- src/transformers/file_utils.py | 780 +-------------------------------- 1 file changed, 1 insertion(+), 779 deletions(-) diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 774f27306683c7..4b93c496ce9ea0 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -16,785 +16,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Utilities for working with the local dataset cache. Parts of this file is adapted from the AllenNLP library at -https://github.com/allenai/allennlp. 
-""" -import copy -import fnmatch -import functools -import importlib.util -import io -import json -import os -import re -import shutil -import subprocess -import sys -import tarfile -import tempfile -import types -from collections import OrderedDict, UserDict -from contextlib import ExitStack, contextmanager -from dataclasses import fields -from enum import Enum -from functools import partial, wraps -from hashlib import sha256 -from itertools import chain -from pathlib import Path -from types import ModuleType -from typing import Any, BinaryIO, ContextManager, Dict, List, Optional, Tuple, Union -from urllib.parse import urlparse -from uuid import uuid4 -from zipfile import ZipFile, is_zipfile - -import numpy as np -from packaging import version - -import requests -from filelock import FileLock -from huggingface_hub import HfFolder, Repository, create_repo, list_repo_files, whoami -from requests.exceptions import HTTPError -from transformers.utils.logging import tqdm -from transformers.utils.versions import importlib_metadata - -from . import __version__ -from .utils import logging - - -logger = logging.get_logger(__name__) # pylint: disable=invalid-name - -ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"} -ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"}) - -USE_TF = os.environ.get("USE_TF", "AUTO").upper() -USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper() -USE_JAX = os.environ.get("USE_FLAX", "AUTO").upper() - -if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES: - _torch_available = importlib.util.find_spec("torch") is not None - if _torch_available: - try: - _torch_version = importlib_metadata.version("torch") - logger.info(f"PyTorch version {_torch_version} available.") - except importlib_metadata.PackageNotFoundError: - _torch_available = False -else: - logger.info("Disabling PyTorch because USE_TF is set") - _torch_available = False - - -if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES: - _tf_available = importlib.util.find_spec("tensorflow") is not None - if _tf_available: - candidates = ( - "tensorflow", - "tensorflow-cpu", - "tensorflow-gpu", - "tf-nightly", - "tf-nightly-cpu", - "tf-nightly-gpu", - "intel-tensorflow", - "intel-tensorflow-avx512", - "tensorflow-rocm", - "tensorflow-macos", - ) - _tf_version = None - # For the metadata, we have to look for both tensorflow and tensorflow-cpu - for pkg in candidates: - try: - _tf_version = importlib_metadata.version(pkg) - break - except importlib_metadata.PackageNotFoundError: - pass - _tf_available = _tf_version is not None - if _tf_available: - if version.parse(_tf_version) < version.parse("2"): - logger.info(f"TensorFlow found but with version {_tf_version}. 
Transformers requires version 2 minimum.") - _tf_available = False - else: - logger.info(f"TensorFlow version {_tf_version} available.") -else: - logger.info("Disabling Tensorflow because USE_TORCH is set") - _tf_available = False - - -if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES: - _flax_available = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("flax") is not None - if _flax_available: - try: - _jax_version = importlib_metadata.version("jax") - _flax_version = importlib_metadata.version("flax") - logger.info(f"JAX version {_jax_version}, Flax version {_flax_version} available.") - except importlib_metadata.PackageNotFoundError: - _flax_available = False -else: - _flax_available = False - - -_datasets_available = importlib.util.find_spec("datasets") is not None -try: - # Check we're not importing a "datasets" directory somewhere but the actual library by trying to grab the version - # AND checking it has an author field in the metadata that is HuggingFace. - _ = importlib_metadata.version("datasets") - _datasets_metadata = importlib_metadata.metadata("datasets") - if _datasets_metadata.get("author", "") != "HuggingFace Inc.": - _datasets_available = False -except importlib_metadata.PackageNotFoundError: - _datasets_available = False - - -_detectron2_available = importlib.util.find_spec("detectron2") is not None -try: - _detectron2_version = importlib_metadata.version("detectron2") - logger.debug(f"Successfully imported detectron2 version {_detectron2_version}") -except importlib_metadata.PackageNotFoundError: - _detectron2_available = False - - -_faiss_available = importlib.util.find_spec("faiss") is not None -try: - _faiss_version = importlib_metadata.version("faiss") - logger.debug(f"Successfully imported faiss version {_faiss_version}") -except importlib_metadata.PackageNotFoundError: - try: - _faiss_version = importlib_metadata.version("faiss-cpu") - logger.debug(f"Successfully imported faiss version {_faiss_version}") - except importlib_metadata.PackageNotFoundError: - _faiss_available = False - - -coloredlogs = importlib.util.find_spec("coloredlogs") is not None -try: - _coloredlogs_available = importlib_metadata.version("coloredlogs") - logger.debug(f"Successfully imported sympy version {_coloredlogs_available}") -except importlib_metadata.PackageNotFoundError: - _coloredlogs_available = False - - -sympy_available = importlib.util.find_spec("sympy") is not None -try: - _sympy_available = importlib_metadata.version("sympy") - logger.debug(f"Successfully imported sympy version {_sympy_available}") -except importlib_metadata.PackageNotFoundError: - _sympy_available = False - - -_tf2onnx_available = importlib.util.find_spec("tf2onnx") is not None -try: - _tf2onnx_version = importlib_metadata.version("tf2onnx") - logger.debug(f"Successfully imported tf2onnx version {_tf2onnx_version}") -except importlib_metadata.PackageNotFoundError: - _tf2onnx_available = False - -_onnx_available = importlib.util.find_spec("onnxruntime") is not None -try: - _onxx_version = importlib_metadata.version("onnx") - logger.debug(f"Successfully imported onnx version {_onxx_version}") -except importlib_metadata.PackageNotFoundError: - _onnx_available = False - - -_scatter_available = importlib.util.find_spec("torch_scatter") is not None -try: - _scatter_version = importlib_metadata.version("torch_scatter") - logger.debug(f"Successfully imported torch-scatter version {_scatter_version}") -except importlib_metadata.PackageNotFoundError: - _scatter_available = False - - 
-_pytorch_quantization_available = importlib.util.find_spec("pytorch_quantization") is not None -try: - _pytorch_quantization_version = importlib_metadata.version("pytorch_quantization") - logger.debug(f"Successfully imported pytorch-quantization version {_pytorch_quantization_version}") -except importlib_metadata.PackageNotFoundError: - _pytorch_quantization_available = False - - -_soundfile_available = importlib.util.find_spec("soundfile") is not None -try: - _soundfile_version = importlib_metadata.version("soundfile") - logger.debug(f"Successfully imported soundfile version {_soundfile_version}") -except importlib_metadata.PackageNotFoundError: - _soundfile_available = False - - -_tensorflow_probability_available = importlib.util.find_spec("tensorflow_probability") is not None -try: - _tensorflow_probability_version = importlib_metadata.version("tensorflow_probability") - logger.debug(f"Successfully imported tensorflow-probability version {_tensorflow_probability_version}") -except importlib_metadata.PackageNotFoundError: - _tensorflow_probability_available = False - - -_timm_available = importlib.util.find_spec("timm") is not None -try: - _timm_version = importlib_metadata.version("timm") - logger.debug(f"Successfully imported timm version {_timm_version}") -except importlib_metadata.PackageNotFoundError: - _timm_available = False - - -_torchaudio_available = importlib.util.find_spec("torchaudio") is not None -try: - _torchaudio_version = importlib_metadata.version("torchaudio") - logger.debug(f"Successfully imported torchaudio version {_torchaudio_version}") -except importlib_metadata.PackageNotFoundError: - _torchaudio_available = False - - -_phonemizer_available = importlib.util.find_spec("phonemizer") is not None -try: - _phonemizer_version = importlib_metadata.version("phonemizer") - logger.debug(f"Successfully imported phonemizer version {_phonemizer_version}") -except importlib_metadata.PackageNotFoundError: - _phonemizer_available = False - - -_pyctcdecode_available = importlib.util.find_spec("pyctcdecode") is not None -try: - _pyctcdecode_version = importlib_metadata.version("pyctcdecode") - logger.debug(f"Successfully imported pyctcdecode version {_pyctcdecode_version}") -except importlib_metadata.PackageNotFoundError: - _pyctcdecode_available = False - - -_librosa_available = importlib.util.find_spec("librosa") is not None -try: - _librosa_version = importlib_metadata.version("librosa") - logger.debug(f"Successfully imported librosa version {_librosa_version}") -except importlib_metadata.PackageNotFoundError: - _librosa_available = False - - -torch_cache_home = os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch")) -old_default_cache_path = os.path.join(torch_cache_home, "transformers") -# New default cache, shared with the Datasets library -hf_cache_home = os.path.expanduser( - os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface")) -) -default_cache_path = os.path.join(hf_cache_home, "transformers") - -# Onetime move from the old location to the new one if no ENV variable has been set. -if ( - os.path.isdir(old_default_cache_path) - and not os.path.isdir(default_cache_path) - and "PYTORCH_PRETRAINED_BERT_CACHE" not in os.environ - and "PYTORCH_TRANSFORMERS_CACHE" not in os.environ - and "TRANSFORMERS_CACHE" not in os.environ -): - logger.warning( - "In Transformers v4.0.0, the default path to cache downloaded models changed from " - "'~/.cache/torch/transformers' to '~/.cache/huggingface/transformers'. 
Since you don't seem to have overridden " - "and '~/.cache/torch/transformers' is a directory that exists, we're moving it to " - "'~/.cache/huggingface/transformers' to avoid redownloading models you have already in the cache. You should " - "only see this message once." - ) - shutil.move(old_default_cache_path, default_cache_path) - -PYTORCH_PRETRAINED_BERT_CACHE = os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path) -PYTORCH_TRANSFORMERS_CACHE = os.getenv("PYTORCH_TRANSFORMERS_CACHE", PYTORCH_PRETRAINED_BERT_CACHE) -TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE", PYTORCH_TRANSFORMERS_CACHE) -HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(hf_cache_home, "modules")) -TRANSFORMERS_DYNAMIC_MODULE_NAME = "transformers_modules" -SESSION_ID = uuid4().hex -DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", False) in ENV_VARS_TRUE_VALUES - -WEIGHTS_NAME = "pytorch_model.bin" -TF2_WEIGHTS_NAME = "tf_model.h5" -TF_WEIGHTS_NAME = "model.ckpt" -FLAX_WEIGHTS_NAME = "flax_model.msgpack" -CONFIG_NAME = "config.json" -FEATURE_EXTRACTOR_NAME = "preprocessor_config.json" -MODEL_CARD_NAME = "modelcard.json" - -SENTENCEPIECE_UNDERLINE = "▁" -SPIECE_UNDERLINE = SENTENCEPIECE_UNDERLINE # Kept for backward compatibility - -MULTIPLE_CHOICE_DUMMY_INPUTS = [ - [[0, 1, 0, 1], [1, 0, 0, 1]] -] * 2 # Needs to have 0s and 1s only since XLM uses it for langs too. -DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] -DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]] - -S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert" -CLOUDFRONT_DISTRIB_PREFIX = "https://cdn.huggingface.co" - -_staging_mode = os.environ.get("HUGGINGFACE_CO_STAGING", "NO").upper() in ENV_VARS_TRUE_VALUES -_default_endpoint = "https://moon-staging.huggingface.co" if _staging_mode else "https://huggingface.co" - -HUGGINGFACE_CO_RESOLVE_ENDPOINT = os.environ.get("HUGGINGFACE_CO_RESOLVE_ENDPOINT", _default_endpoint) -HUGGINGFACE_CO_PREFIX = HUGGINGFACE_CO_RESOLVE_ENDPOINT + "/{model_id}/resolve/{revision}/{filename}" - -# This is the version of torch required to run torch.fx features and torch.onnx with dictionary inputs. -TORCH_FX_REQUIRED_VERSION = version.parse("1.9") -TORCH_ONNX_DICT_INPUTS_MINIMUM_VERSION = version.parse("1.8") - -_is_offline_mode = True if os.environ.get("TRANSFORMERS_OFFLINE", "0").upper() in ENV_VARS_TRUE_VALUES else False - - -def is_offline_mode(): - return _is_offline_mode - - -def is_torch_available(): - return _torch_available - - -def is_pyctcdecode_available(): - return _pyctcdecode_available - - -def is_librosa_available(): - return _librosa_available - - -def is_torch_cuda_available(): - if is_torch_available(): - import torch - - return torch.cuda.is_available() - else: - return False - - -def is_torch_bf16_available(): - if not is_torch_available(): - return False - - import torch - - # since currently no utility function is available we build our own. - # some bits come from https://github.com/pytorch/pytorch/blob/2289a12f21c54da93bf5d696e3f9aea83dd9c10d/torch/testing/_internal/common_cuda.py#L51 - # with additional check for torch version - # to succeed: - # 1. the hardware needs to support bf16 (arch >= Ampere) - # 2. torch >= 1.10 (1.9 should be enough for AMP API has changed in 1.10, so using 1.10 as minimal) - # 3. CUDA >= 11 - # 4. 
torch.autocast exists - # XXX: one problem here is that it may give invalid results on mixed gpus setup, so it's - # really only correct for the 0th gpu (or currently set default device if different from 0) - - if not torch.cuda.is_available() or torch.version.cuda is None: - return False - if torch.cuda.get_device_properties(torch.cuda.current_device()).major < 8: - return False - if int(torch.version.cuda.split(".")[0]) < 11: - return False - if version.parse(torch.__version__) < version.parse("1.10"): - return False - if not hasattr(torch, "autocast"): - return False - - return True - - -def is_torch_tf32_available(): - if not is_torch_available(): - return False - - import torch - - if not torch.cuda.is_available() or torch.version.cuda is None: - return False - if torch.cuda.get_device_properties(torch.cuda.current_device()).major < 8: - return False - if int(torch.version.cuda.split(".")[0]) < 11: - return False - if version.parse(torch.__version__) < version.parse("1.7"): - return False - - return True - - -_torch_fx_available = _torch_onnx_dict_inputs_support_available = False -if _torch_available: - torch_version = version.parse(importlib_metadata.version("torch")) - _torch_fx_available = (torch_version.major, torch_version.minor) == ( - TORCH_FX_REQUIRED_VERSION.major, - TORCH_FX_REQUIRED_VERSION.minor, - ) - - _torch_onnx_dict_inputs_support_available = torch_version >= TORCH_ONNX_DICT_INPUTS_MINIMUM_VERSION - - -def is_torch_fx_available(): - return _torch_fx_available - - -def is_torch_onnx_dict_inputs_support_available(): - return _torch_onnx_dict_inputs_support_available - - -def is_tf_available(): - return _tf_available - - -def is_coloredlogs_available(): - return _coloredlogs_available - - -def is_tf2onnx_available(): - return _tf2onnx_available - - -def is_onnx_available(): - return _onnx_available - - -def is_flax_available(): - return _flax_available - - -def is_torch_tpu_available(): - if not _torch_available: - return False - # This test is probably enough, but just in case, we unpack a bit. 
- if importlib.util.find_spec("torch_xla") is None: - return False - if importlib.util.find_spec("torch_xla.core") is None: - return False - return importlib.util.find_spec("torch_xla.core.xla_model") is not None - - -def is_datasets_available(): - return _datasets_available - - -def is_detectron2_available(): - return _detectron2_available - - -def is_rjieba_available(): - return importlib.util.find_spec("rjieba") is not None - - -def is_psutil_available(): - return importlib.util.find_spec("psutil") is not None - - -def is_py3nvml_available(): - return importlib.util.find_spec("py3nvml") is not None - - -def is_apex_available(): - return importlib.util.find_spec("apex") is not None - - -def is_bnb_available(): - return importlib.util.find_spec("bitsandbytes") is not None - - -def is_faiss_available(): - return _faiss_available - - -def is_scipy_available(): - return importlib.util.find_spec("scipy") is not None - - -def is_sklearn_available(): - if importlib.util.find_spec("sklearn") is None: - return False - return is_scipy_available() and importlib.util.find_spec("sklearn.metrics") - - -def is_sentencepiece_available(): - return importlib.util.find_spec("sentencepiece") is not None - - -def is_protobuf_available(): - if importlib.util.find_spec("google") is None: - return False - return importlib.util.find_spec("google.protobuf") is not None - - -def is_tokenizers_available(): - return importlib.util.find_spec("tokenizers") is not None - - -def is_vision_available(): - return importlib.util.find_spec("PIL") is not None - - -def is_pytesseract_available(): - return importlib.util.find_spec("pytesseract") is not None - - -def is_spacy_available(): - return importlib.util.find_spec("spacy") is not None - - -def is_ftfy_available(): - return importlib.util.find_spec("ftfy") is not None - - -def is_in_notebook(): - try: - # Test adapted from tqdm.autonotebook: https://github.com/tqdm/tqdm/blob/master/tqdm/autonotebook.py - get_ipython = sys.modules["IPython"].get_ipython - if "IPKernelApp" not in get_ipython().config: - raise ImportError("console") - if "VSCODE_PID" in os.environ: - raise ImportError("vscode") - - return importlib.util.find_spec("IPython") is not None - except (AttributeError, ImportError, KeyError): - return False - - -def is_scatter_available(): - return _scatter_available - - -def is_pytorch_quantization_available(): - return _pytorch_quantization_available - - -def is_tensorflow_probability_available(): - return _tensorflow_probability_available - - -def is_pandas_available(): - return importlib.util.find_spec("pandas") is not None - - -def is_sagemaker_dp_enabled(): - # Get the sagemaker specific env variable. - sagemaker_params = os.getenv("SM_FRAMEWORK_PARAMS", "{}") - try: - # Parse it and check the field "sagemaker_distributed_dataparallel_enabled". - sagemaker_params = json.loads(sagemaker_params) - if not sagemaker_params.get("sagemaker_distributed_dataparallel_enabled", False): - return False - except json.JSONDecodeError: - return False - # Lastly, check if the `smdistributed` module is present. - return importlib.util.find_spec("smdistributed") is not None - - -def is_sagemaker_mp_enabled(): - # Get the sagemaker specific mp parameters from smp_options variable. - smp_options = os.getenv("SM_HP_MP_PARAMETERS", "{}") - try: - # Parse it and check the field "partitions" is included, it is required for model parallel. 
- smp_options = json.loads(smp_options) - if "partitions" not in smp_options: - return False - except json.JSONDecodeError: - return False - - # Get the sagemaker specific framework parameters from mpi_options variable. - mpi_options = os.getenv("SM_FRAMEWORK_PARAMS", "{}") - try: - # Parse it and check the field "sagemaker_distributed_dataparallel_enabled". - mpi_options = json.loads(mpi_options) - if not mpi_options.get("sagemaker_mpi_enabled", False): - return False - except json.JSONDecodeError: - return False - # Lastly, check if the `smdistributed` module is present. - return importlib.util.find_spec("smdistributed") is not None - - -def is_training_run_on_sagemaker(): - return "SAGEMAKER_JOB_NAME" in os.environ - - -def is_soundfile_availble(): - return _soundfile_available - - -def is_timm_available(): - return _timm_available - - -def is_torchaudio_available(): - return _torchaudio_available - - -def is_speech_available(): - # For now this depends on torchaudio but the exact dependency might evolve in the future. - return _torchaudio_available - - -def is_phonemizer_available(): - return _phonemizer_available - - -def torch_only_method(fn): - def wrapper(*args, **kwargs): - if not _torch_available: - raise ImportError( - "You need to install pytorch to use this method or class, " - "or activate it with environment variables USE_TORCH=1 and USE_TF=0." - ) - else: - return fn(*args, **kwargs) - - return wrapper - - -# docstyle-ignore -DATASETS_IMPORT_ERROR = """ -{0} requires the 🤗 Datasets library but it was not found in your environment. You can install it with: -``` -pip install datasets -``` -In a notebook or a colab, you can install it by executing a cell with -``` -!pip install datasets -``` -then restarting your kernel. - -Note that if you have a local folder named `datasets` or a local python file named `datasets.py` in your current -working directory, python may try to import this instead of the 🤗 Datasets library. You should rename this folder or -that python file if that's the case. -""" - - -# docstyle-ignore -TOKENIZERS_IMPORT_ERROR = """ -{0} requires the 🤗 Tokenizers library but it was not found in your environment. You can install it with: -``` -pip install tokenizers -``` -In a notebook or a colab, you can install it by executing a cell with -``` -!pip install tokenizers -``` -""" - - -# docstyle-ignore -SENTENCEPIECE_IMPORT_ERROR = """ -{0} requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the -installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones -that match your environment. -""" - - -# docstyle-ignore -PROTOBUF_IMPORT_ERROR = """ -{0} requires the protobuf library but it was not found in your environment. Checkout the instructions on the -installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones -that match your environment. -""" - - -# docstyle-ignore -FAISS_IMPORT_ERROR = """ -{0} requires the faiss library but it was not found in your environment. Checkout the instructions on the -installation page of its repo: https://github.com/facebookresearch/faiss/blob/master/INSTALL.md and follow the ones -that match your environment. -""" - - -# docstyle-ignore -PYTORCH_IMPORT_ERROR = """ -{0} requires the PyTorch library but it was not found in your environment. 
Checkout the instructions on the -installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment. -""" - - -# docstyle-ignore -SKLEARN_IMPORT_ERROR = """ -{0} requires the scikit-learn library but it was not found in your environment. You can install it with: -``` -pip install -U scikit-learn -``` -In a notebook or a colab, you can install it by executing a cell with -``` -!pip install -U scikit-learn -``` -""" - - -# docstyle-ignore -TENSORFLOW_IMPORT_ERROR = """ -{0} requires the TensorFlow library but it was not found in your environment. Checkout the instructions on the -installation page: https://www.tensorflow.org/install and follow the ones that match your environment. -""" - - -# docstyle-ignore -DETECTRON2_IMPORT_ERROR = """ -{0} requires the detectron2 library but it was not found in your environment. Checkout the instructions on the -installation page: https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md and follow the ones -that match your environment. -""" - - -# docstyle-ignore -FLAX_IMPORT_ERROR = """ -{0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the -installation page: https://github.com/google/flax and follow the ones that match your environment. -""" - - -# docstyle-ignore -SCATTER_IMPORT_ERROR = """ -{0} requires the torch-scatter library but it was not found in your environment. You can install it with pip as -explained here: https://github.com/rusty1s/pytorch_scatter. -""" - -# docstyle-ignore -PYTORCH_QUANTIZATION_IMPORT_ERROR = """ -{0} requires the pytorch-quantization library but it was not found in your environment. You can install it with pip: -`pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com` -""" - -# docstyle-ignore -TENSORFLOW_PROBABILITY_IMPORT_ERROR = """ -{0} requires the tensorflow_probability library but it was not found in your environment. You can install it with pip as -explained here: https://github.com/tensorflow/probability. -""" - - -# docstyle-ignore -PANDAS_IMPORT_ERROR = """ -{0} requires the pandas library but it was not found in your environment. You can install it with pip as -explained here: https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html. -""" - - -# docstyle-ignore -PHONEMIZER_IMPORT_ERROR = """ -{0} requires the phonemizer library but it was not found in your environment. You can install it with pip: -`pip install phonemizer` -""" - - -# docstyle-ignore -SCIPY_IMPORT_ERROR = """ -{0} requires the scipy library but it was not found in your environment. You can install it with pip: -`pip install scipy` -""" - - -# docstyle-ignore -SPEECH_IMPORT_ERROR = """ -{0} requires the torchaudio library but it was not found in your environment. You can install it with pip: -`pip install torchaudio` -""" - -# docstyle-ignore -TIMM_IMPORT_ERROR = """ -{0} requires the timm library but it was not found in your environment. You can install it with pip: -`pip install timm` -""" - -# docstyle-ignore -VISION_IMPORT_ERROR = """ -{0} requires the PIL library but it was not found in your environment. You can install it with pip: -`pip install pillow` -""" - - -# docstyle-ignore -PYTESSERACT_IMPORT_ERROR = """ -{0} requires the PyTesseract library but it was not found in your environment. 
You can install it with pip: -`pip install pytesseract` -""" +File utilities: utilities related to download and cache models This module should not be update anymore and is only left for backward compatibility. """ From 37044b534a5a75e9d8599c98795117b3bb067f69 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 14 Apr 2022 20:25:59 -0700 Subject: [PATCH 14/17] fix merge; fix + extend new test --- src/transformers/utils/__init__.py | 1 + src/transformers/utils/import_utils.py | 4 ++ tests/extended/test_trainer_ext.py | 74 +++++++++++++++++++++----- tests/trainer/test_trainer.py | 5 +- 4 files changed, 69 insertions(+), 15 deletions(-) diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 45364fb8fd335f..9ca406b6c70ae9 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -85,6 +85,7 @@ DummyObject, _LazyModule, is_apex_available, + is_bnb_available, is_coloredlogs_available, is_datasets_available, is_detectron2_available, diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 6207d0df7ceaa6..55bc63be52ab2a 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -400,6 +400,10 @@ def is_apex_available(): return importlib.util.find_spec("apex") is not None +def is_bnb_available(): + return importlib.util.find_spec("bitsandbytes") is not None + + def is_faiss_available(): return _faiss_available diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py index 46303fec6c6847..e72a202783f12d 100644 --- a/tests/extended/test_trainer_ext.py +++ b/tests/extended/test_trainer_ext.py @@ -21,7 +21,7 @@ from unittest.mock import patch from parameterized import parameterized -from transformers.file_utils import is_apex_available, is_bnb_available +from transformers import AutoModel from transformers.integrations import is_fairscale_available from transformers.testing_utils import ( CaptureStderr, @@ -38,7 +38,7 @@ ) from transformers.trainer_callback import TrainerState from transformers.trainer_utils import set_seed -from transformers.utils import is_apex_available +from transformers.utils import is_apex_available, is_bnb_available bindir = os.path.abspath(os.path.dirname(__file__)) @@ -206,7 +206,7 @@ def test_trainer_log_level_replica(self, experiment_id): self.assertEqual(n_matches, data["n_matches"]) @slow - def test_run_seq2seq_slow(self): + def test_run_seq2seq(self): output_dir = self.run_trainer( eval_steps=2, max_len=128, @@ -233,14 +233,14 @@ def test_run_seq2seq_slow(self): @slow @require_bnb - def test_run_seq2seq_bnb_slow(self): + def test_run_seq2seq_bnb(self): from transformers.training_args import OptimizerNames def train_and_return_metrics(optim: str) -> Tuple[int, float]: from pathlib import Path extra_args = ( - "--skip_memory_metrics 0 --optim {optim} --do_eval False --do_predict " + f"--skip_memory_metrics 0 --optim {optim} --do_eval False --do_predict " "False --adafactor False --log_level debug" ) @@ -250,23 +250,69 @@ def train_and_return_metrics(optim: str) -> Tuple[int, float]: model_name=MARIAN_MODEL, learning_rate=3e-4, num_train_epochs=1, - distributed=False, - extra_args_str=extra_args.format(optim=optim), + distributed=True, # force run in a new process + extra_args_str=extra_args, do_eval=False, do_predict=False, ) # Check metrics logs = TrainerState.load_from_json(Path(output_dir, "trainer_state.json")).log_history - gpu_peak_memory = logs[0]["train_mem_gpu_peaked_delta"] + 
gpu_peak_mem = logs[0]["train_mem_gpu_peaked_delta"] + gpu_alloc_mem = logs[0]["train_mem_gpu_alloc_delta"] + loss = logs[0]["train_loss"] - return gpu_peak_memory, loss + return gpu_peak_mem, gpu_alloc_mem, loss + + gpu_peak_mem_orig, gpu_alloc_mem_orig, loss_orig = train_and_return_metrics(OptimizerNames.ADAMW_TORCH.value) + gpu_peak_mem_bnb, gpu_alloc_mem_bnb, loss_bnb = train_and_return_metrics(OptimizerNames.ADAMW_BNB.value) + + gpu_peak_mem_diff_bytes = gpu_peak_mem_orig - gpu_peak_mem_bnb + gpu_peak_mem_diff_percent = gpu_peak_mem_diff_bytes / gpu_peak_mem_bnb + + gpu_total_mem_orig = gpu_peak_mem_orig + gpu_alloc_mem_orig + gpu_total_mem_bnb = gpu_peak_mem_bnb + gpu_alloc_mem_bnb + + gpu_total_mem_diff_bytes = gpu_total_mem_orig - gpu_total_mem_bnb + gpu_total_mem_diff_percent = gpu_total_mem_diff_bytes / gpu_total_mem_bnb + + # leave this for now if CI gets very different results + # print(f"{gpu_alloc_mem_orig=:010d} {gpu_peak_mem_orig=:010d} {gpu_alloc_mem_orig+gpu_peak_mem_orig=:010d}" ) + # print(f" {gpu_alloc_mem_bnb=:010d} {gpu_peak_mem_bnb=:010d} {gpu_alloc_mem_bnb+gpu_peak_mem_bnb=:010d}") + # print(f"{gpu_peak_mem_diff_bytes=}, {gpu_peak_mem_diff_percent=}") + # print(f"{gpu_total_mem_orig=}, {gpu_total_mem_bnb=}") + # print(f"{gpu_total_mem_diff_bytes=}, {gpu_total_mem_diff_percent=}") - original_gpu_peak_memory, original_loss = train_and_return_metrics(OptimizerNames.ADAMW_TORCH.value) - bnb_gpu_peak_memory, bnb_loss = train_and_return_metrics(OptimizerNames.ADAMW_BNB.value) + self.assertGreater( + gpu_peak_mem_diff_percent, + 10, # basically a huge difference - got ~30x on my desktop + "should use very little peak gpu memory with BNB, compared to without it" + f"but got gpu_peak_mem_orig={gpu_peak_mem_orig} and gpu_peak_mem_bnb={gpu_peak_mem_bnb}", + ) + + self.assertGreater( + gpu_total_mem_diff_percent, + 0.20, # could easily be 0.50, but let's stay on the safe side + "Using BNB should use less total GPU memory than without it" + f"but got gpu_total_mem_orig={gpu_total_mem_orig} and gpu_total_mem_bnb={gpu_total_mem_bnb}", + ) - assert original_gpu_peak_memory < bnb_gpu_peak_memory - assert original_loss == bnb_loss + self.assertEqual( + loss_orig, loss_bnb, "loss should be the same, but got loss_orig={loss_orig}, loss_bnb={loss_bnb}" + ) + + # Additionally let's test that the absolute gpu memory difference is larger or about the + # same as the expected saving coming from BNB (6 bytes per param) + model = AutoModel.from_pretrained(MARIAN_MODEL) + total_numel = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) + bnb_saved_bytes = total_numel * 6 # 324MB + print(f"{bnb_saved_bytes=}") + + self.assertGreater( + gpu_total_mem_diff_bytes, + bnb_saved_bytes * 0.8, # add a safety margin, if it saved slightly less + f"BNB should have saved about {bnb_saved_bytes} bytes, but the saved bytes were {gpu_total_mem_diff_bytes}", + ) def run_trainer( self, @@ -350,6 +396,8 @@ def run_trainer( {self.examples_dir_str}/pytorch/translation/run_translation.py """.split() cmd = [sys.executable] + distributed_args + args + # keep for quick debug + # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die execute_subprocess_async(cmd, env=self.get_env()) else: testargs = ["run_translation.py"] + args diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 141587f78b562a..9f45d51d745f39 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -39,7 +39,6 @@ is_torch_available, logging, ) -from 
transformers.file_utils import WEIGHTS_NAME, is_apex_available, is_bnb_available from transformers.testing_utils import ( ENDPOINT_STAGING, PASS, @@ -66,7 +65,7 @@ ) from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR from transformers.training_args import OptimizerNames -from transformers.utils import WEIGHTS_NAME, is_apex_available +from transformers.utils import WEIGHTS_NAME, is_apex_available, is_bnb_available from transformers.utils.hp_naming import TrialShortNamer @@ -1871,6 +1870,7 @@ def hp_name(trial): }, ), ] + if is_apex_available(): import apex @@ -1881,6 +1881,7 @@ def hp_name(trial): default_adam_kwargs, ) ) + if is_bnb_available(): import bitsandbytes as bnb From 2e36019a38254b8c1410b5f47db507765976a086 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 14 Apr 2022 20:29:22 -0700 Subject: [PATCH 15/17] cleanup --- tests/extended/test_trainer_ext.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py index e72a202783f12d..ba7168e2cd5891 100644 --- a/tests/extended/test_trainer_ext.py +++ b/tests/extended/test_trainer_ext.py @@ -306,7 +306,6 @@ def train_and_return_metrics(optim: str) -> Tuple[int, float]: model = AutoModel.from_pretrained(MARIAN_MODEL) total_numel = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) bnb_saved_bytes = total_numel * 6 # 324MB - print(f"{bnb_saved_bytes=}") self.assertGreater( gpu_total_mem_diff_bytes, From dbdaf25cf75270ef4c6b77f51488fbac0b7e59b7 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 15 Apr 2022 08:16:07 -0700 Subject: [PATCH 16/17] expand bnb --- src/transformers/utils/__init__.py | 2 +- src/transformers/utils/import_utils.py | 2 +- tests/extended/test_trainer_ext.py | 8 ++++---- tests/trainer/test_trainer.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 9ca406b6c70ae9..6101a924f969a0 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -85,7 +85,7 @@ DummyObject, _LazyModule, is_apex_available, - is_bnb_available, + is_bitsandbytes_available, is_coloredlogs_available, is_datasets_available, is_detectron2_available, diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 55bc63be52ab2a..505ba94e0b193c 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -400,7 +400,7 @@ def is_apex_available(): return importlib.util.find_spec("apex") is not None -def is_bnb_available(): +def is_bitsandbytes_available(): return importlib.util.find_spec("bitsandbytes") is not None diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py index ba7168e2cd5891..d3d112adb00e52 100644 --- a/tests/extended/test_trainer_ext.py +++ b/tests/extended/test_trainer_ext.py @@ -38,7 +38,7 @@ ) from transformers.trainer_callback import TrainerState from transformers.trainer_utils import set_seed -from transformers.utils import is_apex_available, is_bnb_available +from transformers.utils import is_apex_available, is_bitsandbytes_available bindir = os.path.abspath(os.path.dirname(__file__)) @@ -74,11 +74,11 @@ def require_apex(test_case): # a candidate for testing_utils -def require_bnb(test_case): +def require_bitsandbytes(test_case): """ Decorator for bits and bytes (bnb) dependency """ - if not is_bnb_available(): + if not is_bitsandbytes_available(): return unittest.skip("test requires bnb")(test_case) 
else: return test_case @@ -232,7 +232,7 @@ def test_run_seq2seq(self): assert "predict_results.json" in contents @slow - @require_bnb + @require_bitsandbytes def test_run_seq2seq_bnb(self): from transformers.training_args import OptimizerNames diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 9f45d51d745f39..5fee6d8e3f7be4 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -65,7 +65,7 @@ ) from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR from transformers.training_args import OptimizerNames -from transformers.utils import WEIGHTS_NAME, is_apex_available, is_bnb_available +from transformers.utils import WEIGHTS_NAME, is_apex_available, is_bitsandbytes_available from transformers.utils.hp_naming import TrialShortNamer @@ -1882,7 +1882,7 @@ def hp_name(trial): ) ) - if is_bnb_available(): + if is_bitsandbytes_available(): import bitsandbytes as bnb optim_test_params.append( From ce2c550497b690e4038498817aa1fce011edfd7c Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Tue, 19 Apr 2022 10:46:06 -0700 Subject: [PATCH 17/17] move all require_* candidates to testing_utils.py --- src/transformers/testing_utils.py | 40 +++++++++++++++++++++++++++++- tests/extended/test_trainer_ext.py | 38 +++------------------------- 2 files changed, 42 insertions(+), 36 deletions(-) diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index b60c7942097a14..36f56d2eeb29c6 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -31,8 +31,16 @@ from transformers import logging as transformers_logging from .deepspeed import is_deepspeed_available -from .integrations import is_optuna_available, is_ray_available, is_sigopt_available, is_wandb_available +from .integrations import ( + is_fairscale_available, + is_optuna_available, + is_ray_available, + is_sigopt_available, + is_wandb_available, +) from .utils import ( + is_apex_available, + is_bitsandbytes_available, is_detectron2_available, is_faiss_available, is_flax_available, @@ -638,6 +646,36 @@ def require_deepspeed(test_case): return test_case +def require_fairscale(test_case): + """ + Decorator marking a test that requires fairscale + """ + if not is_fairscale_available(): + return unittest.skip("test requires fairscale")(test_case) + else: + return test_case + + +def require_apex(test_case): + """ + Decorator marking a test that requires apex + """ + if not is_apex_available(): + return unittest.skip("test requires apex")(test_case) + else: + return test_case + + +def require_bitsandbytes(test_case): + """ + Decorator for bits and bytes (bnb) dependency + """ + if not is_bitsandbytes_available(): + return unittest.skip("test requires bnb")(test_case) + else: + return test_case + + def require_phonemizer(test_case): """ Decorator marking a test that requires phonemizer diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py index d3d112adb00e52..af8c5d4dd785de 100644 --- a/tests/extended/test_trainer_ext.py +++ b/tests/extended/test_trainer_ext.py @@ -22,7 +22,6 @@ from parameterized import parameterized from transformers import AutoModel -from transformers.integrations import is_fairscale_available from transformers.testing_utils import ( CaptureStderr, ExtendSysPath, @@ -30,6 +29,9 @@ execute_subprocess_async, get_gpu_count, get_torch_dist_unique_port, + require_apex, + require_bitsandbytes, + require_fairscale, require_torch, require_torch_gpu, require_torch_multi_gpu, @@ -38,7 +40,6 @@ ) from 
transformers.trainer_callback import TrainerState from transformers.trainer_utils import set_seed -from transformers.utils import is_apex_available, is_bitsandbytes_available bindir = os.path.abspath(os.path.dirname(__file__)) @@ -51,39 +52,6 @@ MBART_TINY = "sshleifer/tiny-mbart" -# a candidate for testing_utils -def require_fairscale(test_case): - """ - Decorator marking a test that requires fairscale - """ - if not is_fairscale_available(): - return unittest.skip("test requires fairscale")(test_case) - else: - return test_case - - -# a candidate for testing_utils -def require_apex(test_case): - """ - Decorator marking a test that requires apex - """ - if not is_apex_available(): - return unittest.skip("test requires apex")(test_case) - else: - return test_case - - -# a candidate for testing_utils -def require_bitsandbytes(test_case): - """ - Decorator for bits and bytes (bnb) dependency - """ - if not is_bitsandbytes_available(): - return unittest.skip("test requires bnb")(test_case) - else: - return test_case - - @require_torch class TestTrainerExt(TestCasePlus): def run_seq2seq_quick(
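
A minimal usage sketch of the feature this patch series adds, assuming the patches above are applied and the bitsandbytes package is installed; the output_dir value is arbitrary and model/dataset setup is omitted. The imports, the OptimizerNames.ADAMW_BNB member, and Trainer.get_optimizer_cls_and_kwargs all come from the patches themselves.

    from transformers import Trainer, TrainingArguments
    from transformers.training_args import OptimizerNames

    # OptimizerNames.ADAMW_BNB ends up with the string value "adamw_bnb_8bit"
    # (patch 11), so the same optimizer can also be selected on the command
    # line with --optim adamw_bnb_8bit.
    args = TrainingArguments(output_dir="out", optim=OptimizerNames.ADAMW_BNB)

    # Resolves to bitsandbytes.optim.Adam8bit plus the default Adam kwargs when
    # bitsandbytes is installed, and raises ValueError otherwise (patches 01 and 05).
    optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(args)
    print(optimizer_cls, optimizer_kwargs)

When Trainer.create_optimizer() later instantiates this class, it additionally registers 32-bit overrides for nn.Embedding weights through bitsandbytes' GlobalOptimManager, as introduced in patches 06 through 10.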