From cf5522416e949147bbd0f5e51a8a128bc450d41e Mon Sep 17 00:00:00 2001 From: Dhruv Date: Sat, 6 Aug 2022 21:35:50 +0530 Subject: [PATCH 01/19] onnx config for clip --- src/transformers/models/clip/__init__.py | 4 +- .../models/clip/configuration_clip.py | 39 ++++++++++++++++++- src/transformers/onnx/features.py | 4 ++ tests/onnx/test_onnx_v2.py | 1 + 4 files changed, 45 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/clip/__init__.py b/src/transformers/models/clip/__init__.py index 6a6e64c995d385..6a1bdaca96c5da 100644 --- a/src/transformers/models/clip/__init__.py +++ b/src/transformers/models/clip/__init__.py @@ -29,7 +29,7 @@ _import_structure = { - "configuration_clip": ["CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", "CLIPConfig", "CLIPTextConfig", "CLIPVisionConfig"], + "configuration_clip": ["CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", "CLIPConfig", "CLIPTextConfig", "CLIPVisionConfig", "CLIPOnnxConfig"], "tokenization_clip": ["CLIPTokenizer"], } @@ -95,7 +95,7 @@ if TYPE_CHECKING: - from .configuration_clip import CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, CLIPConfig, CLIPTextConfig, CLIPVisionConfig + from .configuration_clip import CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, CLIPConfig, CLIPTextConfig, CLIPVisionConfig, CLIPOnnxConfig from .tokenization_clip import CLIPTokenizer try: diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index 3bb22b74a4c77c..d7b4ec43273053 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -16,8 +16,15 @@ import copy import os -from typing import Union +from collections import OrderedDict +from typing import Any, Mapping, Union, Optional + +from transformers import TensorType +from transformers.processing_utils import ProcessorMixin + +from ... 
import is_torch_available +from ...onnx import OnnxConfig from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -317,3 +324,33 @@ def to_dict(self): output["vision_config"] = self.vision_config.to_dict() output["model_type"] = self.__class__.model_type return output + + +class CLIPOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("input_ids", {0: "batch", 1: "sequence"}), + ("attention_mask", {0: "batch", 1: "sequence"}), + ("pixel_values", {0: "batch"}) + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-4 + + def generate_dummy_inputs( + self, + processor: ProcessorMixin, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + + text_input_dict = super().generate_dummy_inputs( + processor.tokenizer, framework=framework + ) + image_input_dict = super().generate_dummy_inputs( + processor.feature_extractor, framework=framework + ) + return {**text_input_dict, **image_input_dict} \ No newline at end of file diff --git a/src/transformers/onnx/features.py b/src/transformers/onnx/features.py index e7c24a8ad97a81..44b533c0241af4 100644 --- a/src/transformers/onnx/features.py +++ b/src/transformers/onnx/features.py @@ -201,6 +201,10 @@ class FeaturesManager: "question-answering", onnx_config_cls="models.camembert.CamembertOnnxConfig", ), + "clip": supported_features_mapping( + "default", + onnx_config_cls="models.clip.CLIPOnnxConfig", + ), "codegen": supported_features_mapping( "default", "causal-lm", diff --git a/tests/onnx/test_onnx_v2.py b/tests/onnx/test_onnx_v2.py index c15910734f3adb..5b0469803d5e9c 100644 --- a/tests/onnx/test_onnx_v2.py +++ b/tests/onnx/test_onnx_v2.py @@ -185,6 +185,7 @@ def test_values_override(self): ("big-bird", "google/bigbird-roberta-base"), ("ibert", "kssteven/ibert-roberta-base"), ("camembert", "camembert-base"), + ("clip", "openai/clip-vit-base-patch32"), ("convbert", "YituTech/conv-bert-base"), ("codegen", "Salesforce/codegen-350M-multi"), ("deberta", "microsoft/deberta-base"), From 8418aee81232b1a713187cd3e757a0f5b3525b72 Mon Sep 17 00:00:00 2001 From: Dhruv Date: Sat, 6 Aug 2022 21:46:58 +0530 Subject: [PATCH 02/19] default opset as 14 --- src/transformers/models/clip/configuration_clip.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index d7b4ec43273053..19fd4978461999 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -353,4 +353,8 @@ def generate_dummy_inputs( image_input_dict = super().generate_dummy_inputs( processor.feature_extractor, framework=framework ) - return {**text_input_dict, **image_input_dict} \ No newline at end of file + return {**text_input_dict, **image_input_dict} + + @property + def default_onnx_opset(self) -> int: + return 14 \ No newline at end of file From eae965e072219e6672e11823b0554e085a372580 Mon Sep 17 00:00:00 2001 From: Dhruv Date: Mon, 8 Aug 2022 14:09:52 +0530 Subject: [PATCH 03/19] changes from the original repo --- src/transformers/models/clip/modeling_clip.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index ddc2236371c29a..dce83faba7fabf 100755 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -630,6 +630,7 @@ 
def forward( if input_ids is None: raise ValueError("You have to specify either input_ids") + input_ids = input_ids.to(torch.int) # for onnx compatibility, since onnx doesn't support int64 input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) @@ -1044,8 +1045,8 @@ def forward( text_embeds = self.text_projection(text_embeds) # normalized features - image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) - text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) + image_embeds = image_embeds / image_embeds.norm(p=2, dim=1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, dim=1, keepdim=True) # cosine similarity as logits logit_scale = self.logit_scale.exp() From 118ca35874d479ddcc7909a88d584e8d09d1014c Mon Sep 17 00:00:00 2001 From: Dhruv Date: Mon, 8 Aug 2022 22:14:40 +0530 Subject: [PATCH 04/19] input values order fix --- src/transformers/models/clip/configuration_clip.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index 19fd4978461999..5e8ca7576ec2b4 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -332,8 +332,8 @@ def inputs(self) -> Mapping[str, Mapping[int, str]]: return OrderedDict( [ ("input_ids", {0: "batch", 1: "sequence"}), - ("attention_mask", {0: "batch", 1: "sequence"}), - ("pixel_values", {0: "batch"}) + ("pixel_values", {0: "batch"}), + ("attention_mask", {0: "batch", 1: "sequence"}) ] ) From 4a22e429813710c4aae5d7832e5ca7266b26b04a Mon Sep 17 00:00:00 2001 From: Dhruv Date: Mon, 8 Aug 2022 23:00:24 +0530 Subject: [PATCH 05/19] outputs fix --- src/transformers/models/clip/configuration_clip.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index 5e8ca7576ec2b4..9ef128a29e2b8c 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -336,6 +336,17 @@ def inputs(self) -> Mapping[str, Mapping[int, str]]: ("attention_mask", {0: "batch", 1: "sequence"}) ] ) + + @property + def outputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("logits_per_image", {0: "batch"}), + ("logits_per_text", {0: "batch"}), + ("text_embeds", {0: "batch"}), + ("image_embeds", {0: "batch"}) + ] + ) @property def atol_for_validation(self) -> float: From b10822435b95b93d91898db5b6d4c083ba7ebae3 Mon Sep 17 00:00:00 2001 From: Dhruv Date: Mon, 8 Aug 2022 23:06:56 +0530 Subject: [PATCH 06/19] remove unused import --- src/transformers/models/clip/configuration_clip.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index 9ef128a29e2b8c..ceaee3d87ff5aa 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -23,7 +23,6 @@ from transformers import TensorType from transformers.processing_utils import ProcessorMixin -from ... 
import is_torch_available from ...onnx import OnnxConfig from ...configuration_utils import PretrainedConfig from ...utils import logging From 3a8d870ba49a6ee94f6c56fdf2b0799b44bafef9 Mon Sep 17 00:00:00 2001 From: Dhruv Date: Mon, 8 Aug 2022 23:28:43 +0530 Subject: [PATCH 07/19] ran make fix-copies --- docs/source/en/serialization.mdx | 1 + .../models/big_bird/modeling_flax_big_bird.py | 1 - .../models/bigbird_pegasus/modeling_bigbird_pegasus.py | 2 -- .../models/data2vec/modeling_data2vec_audio.py | 1 - .../models/data2vec/modeling_data2vec_text.py | 1 - .../models/data2vec/modeling_data2vec_vision.py | 1 - .../models/deberta_v2/modeling_tf_deberta_v2.py | 2 -- src/transformers/models/deit/modeling_deit.py | 2 -- src/transformers/models/dpt/modeling_dpt.py | 2 -- src/transformers/models/electra/modeling_electra.py | 1 - src/transformers/models/groupvit/modeling_groupvit.py | 1 + src/transformers/models/hubert/modeling_hubert.py | 1 - src/transformers/models/hubert/modeling_tf_hubert.py | 9 --------- src/transformers/models/layoutlm/modeling_layoutlm.py | 1 - .../models/layoutlmv3/tokenization_layoutlmv3_fast.py | 1 - src/transformers/models/longt5/modeling_longt5.py | 1 - .../models/mobilebert/modeling_mobilebert.py | 2 -- src/transformers/models/nezha/modeling_nezha.py | 1 - src/transformers/models/plbart/modeling_plbart.py | 2 -- src/transformers/models/realm/modeling_realm.py | 1 - src/transformers/models/roberta/modeling_roberta.py | 1 - src/transformers/models/sew/modeling_sew.py | 1 - src/transformers/models/sew_d/modeling_sew_d.py | 2 -- src/transformers/models/splinter/modeling_splinter.py | 1 - src/transformers/models/unispeech/modeling_unispeech.py | 1 - .../models/unispeech_sat/modeling_unispeech_sat.py | 1 - src/transformers/models/videomae/modeling_videomae.py | 2 -- src/transformers/models/vilt/modeling_vilt.py | 2 -- src/transformers/models/vit_mae/modeling_vit_mae.py | 2 -- .../wav2vec2_conformer/modeling_wav2vec2_conformer.py | 1 - src/transformers/models/wavlm/modeling_wavlm.py | 1 - src/transformers/models/yolos/modeling_yolos.py | 2 -- 32 files changed, 2 insertions(+), 49 deletions(-) diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx index e41ccae949e8bb..bb6a2d2a610b0e 100644 --- a/docs/source/en/serialization.mdx +++ b/docs/source/en/serialization.mdx @@ -55,6 +55,7 @@ Ready-made configurations include the following architectures: - BlenderbotSmall - BLOOM - CamemBERT +- CLIP - CodeGen - ConvBERT - ConvNeXT diff --git a/src/transformers/models/big_bird/modeling_flax_big_bird.py b/src/transformers/models/big_bird/modeling_flax_big_bird.py index 2e3192ff0eeb02..0e19870172e4cf 100644 --- a/src/transformers/models/big_bird/modeling_flax_big_bird.py +++ b/src/transformers/models/big_bird/modeling_flax_big_bird.py @@ -1862,7 +1862,6 @@ def __call__( output_hidden_states: bool = False, return_dict: bool = True, ): - # Model outputs = self.bert( input_ids, diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index ce5040e92c7fa3..56253f802a8d0d 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -365,7 +365,6 @@ def bigbird_block_sparse_attention( plan_num_rand_blocks, output_attentions, ): - # BigBirdPegasus block-sparse attention as suggested in paper # ITC: @@ -2398,7 +2397,6 @@ def forward( output_hidden_states: Optional[bool] = None, 
return_dict: Optional[bool] = None, ) -> Union[Tuple, Seq2SeqModelOutput]: - # different to other models, BigBirdPegasus automatically creates decoder_input_ids from # input_ids if no decoder_input_ids are provided if decoder_input_ids is None and decoder_inputs_embeds is None: diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 70d802a80154b8..9fe378aee88f18 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -1059,7 +1059,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 8a7d6308bf5744..cbc6853a8ba49a 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -503,7 +503,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/data2vec/modeling_data2vec_vision.py b/src/transformers/models/data2vec/modeling_data2vec_vision.py index e63ee0d32cf187..3764061c1ce451 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_data2vec_vision.py @@ -151,7 +151,6 @@ def __init__(self, config: Data2VecVisionConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None) -> torch.Tensor: - embeddings = self.patch_embeddings(pixel_values) batch_size, seq_len, _ = embeddings.size() diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py index aabb3b2d380ea1..ae6ec55117fd91 100644 --- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py @@ -94,7 +94,6 @@ def __init__(self, axis=-1, **kwargs): self.axis = axis def call(self, inputs: tf.Tensor, mask: tf.Tensor): - rmask = tf.logical_not(tf.cast(mask, tf.bool)) output = tf.where(rmask, float("-inf"), inputs) output = stable_softmax(output, self.axis) @@ -1021,7 +1020,6 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: - if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py index 8f8307499fa479..b4c5a0e97e8ee6 100644 --- a/src/transformers/models/deit/modeling_deit.py +++ b/src/transformers/models/deit/modeling_deit.py @@ -205,7 +205,6 @@ def __init__(self, config: DeiTConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -263,7 +262,6 @@ def __init__(self, config: DeiTConfig) -> 
None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index 7dfa244805ff77..16a64756b91050 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -225,7 +225,6 @@ def __init__(self, config: DPTConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -284,7 +283,6 @@ def __init__(self, config: DPTConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index c215256b3e5f4e..c6b426ff6d708c 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -564,7 +564,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py index 1073d4bfea8708..16644c80e3e7e2 100644 --- a/src/transformers/models/groupvit/modeling_groupvit.py +++ b/src/transformers/models/groupvit/modeling_groupvit.py @@ -1102,6 +1102,7 @@ def forward( if input_ids is None: raise ValueError("You have to specify either input_ids") + input_ids = input_ids.to(torch.int) # for onnx compatibility, since onnx doesn't support int64 input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index d6cb6b8e059920..cb7918b9fec90e 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -1174,7 +1174,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index f078b5d0cfc7dd..f471904c342e28 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -314,7 +314,6 @@ def __init__( self._check_axis() def build(self, input_shape): - self._check_if_input_shape_is_none(input_shape) self._set_number_of_groups_for_instance_norm(input_shape) self._check_size_of_dimensions(input_shape) @@ -326,7 +325,6 @@ def build(self, input_shape): super().build(input_shape) def call(self, inputs): - input_shape = tf.keras.backend.int_shape(inputs) tensor_input_shape = tf.shape(inputs) @@ -363,7 +361,6 @@ def compute_output_shape(self, input_shape): return input_shape def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape): - group_shape = [tensor_input_shape[i] 
for i in range(len(input_shape))] is_instance_norm = (input_shape[self.axis] // self.groups) == 1 if not is_instance_norm: @@ -376,7 +373,6 @@ def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape): return inputs, group_shape def _apply_normalization(self, reshaped_inputs, input_shape): - group_shape = tf.keras.backend.int_shape(reshaped_inputs) group_reduction_axes = list(range(1, len(group_shape))) is_instance_norm = (input_shape[self.axis] // self.groups) == 1 @@ -428,7 +424,6 @@ def _set_number_of_groups_for_instance_norm(self, input_shape): self.groups = dim def _check_size_of_dimensions(self, input_shape): - dim = input_shape[self.axis] if dim < self.groups: raise ValueError( @@ -449,19 +444,16 @@ def _check_size_of_dimensions(self, input_shape): ) def _check_axis(self): - if self.axis == 0: raise ValueError( "You are trying to normalize your batch axis. Do you want to use tf.layer.batch_normalization instead" ) def _create_input_spec(self, input_shape): - dim = input_shape[self.axis] self.input_spec = tf.keras.layers.InputSpec(ndim=len(input_shape), axes={self.axis: dim}) def _add_gamma_weight(self, input_shape): - dim = input_shape[self.axis] shape = (dim,) @@ -477,7 +469,6 @@ def _add_gamma_weight(self, input_shape): self.gamma = None def _add_beta_weight(self, input_shape): - dim = input_shape[self.axis] shape = (dim,) diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index e3a625416a7d1b..0fd07a7a20b381 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -471,7 +471,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py index be5f938dbf17ce..d18dcbd4d5c6a8 100644 --- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py +++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py @@ -534,7 +534,6 @@ def _batch_encode_plus( return_length: bool = False, verbose: bool = True, ) -> BatchEncoding: - if not isinstance(batch_text_or_text_pairs, list): raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})") diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py index abd1cb778655a6..862d0f32e64b3e 100644 --- a/src/transformers/models/longt5/modeling_longt5.py +++ b/src/transformers/models/longt5/modeling_longt5.py @@ -232,7 +232,6 @@ def __init__(self, hidden_size, eps=1e-6): self.variance_epsilon = eps def forward(self, hidden_states): - # LongT5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated # w/o mean and there is no bias. 
Additionally we want to make sure that the accumulation for diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index 6bc306a6e05eb3..1c29ec9e3adac6 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -1342,7 +1342,6 @@ def forward( ) # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering with Bert->MobileBert all-casing class MobileBertForQuestionAnswering(MobileBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): @@ -1548,7 +1547,6 @@ def forward( ) # Copied from transformers.models.bert.modeling_bert.BertForTokenClassification with Bert->MobileBert all-casing class MobileBertForTokenClassification(MobileBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] def __init__(self, config): diff --git a/src/transformers/models/nezha/modeling_nezha.py b/src/transformers/models/nezha/modeling_nezha.py index 4fa38b3ed48f09..06a2dbbea83eef 100644 --- a/src/transformers/models/nezha/modeling_nezha.py +++ b/src/transformers/models/nezha/modeling_nezha.py @@ -579,7 +579,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/plbart/modeling_plbart.py b/src/transformers/models/plbart/modeling_plbart.py index eb8b5d2b41671c..6baf1512b3d003 100755 --- a/src/transformers/models/plbart/modeling_plbart.py +++ b/src/transformers/models/plbart/modeling_plbart.py @@ -1041,7 +1041,6 @@ def forward( past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." @@ -1066,7 +1065,6 @@ def custom_forward(*inputs): None, ) else: - layer_outputs = decoder_layer( hidden_states, attention_mask=attention_mask, diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 6ee2b1fd14b402..95013ffc39ef97 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -579,7 +579,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index 46add0be500195..7c4d4193662609 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -503,7 +503,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 632f7d4880f198..2652d3581e057b 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -1054,7 +1054,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index a9a231aec1d8e6..5cdbdfc0c2129a 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -1118,7 +1118,6 @@ def forward( rel_embeddings = self.get_rel_embedding() output_states = next_kv for i, layer_module in enumerate(self.layer): - if output_hidden_states: all_hidden_states = all_hidden_states + (output_states,) @@ -1582,7 +1581,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py index 1f94f6f9ad273d..9324f1b3bfd311 100755 --- a/src/transformers/models/splinter/modeling_splinter.py +++ b/src/transformers/models/splinter/modeling_splinter.py @@ -444,7 +444,6 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: - if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index dc194318e9992d..9fed0a0898e946 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -1424,7 +1424,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index 926464d3bf8e8b..47a22a46928b6c 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -1428,7 +1428,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index a807ed7208fccb..133812d2324d2e 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -286,7 +286,6 @@ def __init__(self, config: VideoMAEConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -344,7 +343,6 @@ def __init__(self, config: VideoMAEConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states 
= self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/vilt/modeling_vilt.py b/src/transformers/models/vilt/modeling_vilt.py index 308358850c9808..ebdd3882ab0acd 100755 --- a/src/transformers/models/vilt/modeling_vilt.py +++ b/src/transformers/models/vilt/modeling_vilt.py @@ -400,7 +400,6 @@ def __init__(self, config: ViltConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -452,7 +451,6 @@ def __init__(self, config: ViltConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/vit_mae/modeling_vit_mae.py b/src/transformers/models/vit_mae/modeling_vit_mae.py index 0667bdd73c5545..30bd2abc25c637 100755 --- a/src/transformers/models/vit_mae/modeling_vit_mae.py +++ b/src/transformers/models/vit_mae/modeling_vit_mae.py @@ -388,7 +388,6 @@ def __init__(self, config: ViTMAEConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -446,7 +445,6 @@ def __init__(self, config: ViTMAEConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py index 4c4962b155c35c..fe279f68ffd2b8 100644 --- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py @@ -1670,7 +1670,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py index c792a368cb4790..0a05aa5560e41a 100755 --- a/src/transformers/models/wavlm/modeling_wavlm.py +++ b/src/transformers/models/wavlm/modeling_wavlm.py @@ -1361,7 +1361,6 @@ def forward( loss = None if labels is not None: - if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/yolos/modeling_yolos.py b/src/transformers/models/yolos/modeling_yolos.py index 447cec23de97fe..bb7384b9e5d030 100755 --- a/src/transformers/models/yolos/modeling_yolos.py +++ b/src/transformers/models/yolos/modeling_yolos.py @@ -323,7 +323,6 @@ def __init__(self, config: YolosConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -381,7 +380,6 @@ def __init__(self, config: YolosConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.dense(hidden_states) 
hidden_states = self.intermediate_act_fn(hidden_states) From bfed078393121bcfab1649e4a08f7f5982d7a883 Mon Sep 17 00:00:00 2001 From: Dhruv Date: Mon, 8 Aug 2022 23:36:36 +0530 Subject: [PATCH 08/19] black format --- src/transformers/models/clip/__init__.py | 16 ++++++++++++-- .../models/clip/configuration_clip.py | 22 ++++++++----------- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/clip/__init__.py b/src/transformers/models/clip/__init__.py index 6a1bdaca96c5da..c22e1b8df97a01 100644 --- a/src/transformers/models/clip/__init__.py +++ b/src/transformers/models/clip/__init__.py @@ -29,7 +29,13 @@ _import_structure = { - "configuration_clip": ["CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", "CLIPConfig", "CLIPTextConfig", "CLIPVisionConfig", "CLIPOnnxConfig"], + "configuration_clip": [ + "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", + "CLIPConfig", + "CLIPTextConfig", + "CLIPVisionConfig", + "CLIPOnnxConfig", + ], "tokenization_clip": ["CLIPTokenizer"], } @@ -95,7 +101,13 @@ if TYPE_CHECKING: - from .configuration_clip import CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, CLIPConfig, CLIPTextConfig, CLIPVisionConfig, CLIPOnnxConfig + from .configuration_clip import ( + CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, + CLIPConfig, + CLIPTextConfig, + CLIPVisionConfig, + CLIPOnnxConfig, + ) from .tokenization_clip import CLIPTokenizer try: diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index ceaee3d87ff5aa..0960e154c7099a 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -324,18 +324,18 @@ def to_dict(self): output["model_type"] = self.__class__.model_type return output - + class CLIPOnnxConfig(OnnxConfig): @property def inputs(self) -> Mapping[str, Mapping[int, str]]: return OrderedDict( [ - ("input_ids", {0: "batch", 1: "sequence"}), + ("input_ids", {0: "batch", 1: "sequence"}), ("pixel_values", {0: "batch"}), - ("attention_mask", {0: "batch", 1: "sequence"}) + ("attention_mask", {0: "batch", 1: "sequence"}), ] ) - + @property def outputs(self) -> Mapping[str, Mapping[int, str]]: return OrderedDict( @@ -343,7 +343,7 @@ def outputs(self) -> Mapping[str, Mapping[int, str]]: ("logits_per_image", {0: "batch"}), ("logits_per_text", {0: "batch"}), ("text_embeds", {0: "batch"}), - ("image_embeds", {0: "batch"}) + ("image_embeds", {0: "batch"}), ] ) @@ -357,14 +357,10 @@ def generate_dummy_inputs( framework: Optional[TensorType] = None, ) -> Mapping[str, Any]: - text_input_dict = super().generate_dummy_inputs( - processor.tokenizer, framework=framework - ) - image_input_dict = super().generate_dummy_inputs( - processor.feature_extractor, framework=framework - ) + text_input_dict = super().generate_dummy_inputs(processor.tokenizer, framework=framework) + image_input_dict = super().generate_dummy_inputs(processor.feature_extractor, framework=framework) return {**text_input_dict, **image_input_dict} - + @property def default_onnx_opset(self) -> int: - return 14 \ No newline at end of file + return 14 From 2306fbc387c174e2f1dc5d98130fa9e41c9fae4a Mon Sep 17 00:00:00 2001 From: Dhruv Date: Tue, 9 Aug 2022 02:00:34 +0530 Subject: [PATCH 09/19] review comments: forward ref, import fix, model change revert, .to cleanup --- src/transformers/models/clip/configuration_clip.py | 11 ++++++----- src/transformers/models/clip/modeling_clip.py | 7 +++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/clip/configuration_clip.py 
b/src/transformers/models/clip/configuration_clip.py index 0960e154c7099a..14ce960c289901 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -18,10 +18,11 @@ import os from collections import OrderedDict -from typing import Any, Mapping, Union, Optional +from typing import TYPE_CHECKING, Any, Mapping, Union, Optional -from transformers import TensorType -from transformers.processing_utils import ProcessorMixin +if TYPE_CHECKING: + from ...processing_utils import ProcessorMixin + from ...utils import TensorType from ...onnx import OnnxConfig from ...configuration_utils import PretrainedConfig @@ -353,8 +354,8 @@ def atol_for_validation(self) -> float: def generate_dummy_inputs( self, - processor: ProcessorMixin, - framework: Optional[TensorType] = None, + processor: "ProcessorMixin", + framework: Optional["TensorType"] = None, ) -> Mapping[str, Any]: text_input_dict = super().generate_dummy_inputs(processor.tokenizer, framework=framework) diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index dce83faba7fabf..aa8bf788f55b1a 100755 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -630,7 +630,6 @@ def forward( if input_ids is None: raise ValueError("You have to specify either input_ids") - input_ids = input_ids.to(torch.int) # for onnx compatibility, since onnx doesn't support int64 input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) @@ -661,7 +660,7 @@ def forward( # text_embeds.shape = [batch_size, sequence_length, transformer.width] # take features from the eot embedding (eot_token is the highest number in each sequence) - pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)] + pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.to(torch.int).argmax(dim=-1)] if not return_dict: return (last_hidden_state, pooled_output) + encoder_outputs[1:] @@ -1045,8 +1044,8 @@ def forward( text_embeds = self.text_projection(text_embeds) # normalized features - image_embeds = image_embeds / image_embeds.norm(p=2, dim=1, keepdim=True) - text_embeds = text_embeds / text_embeds.norm(p=2, dim=1, keepdim=True) + image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) # cosine similarity as logits logit_scale = self.logit_scale.exp() From 19e04236b79a8a2430c702eae0d06fe8ee2c34c0 Mon Sep 17 00:00:00 2001 From: Dhruv Date: Tue, 9 Aug 2022 02:17:09 +0530 Subject: [PATCH 10/19] make style --- src/transformers/models/clip/__init__.py | 4 ++-- src/transformers/models/clip/configuration_clip.py | 4 ++-- src/transformers/models/clip/modeling_clip.py | 4 +++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/clip/__init__.py b/src/transformers/models/clip/__init__.py index c22e1b8df97a01..932130f8d5fdf9 100644 --- a/src/transformers/models/clip/__init__.py +++ b/src/transformers/models/clip/__init__.py @@ -32,9 +32,9 @@ "configuration_clip": [ "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", "CLIPConfig", + "CLIPOnnxConfig", "CLIPTextConfig", "CLIPVisionConfig", - "CLIPOnnxConfig", ], "tokenization_clip": ["CLIPTokenizer"], } @@ -104,9 +104,9 @@ from .configuration_clip import ( CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, CLIPConfig, + CLIPOnnxConfig, CLIPTextConfig, CLIPVisionConfig, - CLIPOnnxConfig, ) from 
.tokenization_clip import CLIPTokenizer diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index 14ce960c289901..a118b179e4c09f 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -17,15 +17,15 @@ import copy import os from collections import OrderedDict +from typing import TYPE_CHECKING, Any, Mapping, Optional, Union -from typing import TYPE_CHECKING, Any, Mapping, Union, Optional if TYPE_CHECKING: from ...processing_utils import ProcessorMixin from ...utils import TensorType -from ...onnx import OnnxConfig from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig from ...utils import logging diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index aa8bf788f55b1a..a1052e3970f1fa 100755 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -660,7 +660,9 @@ def forward( # text_embeds.shape = [batch_size, sequence_length, transformer.width] # take features from the eot embedding (eot_token is the highest number in each sequence) - pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.to(torch.int).argmax(dim=-1)] + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0]), input_ids.to(torch.int).argmax(dim=-1) + ] if not return_dict: return (last_hidden_state, pooled_output) + encoder_outputs[1:] From af3e2fc19764c62e195d330c04eba6cc9ee7cd1b Mon Sep 17 00:00:00 2001 From: Dhruv Date: Tue, 9 Aug 2022 02:49:13 +0530 Subject: [PATCH 11/19] formatting fixes --- .../models/big_bird/modeling_flax_big_bird.py | 1 + .../models/bigbird_pegasus/modeling_bigbird_pegasus.py | 2 ++ .../models/data2vec/modeling_data2vec_audio.py | 1 + .../models/data2vec/modeling_data2vec_text.py | 1 + .../models/data2vec/modeling_data2vec_vision.py | 1 + .../models/deberta_v2/modeling_tf_deberta_v2.py | 2 ++ src/transformers/models/deit/modeling_deit.py | 2 ++ src/transformers/models/dpt/modeling_dpt.py | 2 ++ src/transformers/models/electra/modeling_electra.py | 1 + src/transformers/models/groupvit/modeling_groupvit.py | 5 +++-- src/transformers/models/hubert/modeling_hubert.py | 1 + src/transformers/models/hubert/modeling_tf_hubert.py | 9 +++++++++ src/transformers/models/layoutlm/modeling_layoutlm.py | 1 + .../models/layoutlmv3/tokenization_layoutlmv3_fast.py | 1 + src/transformers/models/longt5/modeling_longt5.py | 1 + .../models/mobilebert/modeling_mobilebert.py | 2 ++ src/transformers/models/nezha/modeling_nezha.py | 1 + src/transformers/models/plbart/modeling_plbart.py | 2 ++ src/transformers/models/realm/modeling_realm.py | 1 + src/transformers/models/roberta/modeling_roberta.py | 1 + src/transformers/models/sew/modeling_sew.py | 1 + src/transformers/models/sew_d/modeling_sew_d.py | 2 ++ src/transformers/models/splinter/modeling_splinter.py | 1 + src/transformers/models/unispeech/modeling_unispeech.py | 1 + .../models/unispeech_sat/modeling_unispeech_sat.py | 1 + src/transformers/models/videomae/modeling_videomae.py | 2 ++ src/transformers/models/vilt/modeling_vilt.py | 2 ++ src/transformers/models/vit_mae/modeling_vit_mae.py | 2 ++ .../wav2vec2_conformer/modeling_wav2vec2_conformer.py | 1 + src/transformers/models/wavlm/modeling_wavlm.py | 1 + src/transformers/models/yolos/modeling_yolos.py | 2 ++ 31 files changed, 52 insertions(+), 2 deletions(-) diff --git 
a/src/transformers/models/big_bird/modeling_flax_big_bird.py b/src/transformers/models/big_bird/modeling_flax_big_bird.py index 0e19870172e4cf..2e3192ff0eeb02 100644 --- a/src/transformers/models/big_bird/modeling_flax_big_bird.py +++ b/src/transformers/models/big_bird/modeling_flax_big_bird.py @@ -1862,6 +1862,7 @@ def __call__( output_hidden_states: bool = False, return_dict: bool = True, ): + # Model outputs = self.bert( input_ids, diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 56253f802a8d0d..ce5040e92c7fa3 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -365,6 +365,7 @@ def bigbird_block_sparse_attention( plan_num_rand_blocks, output_attentions, ): + # BigBirdPegasus block-sparse attention as suggested in paper # ITC: @@ -2397,6 +2398,7 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, Seq2SeqModelOutput]: + # different to other models, BigBirdPegasus automatically creates decoder_input_ids from # input_ids if no decoder_input_ids are provided if decoder_input_ids is None and decoder_inputs_embeds is None: diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 9fe378aee88f18..70d802a80154b8 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -1059,6 +1059,7 @@ def forward( loss = None if labels is not None: + if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index cbc6853a8ba49a..8a7d6308bf5744 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -503,6 +503,7 @@ def forward( past_key_value = past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: + if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
diff --git a/src/transformers/models/data2vec/modeling_data2vec_vision.py b/src/transformers/models/data2vec/modeling_data2vec_vision.py index 3764061c1ce451..e63ee0d32cf187 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_data2vec_vision.py @@ -151,6 +151,7 @@ def __init__(self, config: Data2VecVisionConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None) -> torch.Tensor: + embeddings = self.patch_embeddings(pixel_values) batch_size, seq_len, _ = embeddings.size() diff --git a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py index ae6ec55117fd91..aabb3b2d380ea1 100644 --- a/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_tf_deberta_v2.py @@ -94,6 +94,7 @@ def __init__(self, axis=-1, **kwargs): self.axis = axis def call(self, inputs: tf.Tensor, mask: tf.Tensor): + rmask = tf.logical_not(tf.cast(mask, tf.bool)) output = tf.where(rmask, float("-inf"), inputs) output = stable_softmax(output, self.axis) @@ -1020,6 +1021,7 @@ def call( return_dict: Optional[bool] = None, training: bool = False, ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: + if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py index b4c5a0e97e8ee6..8f8307499fa479 100644 --- a/src/transformers/models/deit/modeling_deit.py +++ b/src/transformers/models/deit/modeling_deit.py @@ -205,6 +205,7 @@ def __init__(self, config: DeiTConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -262,6 +263,7 @@ def __init__(self, config: DeiTConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/dpt/modeling_dpt.py b/src/transformers/models/dpt/modeling_dpt.py index 16a64756b91050..7dfa244805ff77 100755 --- a/src/transformers/models/dpt/modeling_dpt.py +++ b/src/transformers/models/dpt/modeling_dpt.py @@ -225,6 +225,7 @@ def __init__(self, config: DPTConfig) -> None: self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) @@ -283,6 +284,7 @@ def __init__(self, config: DPTConfig) -> None: self.intermediate_act_fn = config.hidden_act def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) hidden_states = self.intermediate_act_fn(hidden_states) diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index c6b426ff6d708c..c215256b3e5f4e 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -564,6 +564,7 @@ def forward( past_key_value = 
past_key_values[i] if past_key_values is not None else None if self.gradient_checkpointing and self.training: + if use_cache: logger.warning( "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py index 16644c80e3e7e2..a9faf4b8faf8ff 100644 --- a/src/transformers/models/groupvit/modeling_groupvit.py +++ b/src/transformers/models/groupvit/modeling_groupvit.py @@ -1102,7 +1102,6 @@ def forward( if input_ids is None: raise ValueError("You have to specify either input_ids") - input_ids = input_ids.to(torch.int) # for onnx compatibility, since onnx doesn't support int64 input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) @@ -1133,7 +1132,9 @@ def forward( # text_embeds.shape = [batch_size, sequence_length, transformer.width] # take features from the eot embedding (eot_token is the highest number in each sequence) - pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)] + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0]), input_ids.to(torch.int).argmax(dim=-1) + ] if not return_dict: return (last_hidden_state, pooled_output) + encoder_outputs[1:] diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index cb7918b9fec90e..d6cb6b8e059920 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -1174,6 +1174,7 @@ def forward( loss = None if labels is not None: + if labels.max() >= self.config.vocab_size: raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}") diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index f471904c342e28..f078b5d0cfc7dd 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -314,6 +314,7 @@ def __init__( self._check_axis() def build(self, input_shape): + self._check_if_input_shape_is_none(input_shape) self._set_number_of_groups_for_instance_norm(input_shape) self._check_size_of_dimensions(input_shape) @@ -325,6 +326,7 @@ def build(self, input_shape): super().build(input_shape) def call(self, inputs): + input_shape = tf.keras.backend.int_shape(inputs) tensor_input_shape = tf.shape(inputs) @@ -361,6 +363,7 @@ def compute_output_shape(self, input_shape): return input_shape def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape): + group_shape = [tensor_input_shape[i] for i in range(len(input_shape))] is_instance_norm = (input_shape[self.axis] // self.groups) == 1 if not is_instance_norm: @@ -373,6 +376,7 @@ def _reshape_into_groups(self, inputs, input_shape, tensor_input_shape): return inputs, group_shape def _apply_normalization(self, reshaped_inputs, input_shape): + group_shape = tf.keras.backend.int_shape(reshaped_inputs) group_reduction_axes = list(range(1, len(group_shape))) is_instance_norm = (input_shape[self.axis] // self.groups) == 1 @@ -424,6 +428,7 @@ def _set_number_of_groups_for_instance_norm(self, input_shape): self.groups = dim def _check_size_of_dimensions(self, input_shape): + dim = input_shape[self.axis] if dim < self.groups: raise ValueError( @@ -444,16 +449,19 @@ def _check_size_of_dimensions(self, input_shape): ) def _check_axis(self): + if self.axis == 0: raise ValueError( "You are trying to 
                    "normalize your batch axis. Do you want to use tf.layer.batch_normalization instead"
                )

     def _create_input_spec(self, input_shape):
+
         dim = input_shape[self.axis]
         self.input_spec = tf.keras.layers.InputSpec(ndim=len(input_shape), axes={self.axis: dim})

     def _add_gamma_weight(self, input_shape):
+
         dim = input_shape[self.axis]
         shape = (dim,)

@@ -469,6 +477,7 @@ def _add_gamma_weight(self, input_shape):
             self.gamma = None

     def _add_beta_weight(self, input_shape):
+
         dim = input_shape[self.axis]
         shape = (dim,)

diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py
index 0fd07a7a20b381..e3a625416a7d1b 100644
--- a/src/transformers/models/layoutlm/modeling_layoutlm.py
+++ b/src/transformers/models/layoutlm/modeling_layoutlm.py
@@ -471,6 +471,7 @@ def forward(
             past_key_value = past_key_values[i] if past_key_values is not None else None

             if self.gradient_checkpointing and self.training:
+
                 if use_cache:
                     logger.warning(
                         "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py
index d18dcbd4d5c6a8..be5f938dbf17ce 100644
--- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py
+++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py
@@ -534,6 +534,7 @@ def _batch_encode_plus(
         return_length: bool = False,
         verbose: bool = True,
     ) -> BatchEncoding:
+
         if not isinstance(batch_text_or_text_pairs, list):
             raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")

diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py
index 862d0f32e64b3e..abd1cb778655a6 100644
--- a/src/transformers/models/longt5/modeling_longt5.py
+++ b/src/transformers/models/longt5/modeling_longt5.py
@@ -232,6 +232,7 @@ def __init__(self, hidden_size, eps=1e-6):
         self.variance_epsilon = eps

     def forward(self, hidden_states):
+
         # LongT5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean
         # Square Layer Normalization https://arxiv.org/abs/1910.07467 thus varience is calculated
         # w/o mean and there is no bias. Additionally we want to make sure that the accumulation for
diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py
index 1c29ec9e3adac6..6bc306a6e05eb3 100644
--- a/src/transformers/models/mobilebert/modeling_mobilebert.py
+++ b/src/transformers/models/mobilebert/modeling_mobilebert.py
@@ -1342,6 +1342,7 @@ def forward(
 )
 # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering with Bert->MobileBert all-casing
 class MobileBertForQuestionAnswering(MobileBertPreTrainedModel):
+
     _keys_to_ignore_on_load_unexpected = [r"pooler"]

     def __init__(self, config):
@@ -1547,6 +1548,7 @@ def forward(
 )
 # Copied from transformers.models.bert.modeling_bert.BertForTokenClassification with Bert->MobileBert all-casing
 class MobileBertForTokenClassification(MobileBertPreTrainedModel):
+
     _keys_to_ignore_on_load_unexpected = [r"pooler"]

     def __init__(self, config):
diff --git a/src/transformers/models/nezha/modeling_nezha.py b/src/transformers/models/nezha/modeling_nezha.py
index 06a2dbbea83eef..4fa38b3ed48f09 100644
--- a/src/transformers/models/nezha/modeling_nezha.py
+++ b/src/transformers/models/nezha/modeling_nezha.py
@@ -579,6 +579,7 @@ def forward(
             past_key_value = past_key_values[i] if past_key_values is not None else None

             if self.gradient_checkpointing and self.training:
+
                 if use_cache:
                     logger.warning(
                         "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
diff --git a/src/transformers/models/plbart/modeling_plbart.py b/src/transformers/models/plbart/modeling_plbart.py
index 6baf1512b3d003..eb8b5d2b41671c 100755
--- a/src/transformers/models/plbart/modeling_plbart.py
+++ b/src/transformers/models/plbart/modeling_plbart.py
@@ -1041,6 +1041,7 @@ def forward(
             past_key_value = past_key_values[idx] if past_key_values is not None else None

             if self.gradient_checkpointing and self.training:
+
                 if use_cache:
                     logger.warning(
                         "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
@@ -1065,6 +1066,7 @@ def custom_forward(*inputs):
                     None,
                 )
             else:
+
                 layer_outputs = decoder_layer(
                     hidden_states,
                     attention_mask=attention_mask,
diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py
index 95013ffc39ef97..6ee2b1fd14b402 100644
--- a/src/transformers/models/realm/modeling_realm.py
+++ b/src/transformers/models/realm/modeling_realm.py
@@ -579,6 +579,7 @@ def forward(
             past_key_value = past_key_values[i] if past_key_values is not None else None

             if self.gradient_checkpointing and self.training:
+
                 if use_cache:
                     logger.warning(
                         "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py
index 7c4d4193662609..46add0be500195 100644
--- a/src/transformers/models/roberta/modeling_roberta.py
+++ b/src/transformers/models/roberta/modeling_roberta.py
@@ -503,6 +503,7 @@ def forward(
             past_key_value = past_key_values[i] if past_key_values is not None else None

             if self.gradient_checkpointing and self.training:
+
                 if use_cache:
                     logger.warning(
                         "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py
index 2652d3581e057b..632f7d4880f198 100644
--- a/src/transformers/models/sew/modeling_sew.py
+++ b/src/transformers/models/sew/modeling_sew.py
@@ -1054,6 +1054,7 @@ def forward(
         loss = None
         if labels is not None:
+
             if labels.max() >= self.config.vocab_size:
                 raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")

diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py
index 5cdbdfc0c2129a..a9a231aec1d8e6 100644
--- a/src/transformers/models/sew_d/modeling_sew_d.py
+++ b/src/transformers/models/sew_d/modeling_sew_d.py
@@ -1118,6 +1118,7 @@ def forward(
         rel_embeddings = self.get_rel_embedding()
         output_states = next_kv
         for i, layer_module in enumerate(self.layer):
+
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (output_states,)

@@ -1581,6 +1582,7 @@ def forward(
         loss = None
         if labels is not None:
+
             if labels.max() >= self.config.vocab_size:
                 raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")

diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py
index 9324f1b3bfd311..1f94f6f9ad273d 100755
--- a/src/transformers/models/splinter/modeling_splinter.py
+++ b/src/transformers/models/splinter/modeling_splinter.py
@@ -444,6 +444,7 @@ def forward(
             past_key_value = past_key_values[i] if past_key_values is not None else None

             if self.gradient_checkpointing and self.training:
+
                 if use_cache:
                     logger.warning(
                         "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py
index 9fed0a0898e946..dc194318e9992d 100755
--- a/src/transformers/models/unispeech/modeling_unispeech.py
+++ b/src/transformers/models/unispeech/modeling_unispeech.py
@@ -1424,6 +1424,7 @@ def forward(
         loss = None
         if labels is not None:
+
             if labels.max() >= self.config.vocab_size:
                 raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")

diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
index 47a22a46928b6c..926464d3bf8e8b 100755
--- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
+++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
@@ -1428,6 +1428,7 @@ def forward(
         loss = None
         if labels is not None:
+
             if labels.max() >= self.config.vocab_size:
                 raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")

diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py
index 133812d2324d2e..a807ed7208fccb 100644
--- a/src/transformers/models/videomae/modeling_videomae.py
+++ b/src/transformers/models/videomae/modeling_videomae.py
@@ -286,6 +286,7 @@ def __init__(self, config: VideoMAEConfig) -> None:
         self.dropout = nn.Dropout(config.hidden_dropout_prob)

     def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+
         hidden_states = self.dense(hidden_states)
         hidden_states = self.dropout(hidden_states)

@@ -343,6 +344,7 @@ def __init__(self, config: VideoMAEConfig) -> None:
             self.intermediate_act_fn = config.hidden_act

     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+
         hidden_states = self.dense(hidden_states)
         hidden_states = self.intermediate_act_fn(hidden_states)

diff --git a/src/transformers/models/vilt/modeling_vilt.py b/src/transformers/models/vilt/modeling_vilt.py
index ebdd3882ab0acd..308358850c9808 100755
--- a/src/transformers/models/vilt/modeling_vilt.py
+++ b/src/transformers/models/vilt/modeling_vilt.py
@@ -400,6 +400,7 @@ def __init__(self, config: ViltConfig) -> None:
         self.dropout = nn.Dropout(config.hidden_dropout_prob)

     def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+
         hidden_states = self.dense(hidden_states)
         hidden_states = self.dropout(hidden_states)

@@ -451,6 +452,7 @@ def __init__(self, config: ViltConfig) -> None:
             self.intermediate_act_fn = config.hidden_act

     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+
         hidden_states = self.dense(hidden_states)
         hidden_states = self.intermediate_act_fn(hidden_states)

diff --git a/src/transformers/models/vit_mae/modeling_vit_mae.py b/src/transformers/models/vit_mae/modeling_vit_mae.py
index 30bd2abc25c637..0667bdd73c5545 100755
--- a/src/transformers/models/vit_mae/modeling_vit_mae.py
+++ b/src/transformers/models/vit_mae/modeling_vit_mae.py
@@ -388,6 +388,7 @@ def __init__(self, config: ViTMAEConfig) -> None:
         self.dropout = nn.Dropout(config.hidden_dropout_prob)

     def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+
         hidden_states = self.dense(hidden_states)
         hidden_states = self.dropout(hidden_states)

@@ -445,6 +446,7 @@ def __init__(self, config: ViTMAEConfig) -> None:
             self.intermediate_act_fn = config.hidden_act

     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+
         hidden_states = self.dense(hidden_states)
         hidden_states = self.intermediate_act_fn(hidden_states)

diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py
index fe279f68ffd2b8..4c4962b155c35c 100644
--- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py
+++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py
@@ -1670,6 +1670,7 @@ def forward(
         loss = None
         if labels is not None:
+
             if labels.max() >= self.config.vocab_size:
                 raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")

diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py
index 0a05aa5560e41a..c792a368cb4790 100755
--- a/src/transformers/models/wavlm/modeling_wavlm.py
+++ b/src/transformers/models/wavlm/modeling_wavlm.py
@@ -1361,6 +1361,7 @@ def forward(
         loss = None
         if labels is not None:
+
             if labels.max() >= self.config.vocab_size:
                 raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")

diff --git a/src/transformers/models/yolos/modeling_yolos.py b/src/transformers/models/yolos/modeling_yolos.py
index bb7384b9e5d030..447cec23de97fe 100755
--- a/src/transformers/models/yolos/modeling_yolos.py
+++ b/src/transformers/models/yolos/modeling_yolos.py
@@ -323,6 +323,7 @@ def __init__(self, config: YolosConfig) -> None:
         self.dropout = nn.Dropout(config.hidden_dropout_prob)

     def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+
         hidden_states = self.dense(hidden_states)
         hidden_states = self.dropout(hidden_states)

@@ -380,6 +381,7 @@ def __init__(self, config: YolosConfig) -> None:
             self.intermediate_act_fn = config.hidden_act

     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+
         hidden_states = self.dense(hidden_states)
         hidden_states = self.intermediate_act_fn(hidden_states)

From 32295b54d9d4a1e08ae7f458900f6113fecc4c1f Mon Sep 17 00:00:00 2001
From: Dhruv
Date: Tue, 9 Aug 2022 09:08:47 +0530
Subject: [PATCH 12/19] revert groupvit

---
 src/transformers/models/groupvit/modeling_groupvit.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py
index a9faf4b8faf8ff..1073d4bfea8708 100644
--- a/src/transformers/models/groupvit/modeling_groupvit.py
+++ b/src/transformers/models/groupvit/modeling_groupvit.py
@@ -1132,9 +1132,7 @@ def forward(

         # text_embeds.shape = [batch_size, sequence_length, transformer.width]
         # take features from the eot embedding (eot_token is the highest number in each sequence)
-        pooled_output = last_hidden_state[
-            torch.arange(last_hidden_state.shape[0]), input_ids.to(torch.int).argmax(dim=-1)
-        ]
+        pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)]

         if not return_dict:
             return (last_hidden_state, pooled_output) + encoder_outputs[1:]

From 3737ec2393e0481f770cfba0b62764ddef4df5f7 Mon Sep 17 00:00:00 2001
From: Dhruv
Date: Tue, 9 Aug 2022 15:34:40 +0530
Subject: [PATCH 13/19] comment for cast to int32

---
 src/transformers/models/clip/modeling_clip.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py
index a1052e3970f1fa..123689b6aaa820 100755
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@@ -660,6 +660,7 @@ def forward(

         # text_embeds.shape = [batch_size, sequence_length, transformer.width]
         # take features from the eot embedding (eot_token is the highest number in each sequence)
+        # casting to torch.int for onnx compatibility: onnx doesn't support int64 yet
         pooled_output = last_hidden_state[
             torch.arange(last_hidden_state.shape[0]), input_ids.to(torch.int).argmax(dim=-1)
         ]

From 82d4a1bde8d14c1b9f425f1aff9408c1ff5b0a3c Mon Sep 17 00:00:00 2001
From: Dhruv
Date: Tue, 9 Aug 2022 16:46:04 +0530
Subject: [PATCH 14/19] comment fix

---
 src/transformers/models/clip/modeling_clip.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py
index 123689b6aaa820..77398695d6439a 100755
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@@ -660,7 +660,7 @@ def forward(

         # text_embeds.shape = [batch_size, sequence_length, transformer.width]
         # take features from the eot embedding (eot_token is the highest number in each sequence)
-        # casting to torch.int for onnx compatibility: onnx doesn't support int64 yet
+        # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
        pooled_output = last_hidden_state[
             torch.arange(last_hidden_state.shape[0]), input_ids.to(torch.int).argmax(dim=-1)
         ]
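The two comment patches above hinge on one detail: with opset 14, the exported ArgMax node cannot take int64 inputs, so the tokenizer's int64 input_ids are cast with .to(torch.int) before argmax. The sketch below is an illustration added for this review, not part of the patch series; the ids and shapes are made up (49406/49407 mirror CLIP's BOS/EOS token ids). It confirms the cast does not change which token is pooled.

import torch

# Hypothetical batch: 2 sequences of 7 token ids, hidden size 4 (illustration only).
last_hidden_state = torch.randn(2, 7, 4)
input_ids = torch.tensor(
    [[49406, 320, 1125, 49407, 0, 0, 0],
     [49406, 1237, 49407, 0, 0, 0, 0]]
)  # int64 by default, as produced by the tokenizer

# Original pooling: argmax over int64 ids (the eot token has the highest id in each row).
pooled_int64 = last_hidden_state[
    torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)
]

# ONNX-friendly pooling from the patch: cast to int32 first; the selected positions are identical.
pooled_int32 = last_hidden_state[
    torch.arange(last_hidden_state.shape[0]), input_ids.to(torch.int).argmax(dim=-1)
]

assert torch.equal(pooled_int64, pooled_int32)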
From 1b30df9baa63c186e7d8c5d7cb8e35864b159fd9 Mon Sep 17 00:00:00 2001
From: Dhruv
Date: Wed, 10 Aug 2022 10:27:14 +0530
Subject: [PATCH 15/19] make .T as .t() for onnx conversion

---
 src/transformers/models/clip/modeling_clip.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py
index 77398695d6439a..687de20ccc0160 100755
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@@ -68,7 +68,8 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:

 def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
     caption_loss = contrastive_loss(similarity)
-    image_loss = contrastive_loss(similarity.T)
+    # .T doesn't work while converting to onnx, aten::numpy_T operator is not supported yet
+    image_loss = contrastive_loss(similarity.t())
     return (caption_loss + image_loss) / 2.0


@@ -1053,7 +1054,8 @@ def forward(
         # cosine similarity as logits
         logit_scale = self.logit_scale.exp()
         logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
-        logits_per_image = logits_per_text.T
+        # .T doesn't work while converting to onnx, aten::numpy_T operator is not supported yet
+        logits_per_image = logits_per_text.t()

         loss = None
         if return_loss:

From 933b12ef0d9f972c020c61654bf5e8cc6f93ebdc Mon Sep 17 00:00:00 2001
From: Dhruv
Date: Wed, 10 Aug 2022 15:25:21 +0530
Subject: [PATCH 16/19] ran make fix-copies

---
 src/transformers/models/groupvit/modeling_groupvit.py | 8 ++++++--
 src/transformers/models/owlvit/modeling_owlvit.py     | 3 ++-
 .../modeling_vision_text_dual_encoder.py              | 3 ++-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py
index 1073d4bfea8708..efcb4beefcf56e 100644
--- a/src/transformers/models/groupvit/modeling_groupvit.py
+++ b/src/transformers/models/groupvit/modeling_groupvit.py
@@ -72,7 +72,8 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
 # Copied from transformers.models.clip.modeling_clip.clip_loss with clip->groupvit
 def groupvit_loss(similarity: torch.Tensor) -> torch.Tensor:
     caption_loss = contrastive_loss(similarity)
-    image_loss = contrastive_loss(similarity.T)
+    # .T doesn't work while converting to onnx, aten::numpy_T operator is not supported yet
+    image_loss = contrastive_loss(similarity.t())
     return (caption_loss + image_loss) / 2.0


@@ -1132,7 +1133,10 @@ def forward(

         # text_embeds.shape = [batch_size, sequence_length, transformer.width]
         # take features from the eot embedding (eot_token is the highest number in each sequence)
-        pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)]
+        # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
+        pooled_output = last_hidden_state[
+            torch.arange(last_hidden_state.shape[0]), input_ids.to(torch.int).argmax(dim=-1)
+        ]

         if not return_dict:
             return (last_hidden_state, pooled_output) + encoder_outputs[1:]

diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py
index 35ebd16cf25bd8..d282aa9dd8f748 100644
--- a/src/transformers/models/owlvit/modeling_owlvit.py
+++ b/src/transformers/models/owlvit/modeling_owlvit.py
@@ -71,7 +71,8 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
 # Copied from transformers.models.clip.modeling_clip.clip_loss with clip->owlvit
 def owlvit_loss(similarity: torch.Tensor) -> torch.Tensor:
     caption_loss = contrastive_loss(similarity)
-    image_loss = contrastive_loss(similarity.T)
+    # .T doesn't work while converting to onnx, aten::numpy_T operator is not supported yet
+    image_loss = contrastive_loss(similarity.t())
     return (caption_loss + image_loss) / 2.0


diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
index 66340deaf4927f..b86e379172bfa0 100755
--- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
@@ -154,7 +154,8 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
 # Copied from transformers.models.clip.modeling_clip.clip_loss
 def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
     caption_loss = contrastive_loss(similarity)
-    image_loss = contrastive_loss(similarity.T)
+    # .T doesn't work while converting to onnx, aten::numpy_T operator is not supported yet
+    image_loss = contrastive_loss(similarity.t())
     return (caption_loss + image_loss) / 2.0


From 0f7a95aba3917956f405dfd1a1b6d767480c4067 Mon Sep 17 00:00:00 2001
From: Dhruv Karan
Date: Wed, 10 Aug 2022 23:32:30 +0530
Subject: [PATCH 17/19] remove unneeded comment

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 src/transformers/models/clip/modeling_clip.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py
index 687de20ccc0160..92ac976ef07923 100755
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@@ -68,7 +68,6 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:

 def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
     caption_loss = contrastive_loss(similarity)
-    # .T doesn't work while converting to onnx, aten::numpy_T operator is not supported yet
     image_loss = contrastive_loss(similarity.t())
     return (caption_loss + image_loss) / 2.0
From 30289085e4ae40c4bed1ef60ef81f23b75646efb Mon Sep 17 00:00:00 2001
From: Dhruv
Date: Wed, 10 Aug 2022 23:33:16 +0530
Subject: [PATCH 18/19] fix copies

---
 src/transformers/models/groupvit/modeling_groupvit.py | 1 -
 src/transformers/models/owlvit/modeling_owlvit.py     | 1 -
 .../modeling_vision_text_dual_encoder.py              | 1 -
 3 files changed, 3 deletions(-)

diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py
index efcb4beefcf56e..9817065ab37a55 100644
--- a/src/transformers/models/groupvit/modeling_groupvit.py
+++ b/src/transformers/models/groupvit/modeling_groupvit.py
@@ -72,7 +72,6 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
 # Copied from transformers.models.clip.modeling_clip.clip_loss with clip->groupvit
 def groupvit_loss(similarity: torch.Tensor) -> torch.Tensor:
     caption_loss = contrastive_loss(similarity)
-    # .T doesn't work while converting to onnx, aten::numpy_T operator is not supported yet
     image_loss = contrastive_loss(similarity.t())
     return (caption_loss + image_loss) / 2.0

diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py
index d282aa9dd8f748..73ee2597f1b163 100644
--- a/src/transformers/models/owlvit/modeling_owlvit.py
+++ b/src/transformers/models/owlvit/modeling_owlvit.py
@@ -71,7 +71,6 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
 # Copied from transformers.models.clip.modeling_clip.clip_loss with clip->owlvit
 def owlvit_loss(similarity: torch.Tensor) -> torch.Tensor:
     caption_loss = contrastive_loss(similarity)
-    # .T doesn't work while converting to onnx, aten::numpy_T operator is not supported yet
     image_loss = contrastive_loss(similarity.t())
     return (caption_loss + image_loss) / 2.0

diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
index b86e379172bfa0..64fd2f405d5084 100755
--- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
@@ -154,7 +154,6 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
 # Copied from transformers.models.clip.modeling_clip.clip_loss
 def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
     caption_loss = contrastive_loss(similarity)
-    # .T doesn't work while converting to onnx, aten::numpy_T operator is not supported yet
     image_loss = contrastive_loss(similarity.t())
     return (caption_loss + image_loss) / 2.0

From 7663f294385ed59040f858053f6042e87c72290a Mon Sep 17 00:00:00 2001
From: Dhruv
Date: Wed, 10 Aug 2022 23:35:23 +0530
Subject: [PATCH 19/19] remove comment

---
 src/transformers/models/clip/modeling_clip.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py
index 92ac976ef07923..799d0ef0462afc 100755
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@@ -1053,7 +1053,6 @@ def forward(
         # cosine similarity as logits
         logit_scale = self.logit_scale.exp()
         logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
-        # .T doesn't work while converting to onnx, aten::numpy_T operator is not supported yet
         logits_per_image = logits_per_text.t()

         loss = None