diff --git a/docs/source/en/serialization.mdx b/docs/source/en/serialization.mdx
index 9561bbd8ec77c1..0aacdf76f7ef0f 100644
--- a/docs/source/en/serialization.mdx
+++ b/docs/source/en/serialization.mdx
@@ -55,6 +55,7 @@ Ready-made configurations include the following architectures:
 - BlenderbotSmall
 - BLOOM
 - CamemBERT
+- CLIP
 - CodeGen
 - ConvBERT
 - ConvNeXT
diff --git a/src/transformers/models/clip/__init__.py b/src/transformers/models/clip/__init__.py
index 6a6e64c995d385..932130f8d5fdf9 100644
--- a/src/transformers/models/clip/__init__.py
+++ b/src/transformers/models/clip/__init__.py
@@ -29,7 +29,13 @@


 _import_structure = {
-    "configuration_clip": ["CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP", "CLIPConfig", "CLIPTextConfig", "CLIPVisionConfig"],
+    "configuration_clip": [
+        "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "CLIPConfig",
+        "CLIPOnnxConfig",
+        "CLIPTextConfig",
+        "CLIPVisionConfig",
+    ],
     "tokenization_clip": ["CLIPTokenizer"],
 }

@@ -95,7 +101,13 @@


 if TYPE_CHECKING:
-    from .configuration_clip import CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP, CLIPConfig, CLIPTextConfig, CLIPVisionConfig
+    from .configuration_clip import (
+        CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        CLIPConfig,
+        CLIPOnnxConfig,
+        CLIPTextConfig,
+        CLIPVisionConfig,
+    )
     from .tokenization_clip import CLIPTokenizer

     try:
diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py
index 3bb22b74a4c77c..a118b179e4c09f 100644
--- a/src/transformers/models/clip/configuration_clip.py
+++ b/src/transformers/models/clip/configuration_clip.py
@@ -16,9 +16,16 @@

 import copy
 import os
-from typing import Union
+from collections import OrderedDict
+from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
+
+
+if TYPE_CHECKING:
+    from ...processing_utils import ProcessorMixin
+    from ...utils import TensorType

 from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
 from ...utils import logging


@@ -317,3 +324,44 @@ def to_dict(self):
         output["vision_config"] = self.vision_config.to_dict()
         output["model_type"] = self.__class__.model_type
         return output
+
+
+class CLIPOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("input_ids", {0: "batch", 1: "sequence"}),
+                ("pixel_values", {0: "batch"}),
+                ("attention_mask", {0: "batch", 1: "sequence"}),
+            ]
+        )
+
+    @property
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("logits_per_image", {0: "batch"}),
+                ("logits_per_text", {0: "batch"}),
+                ("text_embeds", {0: "batch"}),
+                ("image_embeds", {0: "batch"}),
+            ]
+        )
+
+    @property
+    def atol_for_validation(self) -> float:
+        return 1e-4
+
+    def generate_dummy_inputs(
+        self,
+        processor: "ProcessorMixin",
+        framework: Optional["TensorType"] = None,
+    ) -> Mapping[str, Any]:
+
+        text_input_dict = super().generate_dummy_inputs(processor.tokenizer, framework=framework)
+        image_input_dict = super().generate_dummy_inputs(processor.feature_extractor, framework=framework)
+        return {**text_input_dict, **image_input_dict}
+
+    @property
+    def default_onnx_opset(self) -> int:
+        return 14
diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py
index ddc2236371c29a..799d0ef0462afc 100755
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@@ -68,7 +68,7 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:

 def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
     caption_loss = contrastive_loss(similarity)
-    image_loss = contrastive_loss(similarity.T)
+    image_loss = contrastive_loss(similarity.t())
     return (caption_loss + image_loss) / 2.0


@@ -660,7 +660,10 @@ def forward(

         # text_embeds.shape = [batch_size, sequence_length, transformer.width]
         # take features from the eot embedding (eot_token is the highest number in each sequence)
-        pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)]
+        # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
+        pooled_output = last_hidden_state[
+            torch.arange(last_hidden_state.shape[0]), input_ids.to(torch.int).argmax(dim=-1)
+        ]

         if not return_dict:
             return (last_hidden_state, pooled_output) + encoder_outputs[1:]
@@ -1050,7 +1053,7 @@ def forward(
         # cosine similarity as logits
         logit_scale = self.logit_scale.exp()
         logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
-        logits_per_image = logits_per_text.T
+        logits_per_image = logits_per_text.t()

         loss = None
         if return_loss:
diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py
index 1073d4bfea8708..9817065ab37a55 100644
--- a/src/transformers/models/groupvit/modeling_groupvit.py
+++ b/src/transformers/models/groupvit/modeling_groupvit.py
@@ -72,7 +72,7 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
 # Copied from transformers.models.clip.modeling_clip.clip_loss with clip->groupvit
 def groupvit_loss(similarity: torch.Tensor) -> torch.Tensor:
     caption_loss = contrastive_loss(similarity)
-    image_loss = contrastive_loss(similarity.T)
+    image_loss = contrastive_loss(similarity.t())
     return (caption_loss + image_loss) / 2.0


@@ -1132,7 +1132,10 @@ def forward(

         # text_embeds.shape = [batch_size, sequence_length, transformer.width]
         # take features from the eot embedding (eot_token is the highest number in each sequence)
-        pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), input_ids.argmax(dim=-1)]
+        # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
+        pooled_output = last_hidden_state[
+            torch.arange(last_hidden_state.shape[0]), input_ids.to(torch.int).argmax(dim=-1)
+        ]

         if not return_dict:
             return (last_hidden_state, pooled_output) + encoder_outputs[1:]
diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py
index 35ebd16cf25bd8..73ee2597f1b163 100644
--- a/src/transformers/models/owlvit/modeling_owlvit.py
+++ b/src/transformers/models/owlvit/modeling_owlvit.py
@@ -71,7 +71,7 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
 # Copied from transformers.models.clip.modeling_clip.clip_loss with clip->owlvit
 def owlvit_loss(similarity: torch.Tensor) -> torch.Tensor:
     caption_loss = contrastive_loss(similarity)
-    image_loss = contrastive_loss(similarity.T)
+    image_loss = contrastive_loss(similarity.t())
     return (caption_loss + image_loss) / 2.0


diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
index 66340deaf4927f..64fd2f405d5084 100755
--- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
@@ -154,7 +154,7 @@ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
 # Copied from transformers.models.clip.modeling_clip.clip_loss
 def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
     caption_loss = contrastive_loss(similarity)
-    image_loss = contrastive_loss(similarity.T)
+    image_loss = contrastive_loss(similarity.t())
     return (caption_loss + image_loss) / 2.0


diff --git a/src/transformers/onnx/features.py b/src/transformers/onnx/features.py
index 8d8b8190e46819..fbfeb47250e73f 100644
--- a/src/transformers/onnx/features.py
+++ b/src/transformers/onnx/features.py
@@ -201,6 +201,10 @@ class FeaturesManager:
             "question-answering",
             onnx_config_cls="models.camembert.CamembertOnnxConfig",
         ),
+        "clip": supported_features_mapping(
+            "default",
+            onnx_config_cls="models.clip.CLIPOnnxConfig",
+        ),
         "codegen": supported_features_mapping(
             "default",
             "causal-lm",
diff --git a/tests/onnx/test_onnx_v2.py b/tests/onnx/test_onnx_v2.py
index 98ab0fad131e01..5634abc7706856 100644
--- a/tests/onnx/test_onnx_v2.py
+++ b/tests/onnx/test_onnx_v2.py
@@ -185,6 +185,7 @@ def test_values_override(self):
     ("big-bird", "google/bigbird-roberta-base"),
     ("ibert", "kssteven/ibert-roberta-base"),
     ("camembert", "camembert-base"),
+    ("clip", "openai/clip-vit-base-patch32"),
     ("convbert", "YituTech/conv-bert-base"),
     ("codegen", "Salesforce/codegen-350M-multi"),
     ("deberta", "microsoft/deberta-base"),
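
Not part of the diff above: a minimal sketch of how the new `CLIPOnnxConfig` could be exercised end to end with the `transformers.onnx` Python API. The checkpoint name and output path are illustrative, and it assumes the `export` / `validate_model_outputs` helpers accept a `CLIPProcessor` as the preprocessor, which is what `generate_dummy_inputs` above expects.

```python
from pathlib import Path

from transformers import CLIPModel, CLIPProcessor
from transformers.models.clip import CLIPOnnxConfig
from transformers.onnx import export, validate_model_outputs

# Illustrative checkpoint and output path (not taken from the diff).
checkpoint = "openai/clip-vit-base-patch32"
onnx_path = Path("clip.onnx")

model = CLIPModel.from_pretrained(checkpoint)
processor = CLIPProcessor.from_pretrained(checkpoint)

# ONNX config added in configuration_clip.py; its default_onnx_opset is 14.
onnx_config = CLIPOnnxConfig(model.config)

# export() traces the model with dummy inputs built from the processor's
# tokenizer and feature extractor and returns the ONNX input/output names.
onnx_inputs, onnx_outputs = export(
    processor, model, onnx_config, onnx_config.default_onnx_opset, onnx_path
)

# Compare ONNX Runtime outputs against PyTorch within atol_for_validation (1e-4).
validate_model_outputs(
    onnx_config, processor, model, onnx_path, onnx_outputs, onnx_config.atol_for_validation
)
```

With "clip" registered in `FeaturesManager` as above, the same path should also be reachable from the CLI documented in `serialization.mdx`, e.g. `python -m transformers.onnx --model=openai/clip-vit-base-patch32 onnx/`.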