From 886ada87ebd7e6f614bfdc70fa5f89521701b776 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Mon, 22 Jul 2024 21:55:44 +0800 Subject: [PATCH 01/19] wip --- swift/llm/utils/dataset.py | 14 ++++++-- swift/llm/utils/template.py | 64 ++++++++++++++++++++++++++++++++++--- 2 files changed, 72 insertions(+), 6 deletions(-) diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index 5d731c9e2..47bf3f973 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -1203,7 +1203,12 @@ def preprocess(row): bbox[i] = round(float(bbox[i])) res = {} - objects = [[caption, bbox]] + objects = [{ + 'caption': caption, + 'bbox': bbox, + 'bbox_type': 'real', + 'image': 0, + }] media_tag(res, [image_path]) res['images'] = [image_path] res['objects'] = json.dumps(objects, ensure_ascii=False) @@ -1248,7 +1253,12 @@ def preprocess(row): bbox[i] = round(float(bbox[i])) res = {} - objects = [[caption, bbox]] + objects = [{ + 'caption': caption, + 'bbox': bbox, + 'bbox_type': 'real', + 'image': 0, + }] media_tag(res, [image_path]) res['images'] = [image_path] res['objects'] = json.dumps(objects, ensure_ascii=False) diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 28200d0c4..76a44acab 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -487,6 +487,47 @@ def replace_box(self, index: int, example: Dict[str, Any]) -> List[Context]: else: return [''] + @classmethod + def normalize_bbox(cls, objects, images, to_type: Literal['real', 'norm_1000', 'norm_1']): + if not objects or not images: + return + + for object in objects: + bbox = object['bbox'] + bbox_type = object['bbox_type'] + idx = object['image'] + image = images[idx] + if bbox_type == 'real': + if to_type == 'real': + continue + width, height = image.width, image.height + object['bbox'] = [ + int(coord / dim * 999) if to_type == 'norm_1000' else coord / dim for coord, dim in + zip(bbox, [width, height, width, height]) + ] + elif bbox_type == 'norm_1000': + if to_type == 'norm_1000': + continue + if to_type == 'norm_1': + object['bbox'] = [coord/999. for coord in bbox] + elif to_type == 'real': + width, height = image.width, image.height + object['bbox'] = [ + int(coord / 999. 
* dim) for coord, dim in
+                    zip(bbox, [width, height, width, height])
+                ]
+        elif bbox_type == 'norm_1':
+            if to_type == 'norm_1':
+                continue
+            if to_type == 'norm_1000':
+                object['bbox'] = [int(coord * 999) for coord in bbox]
+            elif to_type == 'real':
+                width, height = image.width, image.height
+                object['bbox'] = [
+                    int(coord * dim) for coord, dim in
+                    zip(bbox, [width, height, width, height])
+                ]
+
     def pre_tokenize(self, context_list: List[Context], loss_scale_list: List[float],
                      **kwargs) -> Tuple[List[Context], List[float]]:
         # replace tag/object/box
@@ -1452,6 +1493,22 @@ def replace_tag(self, media_type, index, example) -> List[Context]:
             context_list.append('\n')
         return context_list
 
+    def replace_object(self, index: int, example: Dict[str, Any]) -> List[Context]:
+        objects = example.get('objects')
+        if objects:
+            object_ = objects[index]
+            return [f'<ref>{object_}</ref>']
+        else:
+            return ['']
+
+    def replace_box(self, index: int, example: Dict[str, Any]) -> List[Context]:
+        objects = example.get('objects')
+        if objects:
+            object_ = objects[index]
+            return [f'<box> [[{object_[1][0]}, {object_[1][1]}, {object_[1][2]}, {object_[1][3]}]] </box>']
+        else:
+            return ['']
+
     def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
         example = example.copy()
         history = example.pop('history', None)
@@ -1477,6 +1534,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
         from .vision_utils import load_image, load_video
         pixel_values_images = _read_batch(images_path, load_image)
         videos_path = [path for path in videos_path if path is not None]
+        self.normalize_bbox(example.get('objects'), pixel_values_images, to_type='norm_1000')
         if pixel_values_images:
             pixel_values = pixel_values_images
             assert len(pixel_values) == len(idx_list)
@@ -1607,10 +1665,7 @@ def __init__(self):
         }
 
     def replace_box(self, index: int, example: Dict[str, Any]) -> List[Context]:
-        width, height = example['_image'].width, example['_image'].height
-        x1, y1, x2, y2 = [
-            int(coord / dim * 999) for coord, dim in zip(example['objects'][index][1], [width, height, width, height])
-        ]
+        x1, y1, x2, y2 = example['objects'][index][1]
         return [f'<loc_{x1}><loc_{y1}><loc_{x2}><loc_{y2}>']
 
     def _construct_prompts(self, text):
@@ -1641,6 +1696,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
 
         images = _load_image(images_path[0])
         example['_image'] = images
+        self.normalize_bbox(example.get('objects'), images, to_type='norm_1000')
 
         # process bbox
         if example.get('objects') is not None:

From 8b0fa4c2984d6d447eed5333144c2492d4b64767 Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Mon, 22 Jul 2024 23:15:50 +0800
Subject: [PATCH 02/19] fix

---
 swift/llm/utils/template.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index 76a44acab..e3daa6165 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -1435,7 +1435,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
         idx_list = _findall(input_ids, -100)
         labels = inputs.get('labels')
         images_path = example.get('images') or []
-        if isinstance(images_path, str):
+        if not isinstance(images_path, (list, tuple)):
             images_path = [images_path]
         from .vision_utils import load_image
         pixel_values = _read_batch(images_path, load_image)
@@ -1527,9 +1527,9 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
         idx_list = _findall(input_ids, -100)
         labels = inputs.get('labels')
         videos_path = example.get('videos') or []
-        if isinstance(images_path, str):
+        if not isinstance(images_path, (list, tuple)):
             images_path = [images_path]
-        if isinstance(videos_path, str):
+        if not isinstance(images_path, (list, tuple)):
             videos_path = [videos_path]
         from .vision_utils import load_image, load_video
         pixel_values_images = _read_batch(images_path, load_image)

From 79709a7936f77406c8046dd3093d588aca22ce32 Mon Sep 17 00:00:00 2001
From: tastelikefeet
Date: Mon, 22 Jul 2024 23:44:50 +0800
Subject: [PATCH 03/19] fix

---
 swift/llm/utils/template.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index e3daa6165..faa7a8b9a 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -1520,12 +1520,6 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
             images.extend(images_path)
             example['images'] = images
         images_path = example.get('images') or []
-        inputs, _ = super(InternvlTemplate, self).encode(example)
-        if len(inputs) == 0:
-            return inputs, {}
-        input_ids = inputs['input_ids']
-        idx_list = _findall(input_ids, -100)
-        labels = inputs.get('labels')
         videos_path = example.get('videos') or []
         if not isinstance(images_path, (list, tuple)):
             images_path = [images_path]
         if not isinstance(images_path, (list, tuple)):
             videos_path = [videos_path]
         from .vision_utils import load_image, load_video
         pixel_values_images = _read_batch(images_path, load_image)
         videos_path = [path for path in videos_path if path is not None]
+        if example.get('objects'):
+            example['objects'] = json.loads(example.get('objects'))
         self.normalize_bbox(example.get('objects'), pixel_values_images, to_type='norm_1000')
+        inputs, _ = super(InternvlTemplate, self).encode(example)
+        if len(inputs) == 0:
+            return inputs, {}
+        input_ids = inputs['input_ids']
+        idx_list = _findall(input_ids, -100)
+        labels = inputs.get('labels')
         if pixel_values_images:
             pixel_values = pixel_values_images
             assert len(pixel_values) == len(idx_list)

From dd5fdb295ba4f73c7d876011c1be0c29229c178d Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Tue, 23 Jul 2024 10:18:52 +0800
Subject: [PATCH 04/19] wip

---
 swift/llm/utils/template.py     | 202 ++++++++++++++++++--------------
 swift/llm/utils/vision_utils.py |   6 +-
 2 files changed, 118 insertions(+), 90 deletions(-)

diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index faa7a8b9a..3215cd5d5 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -14,6 +14,7 @@
 from transformers import PreTrainedTokenizerBase, StoppingCriteria
 from transformers.dynamic_module_utils import get_class_from_dynamic_module
 
+from swift.llm import MediaTag
 from swift.llm.agent.utils import calculate_loss_scale, get_tools_prompt
 from swift.torchacc_utils import pad_and_split_batch
 from swift.utils import get_dist_setting, upper_bound, use_torchacc
@@ -164,6 +165,7 @@ class Template:
     special_tokens = ['<image>', '<video>', '<audio>', '<bbox>', '<ref-object>']
     special_keys = ['images', 'videos', 'audios', 'objects']
+    grounding_type = 'norm_1000'
 
     def __init__(self,
                  prefix: Prompt,
@@ -318,33 +320,49 @@ def _prepare_vllm_images(self, images: List['PIL.Image.Image']) -> List['PIL.Ima
 
         return new_images
 
-    def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-        """return: inputs, tokenizer_kwargs"""
+    def preprocess(self, example):
+        # Duplicate example and create a new one to prepare in-place changes
+        example = example.copy()
+        template_type: Optional[str] = getattr(self, 'template_type', 
None) + tools: Union[List[Any], str] = example.get('tools') or [] + + # Template needs to be initialized if not self._is_init: raise ValueError( 'Template is not initialized, please use the `get_template` function to obtain the template.') - if example.get('images') and not isinstance(example['images'], (tuple, list)): - # change images field to list - example['images'] = [example['images']] - example = example.copy() + + # Check whether this template supports multi-round + history: History = example.get('history') or [] + if len(history) > 0: + assert self.support_multi_round, ( + f'The template does not support multi-round chat, template_type: {template_type}') + + # Format media_keys to list + for media_key in MediaTag.media_keys.values(): + if example.get(media_key) and not isinstance(example[media_key], (tuple, list)): + # change images field to list + example[media_key] = [example[media_key]] + + # Parse format images and merged into images key + example['query'], example['history'], images_path = replace_img_tag(example['query'], history, '') + if images_path: + images = example.get('images', []) + images.extend(images_path) + example['images'] = images + + # Add default tags to examples to note where to put the medias into the sequence self.add_default_tags(example) + + # Check the example that whether matching the very template's rules self.check_example(example) + + # Format objects(groundings/refs) to json if example.get('objects') and isinstance(example['objects'], str): # reload grounding from str example['objects'] = json.loads(example['objects']) - query: str = example.get('query') or '' - query_role: str = example.get('query_role') or 'user' - response: Optional[str] = example.get('response') - history: History = example.get('history') or [] - history_roles: Optional[History] = example.get('history_roles') - system: Optional[str] = example.get('system', None) - template_type: Optional[str] = getattr(self, 'template_type', None) - tools: Union[List[Any], str] = example.get('tools') or [] - is_multi_modal: bool = any([example.get(key) for key in Template.special_keys]) - if len(history) > 0: - assert self.support_multi_round, ( - f'The template does not support multi-round chat, template_type: {template_type}') + # Reset system (by default value and agent tools) + system: Optional[str] = example.get('system', None) if system is None: if self.use_default_system: system = self.default_system @@ -359,10 +377,36 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any if system is None: system = '' system += get_tools_prompt(tools, self.tools_prompt) + + example['system'] = system + + # Set history_roles + history_roles: Optional[History] = example.get('history_roles') if history_roles is None: - history_roles = [['user', 'assistant'] for _ in range(len(history))] + example['history_roles'] = [['user', 'assistant'] for _ in range(len(history))] + + # Load image into PIL format + from .vision_utils import load_image, load_video + if example.get('images'): + example['images'] = [load_image(img) for img in example['images']] + # Normalize grounding bboxes + self.normalize_bbox(example.get('objects'), example.get('images'), to_type=self.grounding_type) + + def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + example = self.preprocess(example) + return self._encode(example) - inputs, tokenizer_kwargs = self._encode( + def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """return: inputs, 
tokenizer_kwargs""" + query: str = example.get('query') or '' + query_role: str = example.get('query_role') or 'user' + response: Optional[str] = example.get('response') + history: History = example.get('history') or [] + history_roles: Optional[History] = example.get('history_roles') + system: Optional[str] = example.get('system', None) + is_multi_modal: bool = any([example.get(key) for key in Template.special_keys]) + + inputs, tokenizer_kwargs = self._concat_and_tokenize( query, query_role, response, @@ -509,7 +553,7 @@ def normalize_bbox(cls, objects, images, to_type: Literal['real', 'norm_1000', ' if to_type == 'norm_1000': continue if to_type == 'norm_1': - object['bbox'] = [coord/999. for coord in bbox] + object['bbox'] = [coord / 999. for coord in bbox] elif to_type == 'real': width, height = image.width, image.height object['bbox'] = [ @@ -581,16 +625,16 @@ def _encode_context_list(self, context_list: List[Context], loss_scale.extend([loss_weight] * len(token_list)) return input_ids, labels, loss_scale, tokenizer_kwargs - def _encode(self, - query: str, - query_role: str, - response: Optional[str], - history: History, - history_roles: History, - system: Optional[str], - truncation_strategy: str, - auto_add_bos: bool = False, - **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def _concat_and_tokenize(self, + query: str, + query_role: str, + response: Optional[str], + history: History, + history_roles: History, + system: Optional[str], + truncation_strategy: str, + auto_add_bos: bool = False, + **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]: """ return: inputs, tokenizer_kwargs """ @@ -710,7 +754,7 @@ def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = labels[0] = F.pad(labels[0], (0, padding_len) if padding_right else (padding_len, 0), 'constant', -100) if loss_scale: loss_scale[0] = F.pad(loss_scale[0], (0, padding_to - labels[0].shape[-1]) if padding_right else - (padding_to - labels[0].shape[-1], 0), 'constant', 0.) + (padding_to - labels[0].shape[-1], 0), 'constant', 0.) 
if input_ids is None: inputs_embeds = self.pad_sequence(inputs_embeds, 0, self.padding_side) @@ -797,15 +841,15 @@ def _get_safe_print_idx(cls, response: str, print_idx: int, is_finished: bool = return print_idx def generate_ids_to_response( - self, - generate_ids: List[int], - is_finished: bool = True, - *, - tokenizer_kwargs: Optional[Dict[str, Any]] = None, - # only stream=True - return_delta: bool = False, - print_idx: Optional[List[int]] = None, - first_num_space: Optional[List[int]] = None, + self, + generate_ids: List[int], + is_finished: bool = True, + *, + tokenizer_kwargs: Optional[Dict[str, Any]] = None, + # only stream=True + return_delta: bool = False, + print_idx: Optional[List[int]] = None, + first_num_space: Optional[List[int]] = None, ): if tokenizer_kwargs is None: tokenizer_kwargs = {} @@ -1048,7 +1092,7 @@ def replace_tag(self, media_type, index, example) -> List[Context]: assert media_type == 'image' return [[-200], '\n'] - def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: inputs, _ = super().encode(example) if len(inputs) == 0: return inputs, {} @@ -1107,7 +1151,7 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, exa assert media_type == 'image' return [[-100]] - def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: from .utils import history_to_messages inputs, _ = super().encode(example) @@ -1291,7 +1335,7 @@ def __init__(self): system_prefix = ['[UNUSED_TOKEN_146]system\n{{SYSTEM}}[UNUSED_TOKEN_145]\n'] super().__init__(prefix, prompt, chat_sep, suffix, self.INTERNLM_XCOMPOSER_SYSTEM, system_prefix) - def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: example = example.copy() history = example.pop('history', None) if history is None: @@ -1427,7 +1471,7 @@ def replace_tag(self, media_type, index, example) -> List[Context]: assert media_type == 'image' return [[-100]] - def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: inputs, _ = super().encode(example) if len(inputs) == 0: return inputs, {} @@ -1472,7 +1516,6 @@ def get_generate_ids(generate_ids: Tensor, input_token_len: int) -> List[int]: class Internvl2Template(InternvlTemplate): - video_segments = 8 def __init__(self): @@ -1509,28 +1552,9 @@ def replace_box(self, index: int, example: Dict[str, Any]) -> List[Context]: else: return [''] - def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: example = example.copy() - history = example.pop('history', None) - if history is None: - history = [] - example['query'], example['history'], images_path = replace_img_tag(example['query'], history, '') - if images_path: - images = example.get('images') or [] - images.extend(images_path) - example['images'] = images - images_path = example.get('images') or [] - videos_path = example.get('videos') or [] - if not isinstance(images_path, (list, tuple)): - images_path = [images_path] - if not isinstance(images_path, (list, tuple)): - videos_path = [videos_path] - from .vision_utils import load_image, load_video - 
pixel_values_images = _read_batch(images_path, load_image) - videos_path = [path for path in videos_path if path is not None] - if example.get('objects'): - example['objects'] = json.loads(example.get('objects')) - self.normalize_bbox(example.get('objects'), pixel_values_images, to_type='norm_1000') + history = example.pop('history', []) inputs, _ = super(InternvlTemplate, self).encode(example) if len(inputs) == 0: return inputs, {} @@ -1689,7 +1713,7 @@ def _construct_prompts(self, text): prompts.append(_text) return prompts - def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: example = example.copy() # read image processor = self.tokenizer.processor @@ -1810,7 +1834,7 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, exa else: return ['\n'] - def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: inputs, _ = super().encode(example) if len(inputs) == 0: return inputs, {} @@ -1837,7 +1861,7 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, exa else: return [''], system_prefix=['<>\n{{system}}\n<>\n\n']) - def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: inputs, _ = super().encode(example) if 'pixel_values' in inputs: inputs['pixel_values'] = inputs['pixel_values'].squeeze(0) @@ -1961,7 +1985,7 @@ def __init__(self): self.system, system_prefix=['{{SYSTEM}} ']) - def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: inputs, _ = super().encode(example) if 'pixel_values' in inputs: inputs['pixel_values'] = inputs['pixel_values'].squeeze(0) @@ -1982,7 +2006,7 @@ def __init__(self): ['<|im_end|>'], system_prefix=['<|im_start|>system\n{{SYSTEM}}<|im_end|>']) - def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: inputs, _ = super().encode(example) if 'pixel_values' in inputs: inputs['pixel_values'] = inputs['pixel_values'].squeeze(0) @@ -2003,7 +2027,7 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, exa def __init__(self): Template.__init__(self, [], [self.llavallama_query_template], ['<|eot_id|>'], ['<|eot_id|>']) - def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: inputs, _ = super().encode(example) if len(inputs) == 0: return inputs, {} @@ -2036,7 +2060,7 @@ def replace_tag(self, media_type, index, example) -> List[Context]: assert media_type == 'image' return ['' * self.tokenizer.processor.image_seq_length] - def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: inputs, _ = super().encode(example) if len(inputs) == 0: return inputs, {} @@ -2075,7 +2099,7 @@ def __init__(self): def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]: return ['<|image|>'] - def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + 
def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: example = example.copy() history = example.pop('history', None) if history is None: @@ -2182,7 +2206,7 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, exa assert media_type == 'image' return [''] - def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: example = example.copy() history = example.pop('history', None) if history is None: @@ -2264,7 +2288,7 @@ def check_example(self, example): def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]: return [] - def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: inputs, _ = super().encode(example) if len(inputs) == 0: return inputs, {} @@ -2361,7 +2385,7 @@ def check_example(self, example): videos = example.get('videos') or [] assert len(videos) <= 1 - def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: inputs, _ = super(CogTemplate, self).encode(example) if len(inputs) == 0: return inputs, {} @@ -2422,7 +2446,7 @@ def check_example(self, example): images = example.get('images') or [] assert len(images) == 1 - def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: inputs, _ = super().encode(example) if len(inputs) == 0: return inputs, {} @@ -2563,7 +2587,7 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, exa assert media_type == 'image' return [[-200]] - def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: from mplug_owl2.mm_utils import process_images processor = self.tokenizer.processor images_path = example.get('images') or [] @@ -2617,13 +2641,13 @@ def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = def get_template( - template_type: str, - tokenizer: PreTrainedTokenizerBase, - default_system: Optional[str] = None, - max_length: Optional[int] = None, - truncation_strategy: Literal['delete', 'truncation_left'] = 'delete', - model=None, - **kwargs, + template_type: str, + tokenizer: PreTrainedTokenizerBase, + default_system: Optional[str] = None, + max_length: Optional[int] = None, + truncation_strategy: Literal['delete', 'truncation_left'] = 'delete', + model=None, + **kwargs, ) -> Template: template_info = TEMPLATE_MAPPING[template_type] template = deepcopy(template_info['template']) diff --git a/swift/llm/utils/vision_utils.py b/swift/llm/utils/vision_utils.py index 1d0b8e876..f3e8707fe 100644 --- a/swift/llm/utils/vision_utils.py +++ b/swift/llm/utils/vision_utils.py @@ -75,7 +75,7 @@ def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnai return processed_images -def load_image(img_path, input_size=448, max_num=6): +def load_image(img_path): if isinstance(img_path, str): img_path = img_path.strip() if img_path.startswith('http'): @@ -93,6 +93,10 @@ def load_image(img_path, input_size=448, max_num=6): image = img_path if image.mode != 'RGB': image = image.convert('RGB') + return image + + +def transform_image(image, 
input_size=448, max_num=6):
     transform = build_transform(input_size=input_size)
     images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
     pixel_values = [transform(image) for image in images]

From 61c2bd9979064b67b7d669cc9b88f50cd97e8f91 Mon Sep 17 00:00:00 2001
From: "yuze.zyz"
Date: Tue, 23 Jul 2024 14:27:08 +0800
Subject: [PATCH 05/19] wip

---
 swift/llm/utils/template.py     | 130 +++++++++----------------------
 swift/llm/utils/vision_utils.py |  20 ++++-
 2 files changed, 55 insertions(+), 95 deletions(-)

diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
index 5e94c8144..edfac7ee6 100644
--- a/swift/llm/utils/template.py
+++ b/swift/llm/utils/template.py
@@ -166,6 +166,7 @@ class Template:
     special_tokens = ['<image>', '<video>', '<audio>', '<bbox>', '<ref-object>']
     special_keys = ['images', 'videos', 'audios', 'objects']
     grounding_type = 'norm_1000'
+    image_placeholder = '<image>'
 
     def __init__(self,
                  prefix: Prompt,
@@ -344,7 +345,7 @@ def preprocess(self, example):
                 example[media_key] = [example[media_key]]
 
         # Parse format images and merged into images key
-        example['query'], example['history'], images_path = replace_img_tag(example['query'], history, '<image>')
+        example['query'], example['history'], images_path = replace_img_tag(example['query'], history, self.image_placeholder)
         if images_path:
             images = example.get('images', [])
             images.extend(images_path)
             example['images'] = images
@@ -1049,31 +1050,6 @@ class QwenAudioGenerationTemplate(_QwenAudioTemplateMixin, DefaultGenerationTemp
                               '仔细阅读所有的图像，并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。')
 
 
-def _load_image(img_path: Union[str, 'PIL.Image.Image']) -> 'PIL.Image.Image':
-    from PIL import Image, UnidentifiedImageError
-    import os
-    import base64
-    import binascii
-    if isinstance(img_path, str):
-        img_path = img_path.strip()
-        if img_path.startswith('http'):
-            content = requests.get(img_path).content
-            image = Image.open(BytesIO(content))
-        elif os.path.exists(img_path):
-            image = Image.open(img_path)
-        else:  # base64_str
-            try:
-                image_data = base64.b64decode(img_path)
-                image = Image.open(BytesIO(image_data))
-            except (binascii.Error, UnidentifiedImageError) as error:
-                raise ValueError(f'invalid image: {error}')
-    else:
-        image = img_path
-    if image.mode != 'RGB':
-        image = image.convert('RGB')
-    return image
-
-
 def _load_video_llava(video_path: str) -> np.ndarray:
     import av
     container = av.open(video_path)
@@ -1094,16 +1070,6 @@ def _load_video_llava(video_path: str) -> np.ndarray:
 _T = TypeVar('_T')
 
 
 class YiVLTemplate(Template):
 
     def replace_tag(self, media_type, index, example) -> List[Context]:
@@ -1120,8 +1086,7 @@ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, An
         if not hasattr(model, 'vision_tower'):
             model = model.model
         image_processor = model.vision_tower.image_processor
-        images_path = example.get('images') or []
-        images = _read_batch(images_path)
+        images = example.get('images', [])
         for i, image in enumerate(images):
             background_color = tuple(int(x * 255) for x in image_processor.image_mean)
             image = expand2square(image, background_color)
@@ -1180,8 +1145,7 @@ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, An
         idx_list = _findall(input_ids, -100)
         if idx_list:
             idx = idx_list[0]
-            images_path = example.get('images') or []
-            image = _read_batch(images_path)[0]
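+            # example['images'] already holds PIL.Image objects loaded by preprocess()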
+            image = example.get('images', [])
             placeholder = '<|begin_of_image|><|endoftext|><|end_of_image|>'
             placeholder_id = self.tokenizer.encode(placeholder, add_special_tokens=False)
             input_ids = (input_ids[:idx] + placeholder_id + input_ids[idx + 1:])
@@ -1344,6 +1308,7 @@ class InternLMXComposer2Template(Template):
         '- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language chosen '
         'by the user such as English and 中文.')
     is_v2_5 = False
+    image_placeholder = '<ImageHere>'
 
     def __init__(self):
         prefix = ['<s>']
@@ -1354,17 +1319,11 @@ def __init__(self):
         super().__init__(prefix, prompt, chat_sep, suffix, self.INTERNLM_XCOMPOSER_SYSTEM, system_prefix)
 
     def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-        example = example.copy()
-        history = example.pop('history', None)
-        if history is None:
-            history = []
-        example['query'], example['history'], images_path = replace_img_tag(example['query'], history, '<ImageHere>')
         inputs, _ = super().encode(example)
         if len(inputs) == 0:
             return inputs, {}
         dtype = self.model.dtype
-        images_path.extend(example.get('images') or [])
-        images = _read_batch(images_path)
+        images = example.get('images', [])
         if self.is_v2_5:
             hd_num = 24
             Image_transform = get_class_from_dynamic_module('ixc_utils.Image_transform', self.tokenizer.model_dir)
@@ -1496,11 +1455,8 @@ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, An
         input_ids = inputs['input_ids']
         idx_list = _findall(input_ids, -100)
         labels = inputs.get('labels')
-        images_path = example.get('images') or []
-        if not isinstance(images_path, (list, tuple)):
-            images_path = [images_path]
-        from .vision_utils import load_image
-        pixel_values = _read_batch(images_path, load_image)
+        from .vision_utils import transform_image
+        pixel_values = [transform_image(image) for image in example.get('images', [])]
         if pixel_values:
             pixel_values = torch.cat(pixel_values, dim=0)
             image_bs = pixel_values.shape[0]
@@ -1571,14 +1527,15 @@ def replace_box(self, index: int, example: Dict[str, Any]) -> List[Context]:
         return ['']
 
     def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-        example = example.copy()
-        history = example.pop('history', [])
         inputs, _ = super(InternvlTemplate, self).encode(example)
         if len(inputs) == 0:
             return inputs, {}
         input_ids = inputs['input_ids']
         idx_list = _findall(input_ids, -100)
         labels = inputs.get('labels')
+        from swift.llm.utils.vision_utils import transform_image
+        pixel_values_images = [transform_image(image) for image in example.get('images', [])]
+        videos_path = example.get('videos_path', [])
         if pixel_values_images:
             pixel_values = pixel_values_images
             assert len(pixel_values) == len(idx_list)
@@ -1600,6 +1557,7 @@ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, An
             inputs['image_flags'] = torch.ones(patches)
         elif videos_path:
             assert len(videos_path) == 1
+            from swift.llm.utils.vision_utils import load_video
             pixel_values, num_patches = load_video(videos_path[0], num_segments=self.video_segments)
             assert len(num_patches) == len(idx_list)
             added_tokens_len = 0
@@ -1732,15 +1690,12 @@ def _construct_prompts(self, text):
         return prompts
 
     def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-        example = example.copy()
-        # read image
         processor = self.tokenizer.processor
-        images_path = example.get('images') or []
-        assert len(images_path) == 1, 'Florence series models only supports input with a single image.'
- - images = _load_image(images_path[0]) + images = example.get('images', []) + assert len(images) == 1, 'Florence series models only supports input with a single image.' + from .vision_utils import transform_image + images = transform_image(images[0]) example['_image'] = images - self.normalize_bbox(example.get('objects'), images, to_type='norm_1000') # process bbox if example.get('objects') is not None: @@ -1856,8 +1811,7 @@ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, An inputs, _ = super().encode(example) if len(inputs) == 0: return inputs, {} - images_path = example.get('images') or [] - images = _read_batch(images_path) + images = example.get('images', []) image_processor = self.tokenizer.processor.image_processor if self._is_vllm: images = self._prepare_vllm_images(images) @@ -1883,7 +1837,7 @@ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, An inputs, _ = super().encode(example) if len(inputs) == 0: return inputs, {} - media_files = example.get('videos') or [] + media_files = example.get('videos', []) images_path, videos_path = [], [] for media_file in media_files: if media_file is None: @@ -1893,11 +1847,13 @@ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, An else: videos_path.append(media_file) if len(videos_path) > 0: + from swift.llm.utils.vision_utils import _read_batch videos = _read_batch(videos_path, _load_video_llava) video_processor = self.tokenizer.processor.video_processor video_inputs = video_processor(videos, return_tensors='pt').to(self.model.dtype) inputs['pixel_values_videos'] = video_inputs['pixel_values_videos'] if len(images_path) > 0: + from swift.llm.utils.vision_utils import _read_batch images = _read_batch(images_path) image_processor = self.tokenizer.processor.image_processor image_inputs = image_processor(images, return_tensors='pt').to(self.model.dtype) @@ -1949,8 +1905,7 @@ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, An inputs, _ = super().encode(example) if len(inputs) == 0: return inputs, {} - images_path = example.get('images') or [] - images = _read_batch(images_path) + images = example.get('images', []) image_sizes = [x.size for x in images] from llava.mm_utils import process_images model = self.model.model @@ -2049,8 +2004,7 @@ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, An inputs, _ = super().encode(example) if len(inputs) == 0: return inputs, {} - image_path = example.get('images') or [] - raw_image = _read_batch(image_path) + raw_image = example.get('images', []) if raw_image: pixel_values = self.tokenizer.processor.image_processor(raw_image[0], return_tensors='pt')['pixel_values'] inputs['pixel_values'] = pixel_values.to(self.model.dtype) @@ -2082,7 +2036,7 @@ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, An inputs, _ = super().encode(example) if len(inputs) == 0: return inputs, {} - image_path = example.get('images') or [] + raw_image = example.get('images', []) processor = self.tokenizer.processor if inputs['labels'] is not None: n = upper_bound(0, len(inputs['labels']), lambda idx: inputs['labels'][idx] == -100) @@ -2090,7 +2044,6 @@ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, An inputs['token_type_ids'] = [0] * n + [1] * n2 else: inputs['token_type_ids'] = [0] * len(inputs['input_ids']) - raw_image = _read_batch(image_path) if raw_image: model_inputs = processor(text=example['query'], images=raw_image[0], 
return_tensors='pt')
             inputs['pixel_values'] = model_inputs['pixel_values']
@@ -2110,6 +2063,8 @@ def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] =
 
 class Phi3VisionTemplate(Template):
+    image_placeholder = '<|image|>'
+
     def __init__(self):
         Template.__init__(self, ['<s>'], ['<|user|>\n{{QUERY}}<|end|>\n<|assistant|>\n'], ['<|end|>\n'], ['<|end|>'], None, ['<|system|>\n{{SYSTEM}}<|end|>\n'])
 
     def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]:
         return ['<|image|>']
 
     def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-        example = example.copy()
-        history = example.pop('history', None)
-        if history is None:
-            history = []
-        example['query'], example['history'], images_path = replace_img_tag(example['query'], history, '<|image|>')
-        images_path.extend(example.get('images') or [])
-        images = _read_batch(images_path)
+        images = example.get('images', [])
         inputs, _ = super().encode(example)
         if len(inputs) == 0:
             return inputs, {}
@@ -2216,6 +2165,8 @@ class DeepseekVLTemplate(Template):
     'You are able to understand the visual content that the user provides, '
     'and assist the user with a variety of tasks using natural language.')
 
+    image_placeholder = '<image_placeholder>'
+
     def __init__(self):
         super().__init__(['<|begin▁of▁sentence|>{{SYSTEM}}\n\n'], ['User: {{QUERY}}\n\nAssistant:'],
                          ['<|end▁of▁sentence|>'], ['<|end▁of▁sentence|>'], self.DEEPSEEK_VL_SYSTEM)
 
     def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]:
         return ['<image_placeholder>']
 
     def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
-        example = example.copy()
-        history = example.pop('history', None)
-        if history is None:
-            history = []
-
-        example['query'], example['history'], images_path = replace_img_tag(example['query'], history,
-                                                                            '<image_placeholder>')
         inputs, _ = super().encode(example)
         if len(inputs) == 0:
             return inputs, {}
-        images_path.extend(example.get('images') or [])
-        images = _read_batch(images_path)
+        images = example.get('images')
         processor = self.tokenizer.processor
         input_ids, labels = inputs['input_ids'], inputs['labels']
         idx_list = _findall(input_ids, processor.image_id)
@@ -2310,8 +2253,7 @@ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, An
         inputs, _ = super().encode(example)
         if len(inputs) == 0:
             return inputs, {}
-        images_path = example.get('images') or []
-        image = _read_batch(images_path)
+        image = example.get('images', [])
         inputs.pop('loss_scale', None)
         model = self.model
         inputs2 = model.build_conversation_input_ids(
@@ -2407,7 +2349,8 @@ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, An
         inputs, _ = super(CogTemplate, self).encode(example)
         if len(inputs) == 0:
             return inputs, {}
-        videos_path = example.get('videos') or []
+        videos_path = example.get('videos', [])
+        from swift.llm.utils.vision_utils import _read_batch
         video = _read_batch(videos_path, _load_video_cogvlm2)
         inputs.pop('loss_scale', None)
         model = self.model
@@ -2461,15 +2404,15 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, exa
         return [[-1]]
 
     def check_example(self, example):
-        images = example.get('images') or []
+        images = example.get('images', [])
         assert len(images) == 1
 
     def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
         inputs, _ = super().encode(example)
         if len(inputs) == 0:
             return inputs, {}
-        images_path = example['images']
-        image = _load_image(images_path[0])
+        images = example['images']
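+        # check_example() guarantees exactly one image, already loaded as a PIL.Image by preprocess()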
+ image = images[0] input_ids = inputs['input_ids'] labels = inputs['labels'] idx_list = _findall(input_ids, -1) @@ -2608,8 +2551,7 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, exa def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: from mplug_owl2.mm_utils import process_images processor = self.tokenizer.processor - images_path = example.get('images') or [] - images = _read_batch(images_path) + images = example.get('images', []) for i, image in enumerate(images): # ref: https://modelscope.cn/models/iic/mPLUG-Owl2.1 max_edge = max(image.size) diff --git a/swift/llm/utils/vision_utils.py b/swift/llm/utils/vision_utils.py index f3e8707fe..b91375708 100644 --- a/swift/llm/utils/vision_utils.py +++ b/swift/llm/utils/vision_utils.py @@ -2,6 +2,7 @@ import binascii import os from io import BytesIO +from typing import Union, List, Callable, TypeVar import numpy as np import requests @@ -75,7 +76,11 @@ def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnai return processed_images -def load_image(img_path): +def _load_image(img_path: Union[str, 'PIL.Image.Image']) -> 'PIL.Image.Image': + from PIL import Image, UnidentifiedImageError + import os + import base64 + import binascii if isinstance(img_path, str): img_path = img_path.strip() if img_path.startswith('http'): @@ -96,6 +101,19 @@ def load_image(img_path): return image +_T = TypeVar('_T') + + +def _read_batch(path_list: List[Union[str, 'PIL.Image.Image', None]], + load_func: Callable[[str], _T] = _load_image) -> List[_T]: + res = [] + for path in path_list: + if path is None: # ignore None + continue + res.append(load_func(path)) + return res + + def transform_image(image, input_size=448, max_num=6): transform = build_transform(input_size=input_size) images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num) From 686b355e17569b3840a26b31f2614b1213124513 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 23 Jul 2024 14:33:48 +0800 Subject: [PATCH 06/19] fix --- swift/llm/utils/template.py | 2 +- swift/llm/utils/vision_utils.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index edfac7ee6..667063f85 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -387,7 +387,7 @@ def preprocess(self, example): example['history_roles'] = [['user', 'assistant'] for _ in range(len(history))] # Load image into PIL format - from .vision_utils import load_image, load_video + from .vision_utils import load_image if example.get('images'): example['images'] = [load_image(img) for img in example['images']] # Normalize grounding bboxes diff --git a/swift/llm/utils/vision_utils.py b/swift/llm/utils/vision_utils.py index b91375708..531d66963 100644 --- a/swift/llm/utils/vision_utils.py +++ b/swift/llm/utils/vision_utils.py @@ -76,7 +76,7 @@ def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnai return processed_images -def _load_image(img_path: Union[str, 'PIL.Image.Image']) -> 'PIL.Image.Image': +def load_image(img_path: Union[str, 'PIL.Image.Image']) -> 'PIL.Image.Image': from PIL import Image, UnidentifiedImageError import os import base64 @@ -105,7 +105,7 @@ def _load_image(img_path: Union[str, 'PIL.Image.Image']) -> 'PIL.Image.Image': def _read_batch(path_list: List[Union[str, 'PIL.Image.Image', None]], - load_func: Callable[[str], _T] = _load_image) -> List[_T]: + load_func: Callable[[str], _T] = 
load_image) -> List[_T]: res = [] for path in path_list: if path is None: # ignore None From 7b51f718750a588c18d6f7dfa3d1fbb43f0fbde4 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Tue, 23 Jul 2024 15:12:12 +0800 Subject: [PATCH 07/19] fix --- swift/llm/utils/template.py | 57 +++++++++++++++++++------------------ 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index 667063f85..e056c648c 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -14,7 +14,6 @@ from transformers import PreTrainedTokenizerBase, StoppingCriteria from transformers.dynamic_module_utils import get_class_from_dynamic_module -from swift.llm import MediaTag from swift.llm.agent.utils import calculate_loss_scale, get_tools_prompt from swift.torchacc_utils import pad_and_split_batch from swift.utils import get_dist_setting, upper_bound, use_torchacc @@ -338,6 +337,7 @@ def preprocess(self, example): assert self.support_multi_round, ( f'The template does not support multi-round chat, template_type: {template_type}') + from swift.llm import MediaTag # Format media_keys to list for media_key in MediaTag.media_keys.values(): if example.get(media_key) and not isinstance(example[media_key], (tuple, list)): @@ -392,6 +392,7 @@ def preprocess(self, example): example['images'] = [load_image(img) for img in example['images']] # Normalize grounding bboxes self.normalize_bbox(example.get('objects'), example.get('images'), to_type=self.grounding_type) + return example def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: example = self.preprocess(example) @@ -525,7 +526,7 @@ def replace_object(self, index: int, example: Dict[str, Any]) -> List[Context]: objects = example.get('objects') if objects: object_ = objects[index] - return [object_[0]] + return [object_['caption']] else: return [''] @@ -533,7 +534,7 @@ def replace_box(self, index: int, example: Dict[str, Any]) -> List[Context]: objects = example.get('objects') if objects: object_ = objects[index] - return [f'({object_[1][0]},{object_[1][1]}),({object_[1][2]},{object_[1][3]})'] + return [f'({object_["bbox"][0]},{object_["bbox"][1]}),({object_["bbox"][2]},{object_["bbox"][3]})'] else: return [''] @@ -971,12 +972,12 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int def replace_object(self, index: int, example: Dict[str, Any]) -> List[Context]: objects = example['objects'] object_ = objects[index] - return [f'{object_[0]}'] + return [f'{object_["caption"]}'] def replace_box(self, index: int, example: Dict[str, Any]) -> List[Context]: objects = example['objects'] object_ = objects[index] - return [f'({object_[1][0]},{object_[1][1]}),({object_[1][2]},{object_[1][3]})'] + return [f'({object_["bbox"][0]},{object_["bbox"][1]}),({object_["bbox"][2]},{object_["bbox"][3]})'] register_template(TemplateType.qwen, QwenTemplate()) @@ -991,8 +992,8 @@ def replace_box(self, index: int, example: Dict[str, Any]) -> List[Context]: class _QwenAudioTemplateMixin: - def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: - inputs, tokenizer_kwargs = super().encode(example) + def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: + inputs, tokenizer_kwargs = super()._encode(example) if len(inputs) == 0: return inputs, tokenizer_kwargs inputs.pop('loss_scale', None) @@ -1077,7 +1078,7 @@ def replace_tag(self, media_type, index, example) -> List[Context]: return [[-200], '\n'] def 
_encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: - inputs, _ = super().encode(example) + inputs, _ = super()._encode(example) if len(inputs) == 0: return inputs, {} inputs.pop('loss_scale', None) @@ -1137,7 +1138,7 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, exa def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: from .utils import history_to_messages - inputs, _ = super().encode(example) + inputs, _ = super()._encode(example) if len(inputs) == 0: return inputs, {} input_ids = inputs['input_ids'] @@ -1319,7 +1320,7 @@ def __init__(self): super().__init__(prefix, prompt, chat_sep, suffix, self.INTERNLM_XCOMPOSER_SYSTEM, system_prefix) def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: - inputs, _ = super().encode(example) + inputs, _ = super()._encode(example) if len(inputs) == 0: return inputs, {} dtype = self.model.dtype @@ -1449,7 +1450,7 @@ def replace_tag(self, media_type, index, example) -> List[Context]: return [[-100]] def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: - inputs, _ = super().encode(example) + inputs, _ = super()._encode(example) if len(inputs) == 0: return inputs, {} input_ids = inputs['input_ids'] @@ -1514,7 +1515,7 @@ def replace_object(self, index: int, example: Dict[str, Any]) -> List[Context]: objects = example.get('objects') if objects: object_ = objects[index] - return [f'{object_}'] + return [f'{object_["caption"]}'] else: return [''] @@ -1522,12 +1523,12 @@ def replace_box(self, index: int, example: Dict[str, Any]) -> List[Context]: objects = example.get('objects') if objects: object_ = objects[index] - return [f' [[{object_[1][0]}, {object_[1][1]}, {object_[1][2]}, {object_[1][3]}]] '] + return [f' [[{object_["bbox"][0]}, {object_["bbox"][1]}, {object_["bbox"][2]}, {object_["bbox"][3]}]] '] else: return [''] def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: - inputs, _ = super(InternvlTemplate, self).encode(example) + inputs, _ = super(InternvlTemplate, self)._encode(example) if len(inputs) == 0: return inputs, {} input_ids = inputs['input_ids'] @@ -1808,7 +1809,7 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, exa return ['\n'] def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]: - inputs, _ = super().encode(example) + inputs, _ = super()._encode(example) if len(inputs) == 0: return inputs, {} images = example.get('images', []) @@ -1834,7 +1835,7 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, exa return ['