Support internvl2 grounding #1473

Merged · 23 commits · Jul 24, 2024
Changes from 1 commit
fix
tastelikefeet committed Jul 23, 2024
commit 231793f688e42aede26a7586aa9756b0c437b9ea
57 changes: 27 additions & 30 deletions swift/llm/utils/template.py
@@ -345,7 +345,8 @@ def preprocess(self, example):
example[media_key] = [example[media_key]]

# Parse <img></img> format images and merge them into the images key
example['query'], example['history'], images_path = replace_img_tag(example['query'], history, self.image_placeholder)
example['query'], example['history'], images_path = replace_img_tag(example['query'], history,
self.image_placeholder)
if images_path:
images = example.get('images', [])
images.extend(images_path)
@@ -565,8 +566,8 @@ def normalize_bbox(cls, objects, images, to_type: Literal['real', 'norm_1000', '
continue
width, height = image.width, image.height
object['bbox'] = [
int(coord / dim * 999) if to_type == 'norm_1000' else coord / dim for coord, dim in
zip(bbox, [width, height, width, height])
int(coord / dim * 999) if to_type == 'norm_1000' else coord / dim
for coord, dim in zip(bbox, [width, height, width, height])
]
elif bbox_type == 'norm_1000':
if to_type == 'norm_1000':
@@ -576,8 +577,7 @@ def normalize_bbox(cls, objects, images, to_type: Literal['real', 'norm_1000', '
elif to_type == 'real':
width, height = image.width, image.height
object['bbox'] = [
int(coord / 999. * dim) for coord, dim in
zip(bbox, [width, height, width, height])
int(coord / 999. * dim) for coord, dim in zip(bbox, [width, height, width, height])
]
elif bbox_type == 'norm_1':
if to_type == 'norm_1':
@@ -586,10 +586,7 @@ def normalize_bbox(cls, objects, images, to_type: Literal['real', 'norm_1000', '
object['bbox'] = [int(coord * 999) for coord in bbox]
elif to_type == 'real':
width, height = image.width, image.height
object['bbox'] = [
int(coord * dim) for coord, dim in
zip(bbox, [width, height, width, height])
]
object['bbox'] = [int(coord * dim) for coord, dim in zip(bbox, [width, height, width, height])]

def pre_tokenize(self, context_list: List[Context], loss_scale_list: List[float],
**kwargs) -> Tuple[List[Context], List[float]]:
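
Note on the normalize_bbox hunks above: every branch reduces to the same per-axis scaling, pairing each of [x1, y1, x2, y2] with [width, height, width, height]. A minimal standalone sketch of that arithmetic, with plain functions and made-up image/box values (not the Template method itself):

```python
from typing import List

# Standalone sketch of the conversions used above; image size and boxes are made up.
def real_to_norm_1000(bbox: List[int], width: int, height: int) -> List[int]:
    # Absolute pixel coords -> integers in the 0..999 range, scaled per axis.
    return [int(coord / dim * 999) for coord, dim in zip(bbox, [width, height, width, height])]

def norm_1000_to_real(bbox: List[int], width: int, height: int) -> List[int]:
    # 0..999 integers -> absolute pixel coords.
    return [int(coord / 999. * dim) for coord, dim in zip(bbox, [width, height, width, height])]

def norm_1_to_real(bbox: List[float], width: int, height: int) -> List[int]:
    # Fractional 0..1 coords -> absolute pixel coords.
    return [int(coord * dim) for coord, dim in zip(bbox, [width, height, width, height])]

width, height = 1280, 960
print(real_to_norm_1000([100, 200, 640, 480], width, height))  # [78, 208, 499, 499]
print(norm_1000_to_real([78, 208, 499, 499], width, height))   # [99, 199, 639, 479]
print(norm_1_to_real([0.1, 0.2, 0.5, 0.5], width, height))     # [128, 192, 640, 480]
```
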
@@ -786,7 +783,7 @@ def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] =
labels[0] = F.pad(labels[0], (0, padding_len) if padding_right else (padding_len, 0), 'constant', -100)
if loss_scale:
loss_scale[0] = F.pad(loss_scale[0], (0, padding_to - labels[0].shape[-1]) if padding_right else
(padding_to - labels[0].shape[-1], 0), 'constant', 0.)
(padding_to - labels[0].shape[-1], 0), 'constant', 0.)

if input_ids is None:
inputs_embeds = self.pad_sequence(inputs_embeds, 0, self.padding_side)
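
On the padding lines touched above: F.pad with a (left, right) tuple pads the 1-D label tensor to padding_to, using -100 so the padded positions are ignored by the loss. A small standalone illustration of the same pattern, with made-up lengths, outside the collator:

```python
import torch
import torch.nn.functional as F

# Made-up example of the same F.pad pattern: pad labels to a fixed length and
# mask the padding with -100 so the loss ignores those positions.
labels = torch.tensor([101, 7, 42, 9], dtype=torch.long)
padding_to = 8
padding_right = True

padding_len = padding_to - labels.shape[-1]
pad_spec = (0, padding_len) if padding_right else (padding_len, 0)
print(F.pad(labels, pad_spec, 'constant', -100))
# tensor([ 101,    7,   42,    9, -100, -100, -100, -100])
```
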
@@ -873,15 +870,15 @@ def _get_safe_print_idx(cls, response: str, print_idx: int, is_finished: bool =
return print_idx

def generate_ids_to_response(
self,
generate_ids: List[int],
is_finished: bool = True,
*,
tokenizer_kwargs: Optional[Dict[str, Any]] = None,
# only stream=True
return_delta: bool = False,
print_idx: Optional[List[int]] = None,
first_num_space: Optional[List[int]] = None,
self,
generate_ids: List[int],
is_finished: bool = True,
*,
tokenizer_kwargs: Optional[Dict[str, Any]] = None,
# only stream=True
return_delta: bool = False,
print_idx: Optional[List[int]] = None,
first_num_space: Optional[List[int]] = None,
):
if tokenizer_kwargs is None:
tokenizer_kwargs = {}
@@ -1535,7 +1532,9 @@ def replace_box(self, index: int, example: Dict[str, Any]) -> List[Context]:
objects = example.get('objects')
if objects:
object_ = objects[index]
return [f'<box> [[{object_["bbox"][0]}, {object_["bbox"][1]}, {object_["bbox"][2]}, {object_["bbox"][3]}]] </box>']
return [
f'<box> [[{object_["bbox"][0]}, {object_["bbox"][1]}, {object_["bbox"][2]}, {object_["bbox"][3]}]] </box>'
]
else:
return ['<bbox>']
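
For reference, the string built by the branch above renders a bbox as a <box> grounding tag; a small standalone illustration with a made-up objects entry (not the template class itself):

```python
# Made-up objects entry; shows the <box> string produced by the return above.
example = {'objects': [{'bbox': [150, 376, 530, 800]}]}

object_ = example['objects'][0]
print(f'<box> [[{object_["bbox"][0]}, {object_["bbox"][1]}, '
      f'{object_["bbox"][2]}, {object_["bbox"][3]}]] </box>')
# <box> [[150, 376, 530, 800]] </box>
```
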

@@ -1680,7 +1679,7 @@ def __init__(self):
}

def replace_box(self, index: int, example: Dict[str, Any]) -> List[Context]:
x1, y1, x2, y2 = example['objects'][index][1]
x1, y1, x2, y2 = example['objects'][index]['bbox']
return [f'<loc_{x1}><loc_{y1}><loc_{x2}><loc_{y2}>']

def _construct_prompts(self, text):
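
The fix above reads the coordinates from the object's 'bbox' key rather than a positional index; a minimal standalone illustration of the resulting <loc_*> string, with made-up values:

```python
# Made-up objects entry; shows the <loc_*> string produced by replace_box above.
example = {'objects': [{'bbox': [78, 208, 499, 499]}]}

x1, y1, x2, y2 = example['objects'][0]['bbox']
print(f'<loc_{x1}><loc_{y1}><loc_{x2}><loc_{y2}>')
# <loc_78><loc_208><loc_499><loc_499>
```
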
@@ -1713,7 +1712,6 @@ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, An
# process bbox
if example.get('objects') is not None:
if '<ref-object>' in example['query']:
example['objects'] = json.loads(example['objects'])
example['query'] = '<OPEN_VOCABULARY_DETECTION>'
example['response'] = ''
for idx in range(len(example['objects'])):
@@ -1722,7 +1720,6 @@ def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, An
example['query'] += example['objects'][idx][0]
example['response'] += example['objects'][idx][0] + self.replace_box(idx, example)[0]
elif '<bbox>' in example['query']:
example['objects'] = json.loads(example['objects'])
example['query'] = '<REGION_TO_DESCRIPTION>'
example['response'] = ''
for idx in range(len(example['objects'])):
@@ -2614,13 +2611,13 @@ def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] =


def get_template(
template_type: str,
tokenizer: PreTrainedTokenizerBase,
default_system: Optional[str] = None,
max_length: Optional[int] = None,
truncation_strategy: Literal['delete', 'truncation_left'] = 'delete',
model=None,
**kwargs,
template_type: str,
tokenizer: PreTrainedTokenizerBase,
default_system: Optional[str] = None,
max_length: Optional[int] = None,
truncation_strategy: Literal['delete', 'truncation_left'] = 'delete',
model=None,
**kwargs,
) -> Template:
template_info = TEMPLATE_MAPPING[template_type]
template = deepcopy(template_info['template'])
2 changes: 1 addition & 1 deletion swift/llm/utils/vision_utils.py
@@ -2,7 +2,7 @@
import binascii
import os
from io import BytesIO
from typing import Union, List, Callable, TypeVar
from typing import Callable, List, TypeVar, Union

import numpy as np
import requests