Commit: Add test

okotaku committed Feb 11, 2024
1 parent a474b1a commit c44d950
Showing 5 changed files with 97 additions and 247 deletions.
2 changes: 1 addition & 1 deletion diffengine/datasets/transforms/processing.py
@@ -964,7 +964,7 @@ def transform(self, results: dict) -> dict | tuple[list, list] | None:
results (dict): The result dict.
"""
assert not isinstance(results[self.key], list), (
"CLIPImageProcessor only support single image.")
"TransformersImageProcessor only support single image.")
# (1, 3, 224, 224) -> (3, 224, 224)
results[self.output_key] = self.pipeline(
images=results[self.key], return_tensors="pt").pixel_values[0]
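Note: the hunk above changes only the assertion message, which suggests the former CLIPImageProcessor transform has been generalized into a TransformersImageProcessor that wraps any Hugging Face image processor. The sketch below is illustrative rather than the actual diffengine implementation; the constructor arguments (pretrained, key, output_key) are assumptions inferred from the surrounding code and the new tests further down.

from transformers import AutoImageProcessor


class TransformersImageProcessorSketch:
    """Illustrative stand-in for diffengine's TransformersImageProcessor."""

    def __init__(self, pretrained: str, key: str = "img",
                 output_key: str = "clip_img") -> None:
        self.key = key
        self.output_key = output_key
        # Any image processor hosted on the Hub works, e.g. "facebook/dinov2-small".
        self.pipeline = AutoImageProcessor.from_pretrained(pretrained)

    def transform(self, results: dict) -> dict:
        # Mirrors the assertion changed in this commit: a list of images is rejected.
        assert not isinstance(results[self.key], list), (
            "TransformersImageProcessor only support single image.")
        # (1, 3, 224, 224) -> (3, 224, 224)
        results[self.output_key] = self.pipeline(
            images=results[self.key], return_tensors="pt").pixel_values[0]
        return results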
33 changes: 19 additions & 14 deletions tests/configs/ip_adapter.py
@@ -6,6 +6,7 @@
from diffusers.models.embeddings import ImageProjection
from transformers import (
AutoTokenizer,
+ CLIPImageProcessor,
CLIPTextModel,
CLIPTextModelWithProjection,
CLIPVisionModelWithProjection,
@@ -16,28 +17,32 @@

base_model = "hf-internal-testing/tiny-stable-diffusion-xl-pipe"
model = dict(type=IPAdapterXL,
model=base_model,
tokenizer_one=dict(type=AutoTokenizer.from_pretrained,
subfolder="tokenizer",
use_fast=False),
tokenizer_two=dict(type=AutoTokenizer.from_pretrained,
subfolder="tokenizer_2",
use_fast=False),
scheduler=dict(type=DDPMScheduler.from_pretrained,
subfolder="scheduler"),
text_encoder_one=dict(type=CLIPTextModel.from_pretrained,
subfolder="text_encoder"),
text_encoder_two=dict(type=CLIPTextModelWithProjection.from_pretrained,
subfolder="text_encoder_2"),
vae=dict(
type=AutoencoderKL.from_pretrained,
subfolder="vae"),
unet=dict(type=UNet2DConditionModel.from_pretrained,
subfolder="unet"),
image_encoder=dict(type=CLIPVisionModelWithProjection.from_pretrained,
pretrained_model_name_or_path="hf-internal-testing/unidiffuser-diffusers-test",
subfolder="image_encoder"),
image_projection=dict(type=ImageProjection,
num_image_text_embeds=4),
+ feature_extractor=dict(
+ type=CLIPImageProcessor.from_pretrained,
+ pretrained_model_name_or_path="hf-internal-testing/unidiffuser-diffusers-test",
+ subfolder="image_processor"),
data_preprocessor=dict(type=IPAdapterXLDataPreprocessor),
loss=dict(type=L2Loss))
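These config entries follow the mmengine lazy-instantiation pattern: type holds a class or factory callable (here mostly *.from_pretrained) and the remaining keys are its keyword arguments, so the new feature_extractor entry resolves to CLIPImageProcessor.from_pretrained(..., subfolder="image_processor") when the model is built. A hedged build sketch follows; the config path is an assumption and the comments describe expected, not verified, behaviour.

from mmengine.config import Config

from diffengine.registry import MODELS

# Path is an assumption; point it at the config shown above.
cfg = Config.fromfile("tests/configs/ip_adapter.py")
# Builds IPAdapterXL; the model is expected to instantiate the nested dicts
# (tokenizers, text encoders, VAE, UNet, image encoder, feature_extractor) itself.
ip_adapter = MODELS.build(cfg.model)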
33 changes: 33 additions & 0 deletions tests/test_datasets/test_transforms/test_processing.py
@@ -905,3 +905,36 @@ def test_transform_list(self):
self.assertListEqual(data["micro_conds"],
[[32, 32, 0, 0, 6.0],
[48, 48, 10, 10, 6.0]])


class TestTransformersImageProcessor(TestCase):

def test_register(self):
assert "TransformersImageProcessor" in TRANSFORMS

def test_transform(self):
img_path = osp.join(osp.dirname(__file__), "../../testdata/color.jpg")
data = {
"img": Image.open(img_path),
}

# test transform
trans = TRANSFORMS.build(dict(type="TransformersImageProcessor",
pretrained="facebook/dinov2-small"))
data = trans(data)
assert "clip_img" in data
assert type(data["clip_img"]) == torch.Tensor
assert data["clip_img"].size() == (3, 224, 224)

def test_transform_list(self):
img_path = osp.join(osp.dirname(__file__), "../../testdata/color.jpg")
data = {
"img": [Image.open(img_path), Image.open(img_path)],
}

# test transform
trans = TRANSFORMS.build(dict(type="TransformersImageProcessor",
pretrained="facebook/dinov2-small"))
with pytest.raises(
AssertionError, match="TransformersImageProcessor only support"):
_ = trans(data)
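Because the transform rejects lists (second test above), a caller holding several images would simply apply it once per image. An illustrative fragment reusing the names from the tests:

imgs = [Image.open(img_path), Image.open(img_path)]
# One call per image; each returns the dict with a (3, 224, 224) tensor under "clip_img".
tensors = [trans({"img": im})["clip_img"] for im in imgs]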
138 changes: 22 additions & 116 deletions tests/test_models/test_editors/test_ip_adapter/test_ip_adapter.py
@@ -1,18 +1,14 @@
from unittest import TestCase

import numpy as np
import pytest
import torch
from diffusers import (
AutoencoderKL,
DDPMScheduler,
DiffusionPipeline,
UNet2DConditionModel,
)
from diffusers.models.embeddings import ImageProjection
from diffusers.utils import load_image
from mmengine.optim import OptimWrapper
from PIL import Image
from torch.optim import SGD
from transformers import (
AutoTokenizer,
@@ -22,140 +18,50 @@
CLIPVisionModelWithProjection,
)

- from diffengine.models.archs import process_ip_adapter_state_dict
- from diffengine.models.editors import IPAdapterXL as Base
- from diffengine.models.editors import IPAdapterXLDataPreprocessor
+ from diffengine.models.editors import IPAdapterXL, IPAdapterXLDataPreprocessor
from diffengine.models.losses import L2Loss
from diffengine.registry import MODELS


- class IPAdapterXL(Base):
- @torch.no_grad()
- def infer(self,
- prompt: list[str],
- example_image: list[str | Image.Image],
- negative_prompt: str | None = None,
- height: int | None = None,
- width: int | None = None,
- num_inference_steps: int = 50,
- output_type: str = "pil",
- **kwargs) -> list[np.ndarray]:
- """Inference function.
- Args:
- ----
- prompt (`List[str]`):
- The prompt or prompts to guide the image generation.
- example_image (`List[Union[str, Image.Image]]`):
- The image prompt or prompts to guide the image generation.
- negative_prompt (`Optional[str]`):
- The prompt or prompts to guide the image generation.
- Defaults to None.
- height (int, optional):
- The height in pixels of the generated image. Defaults to None.
- width (int, optional):
- The width in pixels of the generated image. Defaults to None.
- num_inference_steps (int): Number of inference steps.
- Defaults to 50.
- output_type (str): The output format of the generate image.
- Choose between 'pil' and 'latent'. Defaults to 'pil'.
- **kwargs: Other arguments.
- """
- assert len(prompt) == len(example_image)
-
- orig_encoder_hid_proj = self.unet.encoder_hid_proj
- orig_encoder_hid_dim_type = self.unet.config.encoder_hid_dim_type
-
- pipeline = DiffusionPipeline.from_pretrained(
- self.model,
- vae=self.vae,
- text_encoder=self.text_encoder_one,
- text_encoder_2=self.text_encoder_two,
- tokenizer=self.tokenizer_one,
- tokenizer_2=self.tokenizer_two,
- unet=self.unet,
- image_encoder=self.image_encoder,
- feature_extractor=CLIPImageProcessor.from_pretrained(
- "hf-internal-testing/unidiffuser-diffusers-test",
- subfolder="image_processor"),
- torch_dtype=(torch.float16 if self.device != torch.device("cpu")
- else torch.float32),
- )
- adapter_state_dict = process_ip_adapter_state_dict(
- self.unet, self.image_projection)
- pipeline.load_ip_adapter(
- pretrained_model_name_or_path_or_dict=adapter_state_dict,
- subfolder="", weight_name="")
- if self.prediction_type is not None:
- # set prediction_type of scheduler if defined
- scheduler_args = {"prediction_type": self.prediction_type}
- pipeline.scheduler = pipeline.scheduler.from_config(
- pipeline.scheduler.config, **scheduler_args)
- pipeline.to(self.device)
- pipeline.set_progress_bar_config(disable=True)
- images = []
- for p, img in zip(prompt, example_image, strict=True):
- pil_img = load_image(img) if isinstance(img, str) else img
- pil_img = pil_img.convert("RGB")
-
- image = pipeline(
- p,
- ip_adapter_image=pil_img,
- negative_prompt=negative_prompt,
- num_inference_steps=num_inference_steps,
- height=height,
- width=width,
- output_type=output_type,
- **kwargs).images[0]
- if output_type == "latent":
- images.append(image)
- else:
- images.append(np.array(image))
-
- del pipeline, adapter_state_dict
- torch.cuda.empty_cache()
-
- self.unet.encoder_hid_proj = orig_encoder_hid_proj
- self.unet.config.encoder_hid_dim_type = orig_encoder_hid_dim_type
-
- return images


class TestIPAdapterXL(TestCase):

def _get_config(self) -> dict:
base_model = "hf-internal-testing/tiny-stable-diffusion-xl-pipe"
return dict(type=IPAdapterXL,
model=base_model,
tokenizer_one=dict(type=AutoTokenizer.from_pretrained,
pretrained_model_name_or_path=base_model,
subfolder="tokenizer",
use_fast=False),
tokenizer_two=dict(type=AutoTokenizer.from_pretrained,
pretrained_model_name_or_path=base_model,
subfolder="tokenizer_2",
use_fast=False),
scheduler=dict(type=DDPMScheduler.from_pretrained,
pretrained_model_name_or_path=base_model,
subfolder="scheduler"),
text_encoder_one=dict(type=CLIPTextModel.from_pretrained,
pretrained_model_name_or_path=base_model,
subfolder="text_encoder"),
text_encoder_two=dict(type=CLIPTextModelWithProjection.from_pretrained,
pretrained_model_name_or_path=base_model,
subfolder="text_encoder_2"),
vae=dict(
type=AutoencoderKL.from_pretrained,
pretrained_model_name_or_path=base_model,
subfolder="vae"),
unet=dict(type=UNet2DConditionModel.from_pretrained,
pretrained_model_name_or_path=base_model,
subfolder="unet"),
image_encoder=dict(type=CLIPVisionModelWithProjection.from_pretrained,
pretrained_model_name_or_path="hf-internal-testing/unidiffuser-diffusers-test",
subfolder="image_encoder"),
image_projection=dict(type=ImageProjection,
num_image_text_embeds=4),
+ feature_extractor=dict(
+ type=CLIPImageProcessor.from_pretrained,
+ pretrained_model_name_or_path="hf-internal-testing/unidiffuser-diffusers-test",
+ subfolder="image_processor"),
data_preprocessor=dict(type=IPAdapterXLDataPreprocessor),
loss=dict(type=L2Loss))
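With feature_extractor now supplied through the config, the hand-written infer() override deleted above (which assembled a DiffusionPipeline around a hard-coded CLIPImageProcessor) appears to be unnecessary, and the tests can call the library's IPAdapterXL.infer directly. A hypothetical test fragment; the prompt, image path, sizes, and expected shape are assumptions for illustration:

# Inside a TestIPAdapterXL test method (hypothetical usage):
cfg = self._get_config()
model = MODELS.build(cfg)
result = model.infer(
    ["a photo of a dog"],
    ["tests/testdata/color.jpg"],
    height=64,
    width=64)
assert len(result) == 1
assert result[0].shape == (64, 64, 3)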

(The diff for the fifth changed file was not loaded on this page.)