From d934cfaedf519aa43f4f039d96a1c011f0cdeb01 Mon Sep 17 00:00:00 2001 From: houleisai Date: Sat, 6 Jun 2026 22:38:21 +0800 Subject: [PATCH] Add InternVL3.5 model support --- paddleformers/transformers/__init__.py | 4 + .../transformers/auto/configuration.py | 5 + .../transformers/auto/image_processing.py | 2 + paddleformers/transformers/auto/modeling.py | 1 + paddleformers/transformers/auto/processing.py | 1 + .../transformers/internvl3_5/__init__.py | 28 + .../transformers/internvl3_5/configuration.py | 119 +++ .../internvl3_5/image_processor.py | 164 ++++ .../transformers/internvl3_5/modeling.py | 702 ++++++++++++++++++ .../transformers/internvl3_5/processor.py | 123 +++ tests/transformers/internvl3_5/__init__.py | 3 + .../transformers/internvl3_5/test_modeling.py | 168 +++++ .../internvl3_5/test_processor.py | 39 + 13 files changed, 1359 insertions(+) create mode 100644 paddleformers/transformers/internvl3_5/__init__.py create mode 100644 paddleformers/transformers/internvl3_5/configuration.py create mode 100644 paddleformers/transformers/internvl3_5/image_processor.py create mode 100644 paddleformers/transformers/internvl3_5/modeling.py create mode 100644 paddleformers/transformers/internvl3_5/processor.py create mode 100644 tests/transformers/internvl3_5/__init__.py create mode 100644 tests/transformers/internvl3_5/test_modeling.py create mode 100644 tests/transformers/internvl3_5/test_processor.py diff --git a/paddleformers/transformers/__init__.py b/paddleformers/transformers/__init__.py index ae1c4f20c73..837d4945125 100644 --- a/paddleformers/transformers/__init__.py +++ b/paddleformers/transformers/__init__.py @@ -161,6 +161,10 @@ "paddleocr_vl.modeling": ["PaddleOCRVLForConditionalGeneration"], "paddleocr_vl.image_processor": ["PaddleOCRVLImageProcessor"], "paddleocr_vl.processor": ["PaddleOCRVLProcessor"], + "internvl3_5.configuration": ["InternVisionConfig", "InternVLChatConfig"], + "internvl3_5.modeling": ["InternVisionModel", "InternVLChatModel"], + 
"internvl3_5.image_processor": ["InternVLImageProcessor"], + "internvl3_5.processor": ["InternVLProcessor"], "gpt_oss.configuration": ["GptOssConfig"], "gpt_oss.modeling": ["GptOssModel", "GptOssForCausalLM", "GptOssForCausalLMPipe"], "kimi_k25.vision_processor": ["KimiK25VisionProcessor"], diff --git a/paddleformers/transformers/auto/configuration.py b/paddleformers/transformers/auto/configuration.py index c04e1f34a5a..c0a335076e7 100644 --- a/paddleformers/transformers/auto/configuration.py +++ b/paddleformers/transformers/auto/configuration.py @@ -61,6 +61,8 @@ ("gemma3_text", "Gemma3TextConfig"), ("glm4v_moe", "Glm4vMoeConfig"), ("glm_ocr", "GlmOcrConfig"), + ("internvl_chat", "InternVLChatConfig"), + ("intern_vit_6b", "InternVisionConfig"), ("qwen3_5", "Qwen3_5Config"), ("qwen3_5_moe", "Qwen3_5MoEConfig"), ] @@ -89,6 +91,7 @@ ("qwen3_vl_moe", "Qwen3VLMoe"), ("qwen3_vl_moe_text", "Qwen3VLMoeText"), ("glm_ocr", "GlmOcrForConditionalGeneration"), + ("internvl_chat", "InternVLChat"), ("qwen3_5_moe", "Qwen3_5MoEForConditionalGeneration"), ("qwen3_5", "Qwen3_5ForConditionalGeneration"), ] @@ -104,6 +107,8 @@ ("qwen2_5_vl_text", "qwen2_5_vl"), ("qwen3_vl_text", "qwen3_vl"), ("qwen3_vl_moe_text", "qwen3_vl_moe"), + ("internvl_chat", "internvl3_5"), + ("intern_vit_6b", "internvl3_5"), ] ) diff --git a/paddleformers/transformers/auto/image_processing.py b/paddleformers/transformers/auto/image_processing.py index 4244259c0e1..c878e472142 100644 --- a/paddleformers/transformers/auto/image_processing.py +++ b/paddleformers/transformers/auto/image_processing.py @@ -59,6 +59,8 @@ "qwen2_vl": ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast"), "qwen3_vl": ("Qwen3VLImageProcessor", "Qwen3VLImageProcessorFast"), "glm_ocr": ("Glm46VImageProcessor"), + "internvl_chat": ("InternVLImageProcessor"), + "intern_vit_6b": ("InternVLImageProcessor"), } ) diff --git a/paddleformers/transformers/auto/modeling.py b/paddleformers/transformers/auto/modeling.py index 
11321baba1f..6dd7d0ad546 100644 --- a/paddleformers/transformers/auto/modeling.py +++ b/paddleformers/transformers/auto/modeling.py @@ -80,6 +80,7 @@ ("Gemma3", "gemma3_text"), ("Glm4vMoe", "glm4v_moe"), ("GlmOcr", "glm_ocr"), + ("InternVLChat", "internvl3_5"), ] ) diff --git a/paddleformers/transformers/auto/processing.py b/paddleformers/transformers/auto/processing.py index bca898e350d..d879ea74291 100644 --- a/paddleformers/transformers/auto/processing.py +++ b/paddleformers/transformers/auto/processing.py @@ -57,6 +57,7 @@ ("ernie4_5_moe_vl", "Ernie4_5_VLProcessor"), ("glm4v_moe", "Glm4vProcessor"), ("glm_ocr", "Glm46VProcessor"), + ("internvl_chat", "InternVLProcessor"), ] ) diff --git a/paddleformers/transformers/internvl3_5/__init__.py b/paddleformers/transformers/internvl3_5/__init__.py new file mode 100644 index 00000000000..774fe5cae42 --- /dev/null +++ b/paddleformers/transformers/internvl3_5/__init__.py @@ -0,0 +1,28 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); + +import sys +from typing import TYPE_CHECKING + +from ...utils.lazy_import import _LazyModule + +import_structure = { + "configuration": ["InternVisionConfig", "InternVLChatConfig"], + "image_processor": ["InternVLImageProcessor"], + "modeling": ["InternVisionModel", "InternVLChatModel"], + "processor": ["InternVLProcessor"], +} + +if TYPE_CHECKING: + from .configuration import * + from .image_processor import * + from .modeling import * + from .processor import * +else: + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + import_structure, + module_spec=__spec__, + ) diff --git a/paddleformers/transformers/internvl3_5/configuration.py b/paddleformers/transformers/internvl3_5/configuration.py new file mode 100644 index 00000000000..25aedae54a2 --- /dev/null +++ b/paddleformers/transformers/internvl3_5/configuration.py @@ -0,0 +1,119 @@ +# Copyright (c) 2026 PaddlePaddle Authors. 
class InternVLChatConfig(PretrainedConfig):
    """Composite configuration for the InternVL chat model.

    Wraps an ``InternVisionConfig`` (vision tower) and a ``Qwen3Config``
    (language model) and stores the options that control image tiling and
    visual-token handling. A few language-model fields (``vocab_size``,
    ``hidden_size``, ``tie_word_embeddings`` and the special token ids) are
    mirrored on this object.

    Args:
        vision_config: ``InternVisionConfig`` instance or a dict of its keyword
            arguments. ``None`` builds a default vision config.
        llm_config: ``Qwen3Config`` instance or a dict of its keyword
            arguments. ``None`` builds a default language-model config.
        use_backbone_lora: Stored as-is on the config.
        use_llm_lora: Stored as-is on the config.
        select_layer: Index of the vision hidden state to use as features.
        force_image_size: Optional image size overriding the vision config's.
        downsample_ratio: Spatial downsampling ratio applied to vision tokens.
        template: Conversation template name; stored as-is.
        dynamic_image_size: Stored as-is on the config.
        use_thumbnail: Stored as-is on the config.
        ps_version: Pixel-shuffle variant identifier.
        min_dynamic_patch: Stored as-is on the config.
        max_dynamic_patch: Stored as-is on the config.
        img_context_token_id: Token id marking image-context positions.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "internvl_chat"
    is_composition = True
    sub_configs = {"vision_config": InternVisionConfig, "llm_config": Qwen3Config}

    def __init__(
        self,
        vision_config=None,
        llm_config=None,
        use_backbone_lora=0,
        use_llm_lora=0,
        select_layer=-1,
        force_image_size=None,
        downsample_ratio=0.5,
        template=None,
        dynamic_image_size=False,
        use_thumbnail=False,
        ps_version="v1",
        min_dynamic_patch=1,
        max_dynamic_patch=6,
        img_context_token_id=151671,
        **kwargs,
    ):
        super().__init__(**kwargs)
        if vision_config is None:
            vision_config = {"architectures": ["InternVisionModel"]}
        if llm_config is None:
            llm_config = {"architectures": ["Qwen3ForCausalLM"]}

        # Accept either ready-made config objects or plain dicts.
        self.vision_config = (
            InternVisionConfig(**vision_config) if isinstance(vision_config, dict) else vision_config
        )
        self.llm_config = Qwen3Config(**llm_config) if isinstance(llm_config, dict) else llm_config

        self.use_backbone_lora = use_backbone_lora
        self.use_llm_lora = use_llm_lora
        self.select_layer = select_layer
        self.force_image_size = force_image_size
        self.downsample_ratio = downsample_ratio
        self.template = template
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail = use_thumbnail
        self.ps_version = ps_version
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.img_context_token_id = img_context_token_id

        # Mirror the language-model fields on the composite config.
        self.tie_word_embeddings = self.llm_config.tie_word_embeddings
        self.vocab_size = self.llm_config.vocab_size
        self.hidden_size = self.llm_config.hidden_size

        # Prefer the language model's special token ids, but keep a value that
        # was set on this config when the language model's one is missing or
        # None. A plain getattr default would only cover the "missing" case,
        # so a None on llm_config would overwrite the top-level value.
        for token_attr in ("pad_token_id", "eos_token_id", "bos_token_id"):
            llm_value = getattr(self.llm_config, token_attr, None)
            if llm_value is None:
                llm_value = getattr(self, token_attr, None)
            setattr(self, token_attr, llm_value)

    def to_dict(self, *args, **kwargs):
        """Return a plain-dict copy of this config.

        The two sub-configs are serialized through their own ``to_dict`` and
        ``model_type`` is taken from the class attribute. Positional and
        keyword arguments are accepted but not used.
        """
        output = copy.deepcopy(self.__dict__)
        output["vision_config"] = self.vision_config.to_dict()
        output["llm_config"] = self.llm_config.to_dict()
        output["model_type"] = self.__class__.model_type
        return output
class InternVLImageProcessor(BaseImageProcessor):
    """Tile images into fixed-size square crops and normalize them.

    Each image is resized to the tile grid whose aspect ratio is closest to
    the image's own, cut into ``image_size`` x ``image_size`` tiles, optionally
    followed by a thumbnail of the whole image. Every tile is then rescaled,
    normalized and converted to a channel-first ``float32`` array.
    """

    model_input_names = ["pixel_values", "num_patches_list", "image_flags"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Optional[dict] = None,
        resample: int = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_patches: int = 1,
        max_patches: int = 12,
        use_thumbnail: bool = True,
        **kwargs,
    ):
        """Store the preprocessing options.

        ``size`` defaults to ``{"height": 448, "width": 448}``; ``image_mean``
        and ``image_std`` default to the module-level ``IMAGENET_MEAN`` and
        ``IMAGENET_STD``. Remaining keyword arguments go to the base class.
        """
        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.size = size if size is not None else {"height": 448, "width": 448}
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else IMAGENET_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_STD
        self.do_convert_rgb = do_convert_rgb
        self.min_patches = min_patches
        self.max_patches = max_patches
        self.use_thumbnail = use_thumbnail

    @staticmethod
    def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
        """Pick the ``(cols, rows)`` grid whose ratio is closest to ``aspect_ratio``.

        On an exact tie, a later candidate replaces the current best when the
        image area exceeds half of that candidate grid's pixel area. Returns
        ``(1, 1)`` when ``target_ratios`` is empty.
        """
        best_ratio_diff = float("inf")
        best_ratio = (1, 1)
        area = width * height
        for ratio in target_ratios:
            ratio_diff = abs(aspect_ratio - ratio[0] / ratio[1])
            if ratio_diff < best_ratio_diff:
                best_ratio_diff = ratio_diff
                best_ratio = ratio
            elif ratio_diff == best_ratio_diff:
                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                    best_ratio = ratio
        return best_ratio

    def dynamic_preprocess(self, image, min_num=None, max_num=None, image_size=None, use_thumbnail=None):
        """Split a PIL image into square tiles on the best-matching grid.

        Args:
            image: PIL image to tile.
            min_num: Minimum number of tiles; defaults to ``self.min_patches``.
            max_num: Maximum number of tiles; defaults to ``self.max_patches``.
            image_size: Tile edge length; defaults to ``self.size["height"]``.
            use_thumbnail: Whether to append a whole-image thumbnail when more
                than one tile was produced; defaults to ``self.use_thumbnail``.

        Returns:
            List of PIL images: the tiles in row-major order, then the
            optional thumbnail.
        """
        min_num = min_num if min_num is not None else self.min_patches
        max_num = max_num if max_num is not None else self.max_patches
        image_size = image_size if image_size is not None else self.size["height"]
        use_thumbnail = use_thumbnail if use_thumbnail is not None else self.use_thumbnail

        orig_width, orig_height = image.size
        aspect_ratio = orig_width / orig_height
        # All (cols, rows) grids whose tile count lies in [min_num, max_num].
        target_ratios = set(
            (i, j)
            for n in range(min_num, max_num + 1)
            for i in range(1, n + 1)
            for j in range(1, n + 1)
            if min_num <= i * j <= max_num
        )
        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
        cols, rows = self.find_closest_aspect_ratio(
            aspect_ratio, target_ratios, orig_width, orig_height, image_size
        )

        resized_img = image.resize((image_size * cols, image_size * rows), resample=self.resample)
        processed_images = []
        for index in range(cols * rows):
            left = (index % cols) * image_size
            top = (index // cols) * image_size
            processed_images.append(resized_img.crop((left, top, left + image_size, top + image_size)))
        if use_thumbnail and len(processed_images) != 1:
            processed_images.append(image.resize((image_size, image_size), resample=self.resample))
        return processed_images

    def _preprocess_tile(self, image, image_size=None):
        """Convert one PIL tile to a normalized channel-first float32 array.

        ``image_size`` is the edge length used when ``do_resize`` is set; it
        defaults to ``self.size["height"]``.
        """
        if image_size is None:
            image_size = self.size["height"]
        if self.do_resize:
            image = image.resize((image_size, image_size), resample=self.resample)
        array = np.asarray(image).astype("float32")
        if self.do_rescale:
            array = array * self.rescale_factor
        if self.do_normalize:
            mean = np.asarray(self.image_mean, dtype="float32")
            std = np.asarray(self.image_std, dtype="float32")
            array = (array - mean) / std
        # Height-width-channel -> channel-height-width.
        return np.transpose(array, (2, 0, 1))

    def __call__(self, images: ImageInput = None, return_tensors=None, **kwargs):
        """Preprocess one image or a list of images.

        Per-call overrides ``min_patches``, ``max_patches``, ``image_size`` and
        ``use_thumbnail`` may be passed as keyword arguments; they apply to
        every image in the call.

        Returns:
            ``BatchFeature`` with ``pixel_values`` (all tiles stacked),
            ``num_patches_list`` (tile count per image) and ``image_flags``
            (ones, one row per tile). With ``return_tensors="pd"`` the two
            arrays are converted to Paddle tensors. An empty ``BatchFeature``
            is returned when ``images`` is ``None``.

        Raises:
            ValueError: If ``images`` is not a valid image or a non-empty
                list/tuple of valid images.
        """
        if images is None:
            return BatchFeature(data={})
        if is_valid_image(images):
            images = [images]
        if not isinstance(images, (list, tuple)) or not all(is_valid_image(image) for image in images):
            raise ValueError("InternVLImageProcessor expects an image or a list of images.")
        if not images:
            raise ValueError("InternVLImageProcessor received an empty list of images.")

        # Resolve overrides once: popping inside the loop would apply them to
        # the first image only and silently fall back to defaults afterwards.
        min_patches = kwargs.pop("min_patches", self.min_patches)
        max_patches = kwargs.pop("max_patches", self.max_patches)
        image_size = kwargs.pop("image_size", self.size["height"])
        use_thumbnail = kwargs.pop("use_thumbnail", self.use_thumbnail)

        pixel_values = []
        num_patches_list = []
        for image in images:
            tiles = self.dynamic_preprocess(
                _to_pil_image(image),
                min_num=min_patches,
                max_num=max_patches,
                image_size=image_size,
                use_thumbnail=use_thumbnail,
            )
            num_patches_list.append(len(tiles))
            # Keep the requested tile size instead of resizing back to self.size.
            pixel_values.extend(self._preprocess_tile(tile, image_size=image_size) for tile in tiles)

        data = {
            "pixel_values": np.stack(pixel_values).astype("float32"),
            "num_patches_list": num_patches_list,
            "image_flags": np.ones([len(pixel_values), 1], dtype="int64"),
        }
        if return_tensors == "pd":
            data["pixel_values"] = paddle.to_tensor(data["pixel_values"])
            data["image_flags"] = paddle.to_tensor(data["image_flags"])
        return BatchFeature(data=data)
class InternVLQwen3Attention(nn.Layer):
    """Self-attention with grouped key/value heads, per-head RMSNorm on the
    queries and keys, and rotary position embedding.

    No key/value cache is kept: ``forward`` always attends over the full
    ``hidden_states`` it is given and returns ``None`` in the two extra slots.
    """

    def __init__(self, config, layer_idx=0):
        super().__init__()
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.rope_theta = config.rope_theta

        use_bias = config.attention_bias
        query_width = self.num_heads * self.head_dim
        kv_width = self.num_key_value_heads * self.head_dim
        self.q_proj = nn.Linear(self.hidden_size, query_width, bias_attr=use_bias)
        self.k_proj = nn.Linear(self.hidden_size, kv_width, bias_attr=use_bias)
        self.v_proj = nn.Linear(self.hidden_size, kv_width, bias_attr=use_bias)
        self.o_proj = nn.Linear(query_width, self.hidden_size, bias_attr=use_bias)
        self.q_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)

        # Rotary inverse frequencies, one per pair of head dimensions.
        exponents = paddle.arange(0, self.head_dim, 2, dtype="float32") / self.head_dim
        inv_freq = 1.0 / (self.rope_theta**exponents)
        self.register_buffer("inv_freq", inv_freq, persistable=False)

    def _get_cos_sin(self, position_ids, dtype):
        """Return rotary ``(cos, sin)`` tables for ``position_ids`` cast to ``dtype``."""
        angles = paddle.einsum("bi,j->bij", position_ids.astype("float32"), self.inv_freq)
        table = paddle.concat([angles, angles], axis=-1)
        return paddle.cos(table).astype(dtype), paddle.sin(table).astype(dtype)

    def forward(self, hidden_states, attention_mask=None, position_ids=None, **kwargs):
        """Attend over ``hidden_states``.

        Args:
            hidden_states: Input whose shape unpacks as (batch, seq_len, hidden).
            attention_mask: Optional additive mask added to the raw scores.
            position_ids: Optional positions; defaults to ``0..seq_len-1`` for
                every batch row.

        Returns:
            Tuple ``(output, None, None)``.
        """
        bsz, q_len, _ = hidden_states.shape
        if position_ids is None:
            position_ids = paddle.arange(q_len, dtype="int64").unsqueeze(0).expand([bsz, q_len])

        def split_heads(projected, head_count):
            # (batch, seq, heads * dim) -> (batch, heads, seq, dim)
            return projected.reshape([bsz, q_len, head_count, self.head_dim]).transpose([0, 2, 1, 3])

        queries = self.q_norm(split_heads(self.q_proj(hidden_states), self.num_heads))
        keys = self.k_norm(split_heads(self.k_proj(hidden_states), self.num_key_value_heads))
        values = split_heads(self.v_proj(hidden_states), self.num_key_value_heads)

        cos, sin = self._get_cos_sin(position_ids, queries.dtype)
        cos, sin = cos.unsqueeze(1), sin.unsqueeze(1)
        queries = (queries * cos) + (rotate_half(queries) * sin)
        keys = (keys * cos) + (rotate_half(keys) * sin)

        # Expand the key/value heads so every query head has a partner.
        keys = repeat_kv(keys, self.num_key_value_groups)
        values = repeat_kv(values, self.num_key_value_groups)

        scores = paddle.matmul(queries * self.scaling, keys.transpose([0, 1, 3, 2]))
        if attention_mask is not None:
            scores = scores + attention_mask
        # Softmax in float32, then cast back to the working dtype.
        probs = F.softmax(scores.astype("float32"), axis=-1).astype(queries.dtype)
        probs = F.dropout(probs, p=self.attention_dropout, training=self.training)

        context = paddle.matmul(probs, values).transpose([0, 2, 1, 3])
        context = context.reshape([bsz, q_len, self.num_heads * self.head_dim])
        return self.o_proj(context), None, None
class InternVLQwen3DecoderLayer(nn.Layer):
    """One pre-norm decoder block: RMSNorm -> attention -> residual add,
    then RMSNorm -> MLP -> residual add."""

    def __init__(self, config, layer_idx):
        super().__init__()
        self.self_attn = InternVLQwen3Attention(config, layer_idx)
        self.mlp = InternVLQwen3MLP(config)
        self.input_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Qwen3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(self, hidden_states, attention_mask=None, position_ids=None, **kwargs):
        """Run the block and return the updated hidden states.

        ``attention_mask`` and ``position_ids`` are passed through to the
        attention sub-layer; other keyword arguments are ignored.
        """
        # Attention sub-block.
        attn_output, _, _ = self.self_attn(
            self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
        )
        after_attention = hidden_states + attn_output

        # Feed-forward sub-block.
        mlp_output = self.mlp(self.post_attention_layernorm(after_attention))
        return after_attention + mlp_output
class InternVLQwen3ForCausalLM(nn.Layer):
    """Decoder stack plus a bias-free linear head producing vocabulary logits."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.model = InternVLQwen3Model(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False)

    def get_input_embeddings(self):
        """Return the token embedding layer of the decoder."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding layer of the decoder."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the language-model head."""
        return self.lm_head

    def set_output_embeddings(self, value):
        """Replace the language-model head."""
        self.lm_head = value

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_hidden_states=None,
        return_dict=True,
        **kwargs,
    ):
        """Compute logits and, when ``labels`` is given, the next-token loss.

        Labels equal to ``-100`` are excluded from the loss. The loss is the
        mean over the remaining positions, and is ``0`` when no position
        remains.

        Returns:
            ``CausalLMOutputWithPast`` when ``return_dict`` is truthy,
            otherwise ``(loss, logits)`` or ``(logits,)`` when there is no
            loss.
        """
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )
        logits = self.lm_head(outputs.last_hidden_state)

        loss = None
        if labels is not None:
            # Position t predicts token t + 1.
            flat_logits = logits[..., :-1, :].reshape([-1, self.config.vocab_size])
            flat_labels = labels[..., 1:].reshape([-1])
            valid_mask = flat_labels != -100
            # Replace ignored labels with a valid class id; they are masked out below.
            safe_labels = paddle.where(valid_mask, flat_labels, paddle.zeros_like(flat_labels))
            token_loss = F.cross_entropy(flat_logits, safe_labels, reduction="none")
            # Flatten so the mask multiplies element-wise rather than broadcasting.
            token_loss = token_loss.reshape([-1])
            weights = valid_mask.astype(token_loss.dtype)
            # Clamp the count so a batch with no valid label yields 0, not NaN.
            valid_count = paddle.clip(weights.sum(), min=1.0)
            loss = (token_loss * weights).sum() / valid_count

        if not return_dict:
            return (loss, logits) if loss is not None else (logits,)
        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=None,
            hidden_states=outputs.hidden_states,
        )
class InternVisionEmbeddings(nn.Layer):
    """Patch embedding plus a class token and learned position embeddings.

    The patch part of the position table is bicubically resampled to the
    patch grid of the incoming image.
    """

    def __init__(self, config: InternVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size
        self.class_embedding = self.create_parameter([1, 1, self.embed_dim])
        # Non-overlapping patches: kernel size equals stride.
        self.patch_embedding = nn.Conv2D(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
        )
        self.num_patches = (self.image_size // self.patch_size) ** 2
        # One extra position for the class token.
        self.num_positions = self.num_patches + 1
        self.position_embedding = self.create_parameter([1, self.num_positions, self.embed_dim])

    def _get_pos_embed(self, pos_embed, height, width):
        """Resample the patch position table to a ``height`` x ``width`` grid.

        Interpolation runs in float32; the result is cast back to the dtype
        of ``pos_embed``.
        """
        original_dtype = pos_embed.dtype
        grid = self.image_size // self.patch_size
        table = pos_embed.astype("float32").reshape([1, grid, grid, -1])
        table = table.transpose([0, 3, 1, 2])
        table = F.interpolate(table, size=[height, width], mode="bicubic", align_corners=False)
        table = table.reshape([1, -1, height * width]).transpose([0, 2, 1])
        return table.astype(original_dtype)

    def forward(self, pixel_values):
        """Embed ``pixel_values`` and add position information.

        Returns the class token followed by the flattened patch embeddings,
        each with its position embedding added.
        """
        target_dtype = self.patch_embedding.weight.dtype
        patches = self.patch_embedding(pixel_values)
        batch_size, _, height, width = patches.shape
        patches = patches.flatten(2).transpose([0, 2, 1])

        cls_tokens = self.class_embedding.expand([batch_size, 1, self.embed_dim]).astype(target_dtype)
        tokens = paddle.concat([cls_tokens, patches], axis=1)

        cls_position = self.position_embedding[:, :1, :]
        patch_positions = self._get_pos_embed(self.position_embedding[:, 1:, :], height, width)
        positions = paddle.concat([cls_position, patch_positions], axis=1)
        return tokens + positions.astype(target_dtype)
class InternVisionEncoderLayer(nn.Layer):
    """Pre-norm vision transformer block with learnable per-channel scaling.

    Each branch output (attention, MLP) is multiplied by a learnable vector
    (``ls1`` / ``ls2``), optionally passed through ``DropPath``, and added
    to the residual stream.

    Args:
        config: Vision configuration; ``norm_type`` selects the norm class
            from ``NORM2FN``.
        drop_path_rate: Drop probability for both branches; ``0`` disables
            ``DropPath``.
    """

    def __init__(self, config: InternVisionConfig, drop_path_rate: float):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.attn = InternAttention(config)
        self.mlp = InternMLP(config)
        self.norm1 = self._build_norm(config)
        self.norm2 = self._build_norm(config)
        self.ls1 = self.create_parameter(
            [self.embed_dim], default_initializer=nn.initializer.Constant(config.initializer_factor)
        )
        self.ls2 = self.create_parameter(
            [self.embed_dim], default_initializer=nn.initializer.Constant(config.initializer_factor)
        )
        self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity()
        self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity()

    @staticmethod
    def _build_norm(config):
        """Instantiate the configured normalization layer over ``hidden_size``."""
        norm_cls = NORM2FN[config.norm_type]
        # paddle.nn.LayerNorm names its epsilon keyword ``epsilon`` while
        # InternRMSNorm takes ``eps``; passing ``eps`` to LayerNorm raises
        # TypeError, so choose the keyword per class.
        eps_keyword = "epsilon" if norm_cls is nn.LayerNorm else "eps"
        return norm_cls(config.hidden_size, **{eps_keyword: config.layer_norm_eps})

    def forward(self, hidden_states):
        """Apply the attention and MLP branches with residual connections."""
        attn_input = self.norm1(hidden_states).astype(hidden_states.dtype)
        hidden_states = hidden_states + self.drop_path1(self.attn(attn_input) * self.ls1)

        mlp_input = self.norm2(hidden_states).astype(hidden_states.dtype)
        hidden_states = hidden_states + self.drop_path2(self.mlp(mlp_input) * self.ls2)
        return hidden_states
class InternVisionModel(InternVisionPretrainedModel):
    """Vision tower: ``InternVisionEmbeddings`` followed by ``InternVisionEncoder``."""

    main_input_name = "pixel_values"

    def __init__(self, config: InternVisionConfig):
        super().__init__(config)
        self.embeddings = InternVisionEmbeddings(config)
        self.encoder = InternVisionEncoder(config)

    def get_input_embeddings(self):
        """Return the embedding module applied to ``pixel_values``."""
        return self.embeddings

    def forward(
        self,
        pixel_values: Optional[paddle.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_embeds: Optional[paddle.Tensor] = None,
    ):
        """Encode images (or already-embedded patches).

        Args:
            pixel_values: Raw image tensor; embedded with ``self.embeddings``
                unless ``pixel_embeds`` is given.
            output_hidden_states: Whether to collect per-layer hidden states;
                defaults to ``self.config.output_hidden_states``.
            return_dict: Return a ``BaseModelOutputWithPooling`` (default) or a
                plain tuple.
            pixel_embeds: Pre-computed embeddings fed straight to the encoder;
                takes precedence over ``pixel_values``.

        Returns:
            The last hidden state, the token at sequence index 0 as the pooled
            output, and (optionally) all hidden states.

        Raises:
            ValueError: If neither ``pixel_values`` nor ``pixel_embeds`` is given.
        """
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = True
        if pixel_values is None and pixel_embeds is None:
            raise ValueError("You have to specify pixel_values or pixel_embeds")

        if pixel_embeds is not None:
            embedded = pixel_embeds
        else:
            embedded = self.embeddings(pixel_values)

        encoded = self.encoder(
            inputs_embeds=embedded,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoded[0]
        first_token = sequence_output[:, 0, :]

        if return_dict:
            return BaseModelOutputWithPooling(
                last_hidden_state=sequence_output,
                pooler_output=first_token,
                hidden_states=encoded.hidden_states,
                attentions=None,
            )
        return (sequence_output, first_token) + encoded[1:]
class InternVLChatModel(InternVLChatPretrainedModel):
    """InternVL chat model: vision encoder + MLP projector + causal language model.

    Images are encoded by ``vision_model``, spatially merged with a pixel
    shuffle, projected to the language-model width by ``mlp1`` and written over
    the embeddings at the positions of ``img_context_token_id`` in ``input_ids``.
    """

    main_input_name = "pixel_values"

    def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None):
        """Build the model, optionally reusing pre-built sub-models.

        Args:
            config: Composite config holding ``vision_config`` and ``llm_config``.
            vision_model: Optional ready-made vision tower; built from
                ``config.vision_config`` when omitted.
            language_model: Optional ready-made causal LM; built from
                ``config.llm_config`` when omitted.
        """
        super().__init__(config)
        image_size = config.force_image_size or config.vision_config.image_size
        patch_size = config.vision_config.patch_size
        self.patch_size = patch_size
        self.select_layer = config.select_layer
        self.template = config.template
        # Visual tokens per image tile after the pixel-shuffle down-sampling.
        self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio**2))
        self.downsample_ratio = config.downsample_ratio
        self.ps_version = config.ps_version
        self.img_context_token_id = config.img_context_token_id

        self.vision_model = vision_model if vision_model is not None else InternVisionModel(config.vision_config)
        self.language_model = (
            language_model if language_model is not None else InternVLQwen3ForCausalLM(config.llm_config)
        )

        vit_hidden_size = config.vision_config.hidden_size
        llm_hidden_size = config.llm_config.hidden_size
        # Pixel shuffle folds (1 / ratio)^2 spatial positions into the channel axis.
        shuffle_scale = int(1 / self.downsample_ratio) ** 2
        self.mlp1 = nn.Sequential(
            nn.LayerNorm(vit_hidden_size * shuffle_scale),
            nn.Linear(vit_hidden_size * shuffle_scale, llm_hidden_size),
            nn.GELU(),
            nn.Linear(llm_hidden_size, llm_hidden_size),
        )

    def pixel_shuffle(self, x, scale_factor=0.5):
        """Trade spatial resolution for channels on an ``[n, w, h, c]`` tensor.

        Returns a tensor with both spatial axes scaled by ``scale_factor`` and
        the channel axis scaled by ``1 / scale_factor**2``. For any
        ``ps_version`` other than ``"v1"`` the two spatial axes are swapped back
        at the end.
        """
        n, w, h, c = x.shape
        x = x.reshape([n, w, int(h * scale_factor), int(c / scale_factor)])
        x = x.transpose([0, 2, 1, 3])
        x = x.reshape([n, int(h * scale_factor), int(w * scale_factor), int(c / (scale_factor * scale_factor))])
        if self.ps_version != "v1":
            x = x.transpose([0, 2, 1, 3])
        return x

    def extract_feature(self, pixel_values):
        """Encode ``pixel_values`` into language-model-sized visual tokens.

        Uses the vision tower's last hidden state when ``select_layer == -1``,
        otherwise the hidden state at index ``select_layer``. The token at
        sequence index 0 is dropped, the rest is treated as a square grid,
        pixel-shuffled and projected by ``mlp1``.
        """
        if self.select_layer == -1:
            vit_embeds = self.vision_model(
                pixel_values=pixel_values,
                output_hidden_states=False,
                return_dict=True,
            ).last_hidden_state
        else:
            vit_embeds = self.vision_model(
                pixel_values=pixel_values,
                output_hidden_states=True,
                return_dict=True,
            ).hidden_states[self.select_layer]
        vit_embeds = vit_embeds[:, 1:, :]
        h = w = int(math.sqrt(vit_embeds.shape[1]))
        vit_embeds = vit_embeds.reshape([vit_embeds.shape[0], h, w, -1])
        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
        vit_embeds = vit_embeds.reshape([vit_embeds.shape[0], -1, vit_embeds.shape[-1]])
        return self.mlp1(vit_embeds)

    def _merge_visual_embeds(self, input_ids, input_embeds, vit_embeds):
        """Overwrite the image-context positions of ``input_embeds`` with ``vit_embeds``.

        Raises:
            ValueError: If ``input_ids`` holds no ``img_context_token_id``, or if
                the number of such positions differs from the number of visual
                token vectors.
        """
        batch_size, seq_len, hidden_size = input_embeds.shape
        flat_embeds = input_embeds.reshape([batch_size * seq_len, hidden_size])
        flat_input_ids = input_ids.reshape([batch_size * seq_len])
        selected = paddle.nonzero(flat_input_ids == self.img_context_token_id).flatten()
        vit_embeds = vit_embeds.reshape([-1, hidden_size]).astype(flat_embeds.dtype)
        if selected.shape[0] == 0:
            raise ValueError("No <IMG_CONTEXT> token found in input_ids.")
        if selected.shape[0] != vit_embeds.shape[0]:
            raise ValueError(
                f"The number of <IMG_CONTEXT> tokens ({selected.shape[0]}) does not match "
                f"visual tokens ({vit_embeds.shape[0]})."
            )
        flat_embeds = paddle.scatter(flat_embeds, selected, vit_embeds, overwrite=True)
        return flat_embeds.reshape([batch_size, seq_len, hidden_size])

    def forward(
        self,
        pixel_values: Optional[paddle.Tensor] = None,
        input_ids: Optional[paddle.Tensor] = None,
        attention_mask: Optional[paddle.Tensor] = None,
        position_ids: Optional[paddle.Tensor] = None,
        image_flags: Optional[paddle.Tensor] = None,
        past_key_values=None,
        labels: Optional[paddle.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        """Run the language model on text embeddings with visual tokens merged in.

        Args:
            pixel_values: Image tiles; when given, their features replace the
                image-context token embeddings.
            input_ids: Token ids (required; embeddings are looked up from them).
            image_flags: Optional per-tile flags; tiles whose flag is 0 are
                dropped before merging.
            labels: Forwarded to the language model, which computes the loss.

        Returns:
            ``CausalLMOutputWithPast`` when ``return_dict`` is true (default),
            otherwise whatever the language model returns.

        Raises:
            ValueError: If ``input_ids`` is missing, or the image-context token
                count does not match the visual features.
        """
        if input_ids is None:
            raise ValueError("input_ids must be provided.")
        return_dict = True if return_dict is None else return_dict
        input_embeds = self.language_model.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            vit_embeds = self.extract_feature(pixel_values)
            if image_flags is not None:
                image_flags = image_flags.squeeze(-1).astype("bool")
                vit_embeds = vit_embeds[image_flags]
            input_embeds = self._merge_visual_embeds(input_ids, input_embeds, vit_embeds)

        # NOTE(review): ``output_attentions`` is accepted but not forwarded to the
        # language model -- confirm whether the wrapped LM supports it.
        outputs = self.language_model(
            input_ids=None,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=input_embeds,
            labels=labels,
            use_cache=use_cache,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            return outputs
        return CausalLMOutputWithPast(
            loss=outputs.loss,
            logits=outputs.logits,
            past_key_values=outputs.past_key_values,
            hidden_states=getattr(outputs, "hidden_states", None),
            attentions=getattr(outputs, "attentions", None),
        )

    @paddle.no_grad()
    def generate(
        self,
        pixel_values: Optional[paddle.Tensor] = None,
        input_ids: Optional[paddle.Tensor] = None,
        attention_mask: Optional[paddle.Tensor] = None,
        visual_features: Optional[paddle.Tensor] = None,
        max_new_tokens: int = 20,
        eos_token_id: Optional[int] = None,
        **generate_kwargs,
    ):
        """Greedy-decode up to ``max_new_tokens`` tokens.

        Only greedy decoding is implemented; any extra ``generate_kwargs``
        (e.g. ``num_beams``, ``do_sample``) are ignored with a warning. The full
        sequence is re-encoded at every step (``use_cache=False``).

        Args:
            pixel_values: Image tiles; visual tokens are merged only when given.
            input_ids: Prompt token ids (required).
            attention_mask: Optional mask, extended by one column per step.
            visual_features: Pre-computed visual tokens used instead of
                ``extract_feature(pixel_values)``.
            max_new_tokens: Upper bound on generated tokens.
            eos_token_id: Stop token id (or list/tuple of ids); defaults to
                ``self.config.eos_token_id``.

        Returns:
            An int64 tensor ``[batch, n_generated]``. A sequence that has
            produced EOS is filled with EOS for the remaining steps, and
            decoding stops once every sequence has finished.
        """
        if input_ids is None:
            raise ValueError("input_ids must be provided for generation.")
        if generate_kwargs:
            import warnings

            warnings.warn(
                "InternVLChatModel.generate only implements greedy decoding; "
                f"ignoring unsupported arguments: {sorted(generate_kwargs)}",
                stacklevel=2,
            )

        embed_tokens = self.language_model.get_input_embeddings()
        input_embeds = embed_tokens(input_ids)
        if pixel_values is not None:
            vit_embeds = visual_features if visual_features is not None else self.extract_feature(pixel_values)
            input_embeds = self._merge_visual_embeds(input_ids, input_embeds, vit_embeds)

        if eos_token_id is None:
            eos_token_id = self.config.eos_token_id
        if eos_token_id is None:
            eos_ids = []
        elif isinstance(eos_token_id, (list, tuple)):
            eos_ids = [int(token) for token in eos_token_id]
        else:
            eos_ids = [int(eos_token_id)]

        generated = []
        current_embeds = input_embeds
        current_mask = attention_mask
        # Per-sequence "already emitted EOS" flags, shaped like ``next_token``.
        finished = paddle.zeros([input_ids.shape[0], 1], dtype="bool")
        for _ in range(max_new_tokens):
            outputs = self.language_model(inputs_embeds=current_embeds, attention_mask=current_mask, use_cache=False)
            next_token = paddle.argmax(outputs.logits[:, -1, :], axis=-1, keepdim=True)
            if eos_ids:
                # Finished sequences keep emitting EOS instead of new tokens.
                next_token = paddle.where(finished, paddle.full_like(next_token, eos_ids[0]), next_token)
                for eos_id in eos_ids:
                    finished = paddle.logical_or(finished, next_token == eos_id)
            generated.append(next_token)
            if eos_ids and bool(paddle.all(finished).item()):
                break
            current_embeds = paddle.concat([current_embeds, embed_tokens(next_token)], axis=1)
            if current_mask is not None:
                # Match the mask's own dtype so the concat never mixes dtypes.
                step_mask = paddle.ones_like(next_token, dtype=current_mask.dtype)
                current_mask = paddle.concat([current_mask, step_mask], axis=1)
        if generated:
            return paddle.concat(generated, axis=1)
        return paddle.empty([input_ids.shape[0], 0], dtype="int64")

    @property
    def lm_head(self):
        """Output projection of the wrapped language model."""
        return self.language_model.get_output_embeddings()

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        return self.language_model.set_input_embeddings(value)

    def set_output_embeddings(self, value):
        return self.language_model.set_output_embeddings(value)

    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None):
        """Resize input embeddings and rebuild the output head to match.

        The base implementation resizes the input embeddings; this override then
        builds a bias-free ``nn.Linear(hidden, new_num_tokens)``, copies the
        overlapping columns of the old head into it, and updates the
        ``vocab_size`` entries of the affected configs.

        Returns:
            The (possibly resized) input embedding layer.
        """
        old_output_embeddings = self.get_output_embeddings()
        new_input_embeddings = super().resize_token_embeddings(new_num_tokens)
        if new_num_tokens is None:
            return new_input_embeddings

        # The head weight is read as [hidden_size, vocab_size].
        old_num_tokens = old_output_embeddings.weight.shape[1]
        hidden_size = old_output_embeddings.weight.shape[0]
        new_output_embeddings = nn.Linear(hidden_size, new_num_tokens, bias_attr=False)
        if new_output_embeddings.weight.dtype != old_output_embeddings.weight.dtype:
            new_output_embeddings.to(dtype=old_output_embeddings.weight.dtype)
        n = min(old_num_tokens, new_num_tokens)
        with paddle.no_grad():
            new_output_embeddings.weight[:, :n] = old_output_embeddings.weight[:, :n]
        self.set_output_embeddings(new_output_embeddings)
        self.config.vocab_size = new_num_tokens
        self.config.llm_config.vocab_size = new_num_tokens
        self.language_model.config.vocab_size = new_num_tokens
        return new_input_embeddings
def _expand_image_tokens(self, text, num_patches_list):
    """Replace each image placeholder with its start/context/end token run.

    The i-th placeholder (counted across all samples, in order) becomes
    ``img_start_token + img_context_token * image_seq_length * num_patches_list[i]
    + img_end_token``. When a single sample contains no placeholder and at
    least one image was processed, one placeholder followed by a newline is
    prepended first.

    Args:
        text: List of prompt strings.
        num_patches_list: Number of tiles per image, or ``None`` when no images
            were processed (``text`` is then returned unchanged).

    Returns:
        A new list with the expanded prompts.

    Raises:
        ValueError: If ``image_token`` is empty, or the number of placeholders
            does not equal the number of images.
    """
    if num_patches_list is None:
        return text
    if not self.image_token:
        # An empty placeholder matches everywhere and can never be consumed.
        raise ValueError("image_token must be a non-empty string to expand image placeholders.")

    patch_index = 0
    expanded = []
    for sample in text:
        if self.image_token not in sample and len(num_patches_list) > 0 and len(text) == 1:
            sample = self.image_token + "\n" + sample
        # Split once up front so inserted tokens are never re-scanned for
        # placeholders (they may legitimately contain the placeholder text).
        head, *tails = sample.split(self.image_token)
        pieces = [head]
        for tail in tails:
            if patch_index >= len(num_patches_list):
                raise ValueError("More placeholders than processed images.")
            pieces.append(self.img_start_token)
            pieces.append(self.img_context_token * self.image_seq_length * num_patches_list[patch_index])
            pieces.append(self.img_end_token)
            pieces.append(tail)
            patch_index += 1
        expanded.append("".join(pieces))
    if patch_index != len(num_patches_list):
        raise ValueError("The number of images does not match placeholders in text.")
    return expanded
kwargs.get("return_tensors", None) + output_kwargs["images_kwargs"].pop("return_tensors", None) + output_kwargs["text_kwargs"].pop("return_tensors", None) + + image_inputs = {} + num_patches_list = None + if images is not None: + image_inputs = self.image_processor( + images=images, + return_tensors=return_tensors, + **output_kwargs["images_kwargs"], + ) + num_patches_list = image_inputs["num_patches_list"] + + if text is None: + data = dict(image_inputs) + return BatchFeature(data=data) + if not isinstance(text, list): + text = [text] + text = self._expand_image_tokens(text.copy(), num_patches_list) + + text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"], return_tensors=return_tensors) + data = dict(text_inputs) + data.update(dict(image_inputs)) + return BatchFeature(data=data) + + def batch_decode(self, *args, **kwargs): + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + return self.tokenizer.decode(*args, **kwargs) diff --git a/tests/transformers/internvl3_5/__init__.py b/tests/transformers/internvl3_5/__init__.py new file mode 100644 index 00000000000..777f41623db --- /dev/null +++ b/tests/transformers/internvl3_5/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/tests/transformers/internvl3_5/test_modeling.py b/tests/transformers/internvl3_5/test_modeling.py new file mode 100644 index 00000000000..a152bac6b55 --- /dev/null +++ b/tests/transformers/internvl3_5/test_modeling.py @@ -0,0 +1,168 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. 
class InternVLModelTest(unittest.TestCase):
    """Smoke tests for ``InternVLChatModel`` on a tiny randomly initialised config."""

    def setUp(self):
        # Seed once per test so random weights and pixel values are reproducible.
        paddle.seed(42)

    def get_config(self):
        """Return a minimal config: one vision layer, one LM layer, vocab of 200."""
        return InternVLChatConfig(
            vision_config={
                "image_size": 28,
                "patch_size": 14,
                "hidden_size": 16,
                "intermediate_size": 32,
                "num_hidden_layers": 1,
                "num_attention_heads": 2,
                "qkv_bias": True,
                "qk_normalization": False,
                "norm_type": "layer_norm",
                "drop_path_rate": 0.0,
            },
            llm_config={
                "architectures": ["Qwen3ForCausalLM"],
                "vocab_size": 200,
                "hidden_size": 16,
                "intermediate_size": 32,
                "num_hidden_layers": 1,
                "num_attention_heads": 2,
                "num_key_value_heads": 1,
                "head_dim": 8,
                "max_position_embeddings": 128,
                "rms_norm_eps": 1e-6,
                "rope_theta": 10000,
                "attention_dropout": 0.0,
                "attention_bias": False,
                "hidden_act": "silu",
                "use_cache": False,
                "bos_token_id": 0,
                "eos_token_id": 2,
                "pad_token_id": 1,
            },
            force_image_size=28,
            downsample_ratio=0.5,
            ps_version="v2",
            img_context_token_id=151,
        )

    def get_inputs(self):
        """One 28x28 image and a 4-token prompt with a single image-context token (151)."""
        return {
            "input_ids": paddle.to_tensor([[10, 151, 11, 12]], dtype="int64"),
            "pixel_values": paddle.randn([1, 3, 28, 28]),
        }

    def test_config(self):
        config = self.get_config()
        config_tester = ConfigTester(
            self,
            config_class=InternVLChatConfig,
            has_text_modality=True,
            common_properties=[],
            vision_config=config.vision_config.to_dict(),
            llm_config=config.llm_config.to_dict(),
            force_image_size=config.force_image_size,
            downsample_ratio=config.downsample_ratio,
            ps_version=config.ps_version,
            img_context_token_id=config.img_context_token_id,
        )
        config_tester.create_and_test_config_from_and_save_pretrained()

    def test_forward_and_loss(self):
        model = InternVLChatModel(self.get_config()).eval()
        inputs = self.get_inputs()
        labels = paddle.to_tensor([[-100, -100, 11, 12]], dtype="int64")
        with paddle.no_grad():
            outputs = model(**inputs, labels=labels, use_cache=False)
        self.assertEqual(list(outputs.logits.shape), [1, 4, 200])
        self.assertEqual(outputs.loss.ndim, 0)
        # Recompute the shifted cross-entropy by hand, ignoring -100 labels.
        shift_logits = outputs.logits[..., :-1, :].reshape([-1, 200])
        shift_labels = labels[..., 1:].reshape([-1])
        valid_mask = shift_labels != -100
        safe_labels = paddle.where(valid_mask, shift_labels, paddle.zeros_like(shift_labels))
        token_loss = F.cross_entropy(shift_logits, safe_labels, reduction="none")
        expected_loss = (token_loss * valid_mask.astype(token_loss.dtype)).sum() / valid_mask.astype(
            token_loss.dtype
        ).sum()
        paddle.testing.assert_close(outputs.loss, expected_loss)

    def test_save_load(self):
        model = InternVLChatModel(self.get_config()).eval()
        inputs = self.get_inputs()
        with paddle.no_grad():
            expected = model(**inputs, use_cache=False).logits
        with tempfile.TemporaryDirectory() as tmpdir:
            model.save_pretrained(tmpdir, save_checkpoint_format="")
            reloaded = InternVLChatModel.from_pretrained(tmpdir, load_checkpoint_format="").eval()
        with paddle.no_grad():
            actual = reloaded(**inputs, use_cache=False).logits
        paddle.testing.assert_close(actual, expected, atol=1e-5, rtol=1e-5)

    def test_determinism(self):
        model = InternVLChatModel(self.get_config()).eval()
        inputs = self.get_inputs()
        with paddle.no_grad():
            first = model(**inputs, use_cache=False).logits
            second = model(**inputs, use_cache=False).logits
        paddle.testing.assert_close(second, first, atol=0.0, rtol=0.0)

    def test_hidden_states_output(self):
        model = InternVLChatModel(self.get_config()).eval()
        with paddle.no_grad():
            outputs = model(**self.get_inputs(), output_hidden_states=True, use_cache=False)
        self.assertEqual(len(outputs.hidden_states), self.get_config().llm_config.num_hidden_layers + 1)
        self.assertEqual(list(outputs.hidden_states[-1].shape), [1, 4, 16])

    def test_resize_tokens_embeddings(self):
        model = InternVLChatModel(self.get_config()).eval()
        old_input_embeddings = model.get_input_embeddings().weight.detach().clone()
        old_output_embeddings = model.get_output_embeddings().weight.detach().clone()

        model.resize_token_embeddings(205)

        self.assertEqual(model.config.vocab_size, 205)
        self.assertEqual(model.config.llm_config.vocab_size, 205)
        self.assertEqual(list(model.get_input_embeddings().weight.shape), [205, 16])
        self.assertEqual(list(model.get_output_embeddings().weight.shape), [16, 205])
        paddle.testing.assert_close(model.get_input_embeddings().weight[:200], old_input_embeddings)
        paddle.testing.assert_close(model.get_output_embeddings().weight[:, :200], old_output_embeddings)

    def test_greedy_generate(self):
        model = InternVLChatModel(self.get_config()).eval()
        with paddle.no_grad():
            output_ids = model.generate(**self.get_inputs(), max_new_tokens=2)
        self.assertEqual(list(output_ids.shape), [1, 2])

    def test_beam_search_generate(self):
        # NOTE: ``generate`` is greedy-only; this checks that ``num_beams`` is
        # accepted and the output shape is unchanged, not beam-search behaviour.
        model = InternVLChatModel(self.get_config()).eval()
        with paddle.no_grad():
            output_ids = model.generate(**self.get_inputs(), max_new_tokens=2, num_beams=2)
        self.assertEqual(list(output_ids.shape), [1, 2])

    def test_sample_generate(self):
        # NOTE: as above, sampling arguments are only checked for acceptance.
        model = InternVLChatModel(self.get_config()).eval()
        with paddle.no_grad():
            output_ids = model.generate(**self.get_inputs(), max_new_tokens=2, do_sample=True, top_k=10)
        self.assertEqual(list(output_ids.shape), [1, 2])

    def test_mismatching_image_tokens(self):
        # Two image-context tokens but only one visual token must be rejected.
        model = InternVLChatModel(self.get_config()).eval()
        input_ids = paddle.to_tensor([[10, 151, 151, 12]], dtype="int64")
        pixel_values = paddle.randn([1, 3, 28, 28])
        with self.assertRaisesRegex(ValueError, "does not match"):
            model(input_ids=input_ids, pixel_values=pixel_values, use_cache=False)
class InternVLProcessorTest(unittest.TestCase):
    """Tests for InternVL image tiling and image-placeholder expansion."""

    def test_dynamic_image_preprocess(self):
        image_processor = InternVLImageProcessor(size={"height": 28, "width": 28}, max_patches=2)
        image = Image.new("RGB", (40, 20), (127, 64, 32))

        outputs = image_processor(images=image, return_tensors="pd")

        self.assertEqual(outputs["num_patches_list"], [3])
        self.assertEqual(list(outputs["pixel_values"].shape), [3, 3, 28, 28])
        self.assertEqual(list(outputs["image_flags"].shape), [3, 1])

    def test_expand_image_tokens(self):
        # Bypass __init__ so no tokenizer or image processor is needed.
        processor = InternVLProcessor.__new__(InternVLProcessor)
        processor.image_seq_length = 4
        # The special tokens must be distinct, non-empty strings: with empty
        # strings the placeholder matches everywhere and the counts below are
        # meaningless.
        processor.image_token = "<image>"
        processor.img_start_token = "<img>"
        processor.img_end_token = "</img>"
        processor.img_context_token = "<IMG_CONTEXT>"

        expanded = processor._expand_image_tokens(["<image>\nDescribe."], [3])

        self.assertEqual(expanded[0].count("<img>"), 1)
        self.assertEqual(expanded[0].count("</img>"), 1)
        # image_seq_length (4) * number of tiles (3)
        self.assertEqual(expanded[0].count("<IMG_CONTEXT>"), 12)


if __name__ == "__main__":
    unittest.main()