Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions paddleformers/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,10 @@
"paddleocr_vl.modeling": ["PaddleOCRVLForConditionalGeneration"],
"paddleocr_vl.image_processor": ["PaddleOCRVLImageProcessor"],
"paddleocr_vl.processor": ["PaddleOCRVLProcessor"],
"internvl3_5.configuration": ["InternVisionConfig", "InternVLChatConfig"],
"internvl3_5.modeling": ["InternVisionModel", "InternVLChatModel"],
"internvl3_5.image_processor": ["InternVLImageProcessor"],
"internvl3_5.processor": ["InternVLProcessor"],
"gpt_oss.configuration": ["GptOssConfig"],
"gpt_oss.modeling": ["GptOssModel", "GptOssForCausalLM", "GptOssForCausalLMPipe"],
"kimi_k25.vision_processor": ["KimiK25VisionProcessor"],
Expand Down
5 changes: 5 additions & 0 deletions paddleformers/transformers/auto/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@
("gemma3_text", "Gemma3TextConfig"),
("glm4v_moe", "Glm4vMoeConfig"),
("glm_ocr", "GlmOcrConfig"),
("internvl_chat", "InternVLChatConfig"),
("intern_vit_6b", "InternVisionConfig"),
("qwen3_5", "Qwen3_5Config"),
("qwen3_5_moe", "Qwen3_5MoEConfig"),
]
Expand Down Expand Up @@ -89,6 +91,7 @@
("qwen3_vl_moe", "Qwen3VLMoe"),
("qwen3_vl_moe_text", "Qwen3VLMoeText"),
("glm_ocr", "GlmOcrForConditionalGeneration"),
("internvl_chat", "InternVLChat"),
("qwen3_5_moe", "Qwen3_5MoEForConditionalGeneration"),
("qwen3_5", "Qwen3_5ForConditionalGeneration"),
]
Expand All @@ -104,6 +107,8 @@
("qwen2_5_vl_text", "qwen2_5_vl"),
("qwen3_vl_text", "qwen3_vl"),
("qwen3_vl_moe_text", "qwen3_vl_moe"),
("internvl_chat", "internvl3_5"),
("intern_vit_6b", "internvl3_5"),
]
)

Expand Down
2 changes: 2 additions & 0 deletions paddleformers/transformers/auto/image_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@
"qwen2_vl": ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast"),
"qwen3_vl": ("Qwen3VLImageProcessor", "Qwen3VLImageProcessorFast"),
"glm_ocr": ("Glm46VImageProcessor"),
"internvl_chat": ("InternVLImageProcessor"),
"intern_vit_6b": ("InternVLImageProcessor"),
}
)

Expand Down
1 change: 1 addition & 0 deletions paddleformers/transformers/auto/modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@
("Gemma3", "gemma3_text"),
("Glm4vMoe", "glm4v_moe"),
("GlmOcr", "glm_ocr"),
("InternVLChat", "internvl3_5"),
]
)

Expand Down
1 change: 1 addition & 0 deletions paddleformers/transformers/auto/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
("ernie4_5_moe_vl", "Ernie4_5_VLProcessor"),
("glm4v_moe", "Glm4vProcessor"),
("glm_ocr", "Glm46VProcessor"),
("internvl_chat", "InternVLProcessor"),
]
)

Expand Down
28 changes: 28 additions & 0 deletions paddleformers/transformers/internvl3_5/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");

import sys
from typing import TYPE_CHECKING

from ...utils.lazy_import import _LazyModule

# Public names exported by each submodule of this package. The mapping is
# handed to `_LazyModule` below (presumably so that a submodule is only
# imported when one of its names is first accessed -- confirm against
# `utils.lazy_import`).
import_structure = {
    "configuration": ["InternVisionConfig", "InternVLChatConfig"],
    "image_processor": ["InternVLImageProcessor"],
    "modeling": ["InternVisionModel", "InternVLChatModel"],
    "processor": ["InternVLProcessor"],
}

if not TYPE_CHECKING:
    # Runtime path: replace this package's entry in `sys.modules` with the
    # lazy proxy built from `import_structure`.
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        import_structure,
        module_spec=__spec__,
    )
else:
    # Static-analysis path: expose the real symbols to type checkers and IDEs.
    from .configuration import *
    from .image_processor import *
    from .modeling import *
    from .processor import *
119 changes: 119 additions & 0 deletions paddleformers/transformers/internvl3_5/configuration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2024 OpenGVLab. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");

import copy

from ..configuration_utils import PretrainedConfig
from ..qwen3.configuration import Qwen3Config

__all__ = ["InternVisionConfig", "InternVLChatConfig"]


class InternVisionConfig(PretrainedConfig):
    """Configuration container for the InternVL vision tower.

    Every keyword argument listed in the signature is stored, unvalidated, on
    the instance under the same name. Any additional keyword arguments are
    forwarded to `PretrainedConfig.__init__`.

    Registered under ``model_type = "intern_vit_6b"``; when nested inside a
    composite configuration it is looked up under the ``"vision_config"`` key
    (see `base_config_key`).
    """

    model_type = "intern_vit_6b"
    base_config_key = "vision_config"

    def __init__(
        self,
        num_channels=3,
        patch_size=14,
        image_size=224,
        qkv_bias=False,
        hidden_size=3200,
        num_attention_heads=25,
        intermediate_size=12800,
        qk_normalization=True,
        num_hidden_layers=48,
        use_flash_attn=True,
        hidden_act="gelu",
        norm_type="rms_norm",
        layer_norm_eps=1e-6,
        dropout=0.0,
        drop_path_rate=0.0,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=0.1,
        **kwargs,
    ):
        # The base class consumes the generic options first; the
        # vision-specific fields are then attached on top.
        super().__init__(**kwargs)

        # NOTE: the assignment order below is kept stable on purpose, since it
        # determines the key order of `self.__dict__`.

        # Layer widths and regularisation.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.dropout = dropout
        self.drop_path_rate = drop_path_rate

        # Depth and attention heads.
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Input geometry.
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size

        # Initialisation and attention dropout.
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout

        # Normalisation and activation choices.
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.norm_type = norm_type

        # Attention implementation switches.
        self.qkv_bias = qkv_bias
        self.qk_normalization = qk_normalization
        self.use_flash_attn = use_flash_attn


class InternVLChatConfig(PretrainedConfig):
    """Composite configuration pairing an `InternVisionConfig` with a `Qwen3Config`.

    Args:
        vision_config: An `InternVisionConfig`, a dict of its keyword
            arguments, or ``None`` (a default `InternVisionConfig` tagged with
            the ``InternVisionModel`` architecture is built).
        llm_config: A `Qwen3Config`, a dict of its keyword arguments, or
            ``None`` (a default `Qwen3Config` tagged with the
            ``Qwen3ForCausalLM`` architecture is built).
        **kwargs: Forwarded to `PretrainedConfig.__init__`.

    All remaining named arguments are stored verbatim as same-named
    attributes. ``tie_word_embeddings``, ``vocab_size``, ``hidden_size`` and the
    pad/eos/bos token ids are mirrored from the language-model configuration
    onto this object.
    """

    model_type = "internvl_chat"
    is_composition = True
    sub_configs = {"vision_config": InternVisionConfig, "llm_config": Qwen3Config}

    def __init__(
        self,
        vision_config=None,
        llm_config=None,
        use_backbone_lora=0,
        use_llm_lora=0,
        select_layer=-1,
        force_image_size=None,
        downsample_ratio=0.5,
        template=None,
        dynamic_image_size=False,
        use_thumbnail=False,
        ps_version="v1",
        min_dynamic_patch=1,
        max_dynamic_patch=6,
        img_context_token_id=151671,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Normalise the vision sub-config: None -> default dict -> config object.
        if vision_config is None:
            vision_config = {"architectures": ["InternVisionModel"]}
        if isinstance(vision_config, dict):
            vision_config = InternVisionConfig(**vision_config)
        self.vision_config = vision_config

        # Same normalisation for the language-model sub-config.
        if llm_config is None:
            llm_config = {"architectures": ["Qwen3ForCausalLM"]}
        if isinstance(llm_config, dict):
            llm_config = Qwen3Config(**llm_config)
        self.llm_config = llm_config

        self.use_backbone_lora = use_backbone_lora
        self.use_llm_lora = use_llm_lora
        self.select_layer = select_layer
        self.force_image_size = force_image_size
        self.downsample_ratio = downsample_ratio
        self.template = template
        self.dynamic_image_size = dynamic_image_size
        self.use_thumbnail = use_thumbnail
        self.ps_version = ps_version
        self.min_dynamic_patch = min_dynamic_patch
        self.max_dynamic_patch = max_dynamic_patch
        self.img_context_token_id = img_context_token_id

        # Mirror the language model's core fields onto the top-level config.
        language_config = self.llm_config
        self.tie_word_embeddings = language_config.tie_word_embeddings
        self.vocab_size = language_config.vocab_size
        self.hidden_size = language_config.hidden_size

        # Special-token ids: prefer the language model's value, otherwise keep
        # whatever this object already carries, otherwise None.
        for token_attr in ("pad_token_id", "eos_token_id", "bos_token_id"):
            current_value = getattr(self, token_attr, None)
            setattr(self, token_attr, getattr(language_config, token_attr, current_value))

    def to_dict(self, *args, **kwargs):
        """Return a plain-dict snapshot of this configuration.

        The instance ``__dict__`` is deep-copied, both sub-configurations are
        replaced by the result of their own ``to_dict()``, and ``model_type``
        is set from the class attribute. Positional and keyword arguments are
        accepted but not used.
        """
        serialized = copy.deepcopy(self.__dict__)
        serialized.update(
            vision_config=self.vision_config.to_dict(),
            llm_config=self.llm_config.to_dict(),
            model_type=self.__class__.model_type,
        )
        return serialized
164 changes: 164 additions & 0 deletions paddleformers/transformers/internvl3_5/image_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2024 OpenGVLab. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");

from typing import List, Optional, Union

import numpy as np
import paddle
from PIL import Image

from ..feature_extraction_utils import BatchFeature
from ..image_processing_utils import BaseImageProcessor
from ..image_utils import ImageInput, PILImageResampling, is_valid_image, to_numpy_array

__all__ = ["InternVLImageProcessor"]


IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]


def _to_pil_image(image):
    """Convert a supported image input into an RGB `PIL.Image.Image`.

    Args:
        image: A PIL image, or any object accepted by `to_numpy_array`.

    Returns:
        A PIL image in ``"RGB"`` mode.

    Notes:
        * A 3-D array whose first axis has length 1 or 3 is treated as
          channels-first and transposed to channels-last. This is a heuristic:
          a channels-last image that is exactly 1 or 3 pixels tall would be
          misread.
        * Non-``uint8`` arrays whose maximum is ``<= 1.0`` are assumed to be
          scaled to ``[0, 1]`` and are multiplied by 255 before the cast.
    """
    if isinstance(image, Image.Image):
        return image.convert("RGB")

    array = to_numpy_array(image)

    # Channels-first (C, H, W) -> channels-last (H, W, C).
    if array.ndim == 3 and array.shape[0] in (1, 3):
        array = np.transpose(array, (1, 2, 0))

    # Pillow cannot build an image from an (H, W, 1) array; drop the singleton
    # channel axis so single-channel inputs load as a 2-D greyscale image.
    if array.ndim == 3 and array.shape[-1] == 1:
        array = array[..., 0]

    if array.dtype != np.uint8:
        if array.max() <= 1.0:
            array = array * 255
        # Clip before casting: values outside [0, 255] would otherwise wrap
        # around when converted to uint8.
        array = np.clip(array, 0, 255).astype("uint8")

    return Image.fromarray(array).convert("RGB")


class InternVLImageProcessor(BaseImageProcessor):
    """Split images into square tiles and turn them into normalised arrays.

    Each input image is resized to the grid of ``image_size`` x ``image_size``
    tiles whose aspect ratio is closest to the image's own, cut into tiles,
    optionally followed by a whole-image thumbnail, and every tile is
    rescaled / normalised into a channels-first ``float32`` array.

    Args:
        do_resize: Resize every tile to the tile size in `_preprocess_tile`.
        size: ``{"height": ..., "width": ...}``; defaults to 448 x 448. Only
            ``size["height"]`` is read by this class (tiles are square).
        resample: Resampling filter passed to ``PIL.Image.Image.resize``.
        do_rescale: Multiply pixel values by ``rescale_factor``.
        rescale_factor: Scale applied when ``do_rescale`` is true.
        do_normalize: Apply ``(x - image_mean) / image_std`` per channel.
        image_mean: Per-channel mean; defaults to a copy of ``IMAGENET_MEAN``.
        image_std: Per-channel std; defaults to a copy of ``IMAGENET_STD``.
        do_convert_rgb: Stored on the instance. NOTE(review): it is not
            consulted by this class; `__call__` always converts to RGB via
            `_to_pil_image`.
        min_patches: Minimum number of grid tiles per image.
        max_patches: Maximum number of grid tiles per image.
        use_thumbnail: Append a whole-image thumbnail when an image yields
            more than one tile.
        **kwargs: Forwarded to `BaseImageProcessor.__init__`.
    """

    model_input_names = ["pixel_values", "num_patches_list", "image_flags"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Optional[dict] = None,
        resample: int = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_patches: int = 1,
        max_patches: int = 12,
        use_thumbnail: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.size = size if size is not None else {"height": 448, "width": 448}
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        # Copy the module-level defaults so that mutating one instance's
        # statistics cannot change the shared constants.
        self.image_mean = image_mean if image_mean is not None else list(IMAGENET_MEAN)
        self.image_std = image_std if image_std is not None else list(IMAGENET_STD)
        self.do_convert_rgb = do_convert_rgb
        self.min_patches = min_patches
        self.max_patches = max_patches
        self.use_thumbnail = use_thumbnail

    @staticmethod
    def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
        """Pick the ``(columns, rows)`` grid closest to ``aspect_ratio``.

        Args:
            aspect_ratio: Width / height of the source image.
            target_ratios: Candidate ``(columns, rows)`` pairs, in the order
                they should be considered.
            width: Source image width in pixels.
            height: Source image height in pixels.
            image_size: Side length of one square tile.

        Returns:
            The candidate with the smallest absolute aspect-ratio difference,
            or ``(1, 1)`` when ``target_ratios`` is empty. On an exact tie a
            later candidate replaces the current best only if the source image
            area exceeds half the area of that candidate's tile grid.
        """
        best_ratio_diff = float("inf")
        best_ratio = (1, 1)
        area = width * height
        for ratio in target_ratios:
            ratio_diff = abs(aspect_ratio - ratio[0] / ratio[1])
            if ratio_diff < best_ratio_diff:
                best_ratio_diff = ratio_diff
                best_ratio = ratio
            elif ratio_diff == best_ratio_diff:
                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                    best_ratio = ratio
        return best_ratio

    def dynamic_preprocess(self, image, min_num=None, max_num=None, image_size=None, use_thumbnail=None):
        """Resize ``image`` to the best-fitting tile grid and cut it into tiles.

        Args:
            image: A PIL image.
            min_num: Minimum tile count; defaults to ``self.min_patches``.
            max_num: Maximum tile count; defaults to ``self.max_patches``.
            image_size: Tile side length; defaults to ``self.size["height"]``.
            use_thumbnail: Whether to append a thumbnail of the whole image;
                defaults to ``self.use_thumbnail``.

        Returns:
            A list of PIL images: the grid tiles in row-major order, followed
            by an ``image_size`` x ``image_size`` thumbnail of the whole image
            when ``use_thumbnail`` is true and the grid has more than one tile.
        """
        min_num = min_num if min_num is not None else self.min_patches
        max_num = max_num if max_num is not None else self.max_patches
        image_size = image_size if image_size is not None else self.size["height"]
        use_thumbnail = use_thumbnail if use_thumbnail is not None else self.use_thumbnail

        orig_width, orig_height = image.size
        aspect_ratio = orig_width / orig_height

        # The outer `n` loop is redundant for the resulting set, but it is kept
        # so the insertion order (and with it the order of equal-area
        # candidates after the stable sort) is unchanged; that order affects
        # tie-breaking in `find_closest_aspect_ratio`.
        target_ratios = sorted(
            {
                (cols, rows)
                for n in range(min_num, max_num + 1)
                for cols in range(1, n + 1)
                for rows in range(1, n + 1)
                if min_num <= cols * rows <= max_num
            },
            key=lambda ratio: ratio[0] * ratio[1],
        )
        grid_cols, grid_rows = self.find_closest_aspect_ratio(
            aspect_ratio, target_ratios, orig_width, orig_height, image_size
        )

        resized_img = image.resize((image_size * grid_cols, image_size * grid_rows), resample=self.resample)

        processed_images = []
        for index in range(grid_cols * grid_rows):
            row, col = divmod(index, grid_cols)
            left = col * image_size
            top = row * image_size
            processed_images.append(resized_img.crop((left, top, left + image_size, top + image_size)))

        if use_thumbnail and len(processed_images) != 1:
            processed_images.append(image.resize((image_size, image_size), resample=self.resample))
        return processed_images

    def _preprocess_tile(self, image, image_size=None):
        """Convert one PIL tile into a channels-first ``float32`` array.

        Args:
            image: A PIL tile (expected to be RGB so the per-channel
                statistics broadcast over the last axis).
            image_size: Side length used when ``do_resize`` is true; defaults
                to ``self.size["height"]``.

        Returns:
            A ``numpy.ndarray`` of shape ``(channels, height, width)``.
        """
        if image_size is None:
            image_size = self.size["height"]
        if self.do_resize:
            image = image.resize((image_size, image_size), resample=self.resample)
        array = np.asarray(image).astype("float32")
        if self.do_rescale:
            array = array * self.rescale_factor
        if self.do_normalize:
            mean = np.asarray(self.image_mean, dtype="float32")
            std = np.asarray(self.image_std, dtype="float32")
            array = (array - mean) / std
        # (H, W, C) -> (C, H, W)
        return np.transpose(array, (2, 0, 1))

    def __call__(self, images: ImageInput = None, return_tensors=None, **kwargs):
        """Tile and normalise one image or a list of images.

        Args:
            images: A single image or a list/tuple of images. ``None`` yields
                an empty `BatchFeature`.
            return_tensors: When ``"pd"``, ``pixel_values`` and ``image_flags``
                are converted to Paddle tensors; any other value leaves them
                as NumPy arrays.
            **kwargs: Optional per-call overrides ``min_patches``,
                ``max_patches``, ``image_size`` and ``use_thumbnail``. They
                apply to every image in the call.

        Returns:
            A `BatchFeature` with:
              * ``pixel_values``: all tiles of all images stacked along the
                first axis, ``float32``.
              * ``num_patches_list``: number of tiles produced per input image.
              * ``image_flags``: ``int64`` ones of shape ``(total_tiles, 1)``.

        Raises:
            ValueError: If ``images`` is not an image or a non-empty list of
                images.
        """
        if images is None:
            return BatchFeature(data={})
        if is_valid_image(images):
            images = [images]
        if not isinstance(images, (list, tuple)) or not all(is_valid_image(image) for image in images):
            raise ValueError("InternVLImageProcessor expects an image or a list of images.")
        if not images:
            # np.stack would otherwise fail with an unrelated-looking message.
            raise ValueError("InternVLImageProcessor received an empty list of images.")

        # Resolve the overrides once, before the loop. Popping them per image
        # would consume them on the first image and silently fall back to the
        # instance defaults for all the others.
        min_patches = kwargs.pop("min_patches", self.min_patches)
        max_patches = kwargs.pop("max_patches", self.max_patches)
        image_size = kwargs.pop("image_size", self.size["height"])
        use_thumbnail = kwargs.pop("use_thumbnail", self.use_thumbnail)

        pixel_values = []
        num_patches_list = []
        for image in images:
            tiles = self.dynamic_preprocess(
                _to_pil_image(image),
                min_num=min_patches,
                max_num=max_patches,
                image_size=image_size,
                use_thumbnail=use_thumbnail,
            )
            num_patches_list.append(len(tiles))
            # Pass the effective tile size through so a per-call `image_size`
            # is not undone by a resize back to `self.size["height"]`.
            pixel_values.extend(self._preprocess_tile(tile, image_size=image_size) for tile in tiles)

        data = {
            "pixel_values": np.stack(pixel_values).astype("float32"),
            "num_patches_list": num_patches_list,
            "image_flags": np.ones([len(pixel_values), 1], dtype="int64"),
        }
        if return_tensors == "pd":
            data["pixel_values"] = paddle.to_tensor(data["pixel_values"])
            data["image_flags"] = paddle.to_tensor(data["image_flags"])
        return BatchFeature(data=data)
Loading