diff --git a/.github/workflows/model-unittest-gpu-ce-develop.yml b/.github/workflows/model-unittest-gpu-ce-develop.yml index 8bdf6fcd7ce..a8f4e4b595e 100644 --- a/.github/workflows/model-unittest-gpu-ce-develop.yml +++ b/.github/workflows/model-unittest-gpu-ce-develop.yml @@ -25,6 +25,7 @@ on: - 'qwen2' - 'gemma3_text' - 'paddleocr_vl' + - 'florence2' FLAGS_enable_CE: required: false default: 'CE_Develop_cu130_py312' @@ -455,4 +456,4 @@ jobs: echo "| Workflow | ${{ github.workflow }} |" >> $GITHUB_STEP_SUMMARY echo "| CE Mode | $MODE_$FLAGS_enable_CE |" >> $GITHUB_STEP_SUMMARY echo "| Time | $(date +%Y%m%d) |" >> $GITHUB_STEP_SUMMARY - echo "| Report | [Open Report](${PAGES_URL}) |" >> $GITHUB_STEP_SUMMARY \ No newline at end of file + echo "| Report | [Open Report](${PAGES_URL}) |" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/model-unittest-gpu-ce-release.yml b/.github/workflows/model-unittest-gpu-ce-release.yml index 96f5d597efe..d3ff170cf84 100644 --- a/.github/workflows/model-unittest-gpu-ce-release.yml +++ b/.github/workflows/model-unittest-gpu-ce-release.yml @@ -30,6 +30,7 @@ on: - 'qwen2' - 'gemma3_text' - 'paddleocr_vl' + - 'florence2' FLAGS_enable_CE: required: false default: 'CE_Release_cu129_py312_nightly' @@ -483,4 +484,4 @@ jobs: echo "| Workflow | ${{ github.workflow }} |" >> $GITHUB_STEP_SUMMARY echo "| CE Mode | $MODE_$FLAGS_enable_CE |" >> $GITHUB_STEP_SUMMARY echo "| Time | $(date +%Y%m%d) |" >> $GITHUB_STEP_SUMMARY - echo "| Report | [Open Report](${PAGES_URL}) |" >> $GITHUB_STEP_SUMMARY \ No newline at end of file + echo "| Report | [Open Report](${PAGES_URL}) |" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/model-unittest-gpu.yml b/.github/workflows/model-unittest-gpu.yml index a736c480446..4d1fdf281bb 100644 --- a/.github/workflows/model-unittest-gpu.yml +++ b/.github/workflows/model-unittest-gpu.yml @@ -23,6 +23,7 @@ on: - 'qwen2' - 'gemma3_text' - 'paddleocr_vl' + - 'florence2' - 'qwen2_moe' - 'qwen3_vl' - 'qwen3_vl_moe' 
### data
# JSONL example:
# {"messages":[{"role":"user","content":""},{"role":"assistant","content":"A cat."}],
# "images":["/path/to/image.jpg"]}
train_dataset_type: messages
eval_dataset_type: messages
train_dataset_path: ./florence2_train.jsonl
train_dataset_prob: "1.0"
eval_dataset_path: ./florence2_train.jsonl
eval_dataset_prob: "1.0"
max_seq_len: 1024
packing: false
mix_strategy: concat
template_backend: custom
template: florence2

### model
# Directory holding the Florence-2 checkpoint. Kept relative, like the dataset
# and output paths in this file, so the example does not depend on one
# developer's home directory; point it at your own copy of the weights.
# NOTE(review): `convert_from_hf: true` below suggests Hugging Face-format
# weights are expected here - confirm.
model_name_or_path: ./Florence-2-base
continue_training: true

### finetuning
stage: VL-SFT
fine_tuning: full
seed: 23
do_train: true
do_eval: false
per_device_train_batch_size: 1
max_steps: 300
save_strategy: "no"
logging_steps: 1
gradient_accumulation_steps: 1
output_dir: ./checkpoints/florence2-sft-full
disable_tqdm: true

### train
warmup_steps: 0
learning_rate: 1.0e-5

### performance
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
bf16: true
fp16: false
fp16_opt_level: O1
convert_from_hf: true
unified_checkpoint: false
save_checkpoint_format: "flex_checkpoint"
load_checkpoint_format: ""
@dataclass
class Florence2Plugin(BasePlugin):
    """Multimodal plugin for Florence-2 prompts: image-only, one image per prompt."""

    @override
    def process_messages(
        self,
        messages,
        images,
        videos,
        audios,
        mm_inputs,
        processor,
    ):
        """Return a deep copy of ``messages`` with image placeholders stripped.

        For user turns the cleaned text is additionally passed through
        ``processor._construct_prompts`` when the processor provides it.

        Raises:
            ValueError: if any video or audio input is given, or if the number
                of images is not exactly one.
        """
        # Generic validation runs first so its errors take precedence over the
        # Florence-2 specific ones below.
        self._validate_input(processor, images, videos, audios)
        self._validate_messages(messages, images, videos, audios)
        if videos or audios:
            raise ValueError("Florence-2 only supports image inputs.")
        if len(images) != 1:
            raise ValueError("Florence-2 supports exactly one image per prompt.")

        # Work on a copy so the caller's message dicts are never mutated.
        converted = deepcopy(messages)
        for turn in converted:
            text = turn["content"].replace(IMAGE_PLACEHOLDER, "").strip()
            is_user_turn = turn["role"] == "user"
            if is_user_turn and hasattr(processor, "_construct_prompts"):
                text = processor._construct_prompts([text])[0]
            turn["content"] = text
        return converted
a/paddleformers/transformers/__init__.py +++ b/paddleformers/transformers/__init__.py @@ -257,6 +257,14 @@ "qwen2_vl.processor": ["Qwen2VLProcessor"], "qwen2_vl.video_processor": ["Qwen2VLVideoProcessor"], "qwen2_vl.vision_process": ["process_vision_info"], + "florence2.configuration": ["Florence2Config", "Florence2LanguageConfig", "Florence2VisionConfig"], + "florence2.image_processor": ["Florence2ImageProcessor"], + "florence2.modeling": [ + "Florence2ForConditionalGeneration", + "Florence2LanguageForConditionalGeneration", + "Florence2VisionModel", + ], + "florence2.processor": ["Florence2Processor"], "qwen3.configuration": ["Qwen3Config"], "qwen3.modeling": [ "Qwen3Model", @@ -290,6 +298,7 @@ "llama": [], "qwen2": [], "glm_ocr": [], + "florence2": [], "qwen3": [], "deepseek_v3": [], "ernie4_5": ["Ernie4_5DecoderLayer", "Ernie4_5Model", "Ernie4_5_ForCausalLM"], diff --git a/paddleformers/transformers/auto/configuration.py b/paddleformers/transformers/auto/configuration.py index c04e1f34a5a..bb0a6385446 100644 --- a/paddleformers/transformers/auto/configuration.py +++ b/paddleformers/transformers/auto/configuration.py @@ -59,6 +59,7 @@ ("gpt_oss", "GptOssConfig"), ("phi3", "Phi3Config"), ("gemma3_text", "Gemma3TextConfig"), + ("florence2", "Florence2Config"), ("glm4v_moe", "Glm4vMoeConfig"), ("glm_ocr", "GlmOcrConfig"), ("qwen3_5", "Qwen3_5Config"), @@ -88,6 +89,7 @@ ("qwen3_vl_text", "Qwen3VL"), ("qwen3_vl_moe", "Qwen3VLMoe"), ("qwen3_vl_moe_text", "Qwen3VLMoeText"), + ("florence2", "Florence2ForConditionalGeneration"), ("glm_ocr", "GlmOcrForConditionalGeneration"), ("qwen3_5_moe", "Qwen3_5MoEForConditionalGeneration"), ("qwen3_5", "Qwen3_5ForConditionalGeneration"), diff --git a/paddleformers/transformers/auto/image_processing.py b/paddleformers/transformers/auto/image_processing.py index 4244259c0e1..3a8b137e85d 100644 --- a/paddleformers/transformers/auto/image_processing.py +++ b/paddleformers/transformers/auto/image_processing.py @@ -55,6 +55,7 @@ 
"glm4v_moe": ("Glm4vImageProcessor", "Glm4vImageProcessorFast"), "kimi_k25": ("KimiK25VisionProcessor"), "paddleocr_vl": ("PaddleOCRVLImageProcessor"), + "florence2": ("Florence2ImageProcessor"), "qwen2_5_vl": ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast"), "qwen2_vl": ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast"), "qwen3_vl": ("Qwen3VLImageProcessor", "Qwen3VLImageProcessorFast"), diff --git a/paddleformers/transformers/auto/modeling.py b/paddleformers/transformers/auto/modeling.py index 11321baba1f..ed1bbb10bad 100644 --- a/paddleformers/transformers/auto/modeling.py +++ b/paddleformers/transformers/auto/modeling.py @@ -78,6 +78,7 @@ ("GptOss", "gpt_oss"), ("Phi3", "phi3"), ("Gemma3", "gemma3_text"), + ("Florence2", "florence2"), ("Glm4vMoe", "glm4v_moe"), ("GlmOcr", "glm_ocr"), ] diff --git a/paddleformers/transformers/auto/processing.py b/paddleformers/transformers/auto/processing.py index bca898e350d..d8e401c7232 100644 --- a/paddleformers/transformers/auto/processing.py +++ b/paddleformers/transformers/auto/processing.py @@ -54,6 +54,7 @@ ("qwen2_vl", "Qwen2VLProcessor"), ("qwen3_omni_moe", "Qwen3OmniMoeProcessor"), ("paddleocr_vl", "PaddleOCRVLProcessor"), + ("florence2", "Florence2Processor"), ("ernie4_5_moe_vl", "Ernie4_5_VLProcessor"), ("glm4v_moe", "Glm4vProcessor"), ("glm_ocr", "Glm46VProcessor"), diff --git a/paddleformers/transformers/florence2/__init__.py b/paddleformers/transformers/florence2/__init__.py new file mode 100644 index 00000000000..48ad2e78dda --- /dev/null +++ b/paddleformers/transformers/florence2/__init__.py @@ -0,0 +1,28 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. 
# Maps each submodule of this package to the public names it exports; it is
# handed to _LazyModule below.
import_structure = {
    "configuration": ["Florence2Config", "Florence2LanguageConfig", "Florence2VisionConfig"],
    "image_processor": ["Florence2ImageProcessor"],
    "modeling": [
        "Florence2ForConditionalGeneration",
        "Florence2LanguageForConditionalGeneration",
        "Florence2VisionModel",
    ],
    "processor": ["Florence2Processor"],
}

if TYPE_CHECKING:
    # Only static type checkers take this branch and see the real imports.
    from .configuration import *
    from .image_processor import *
    from .modeling import *
    from .processor import *
else:
    # At runtime this package's entry in sys.modules is replaced by a
    # _LazyModule built from import_structure.
    # NOTE(review): presumably submodules are then imported on first attribute
    # access - confirm in utils.lazy_import.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], import_structure, module_spec=__spec__)
class Florence2VisionConfig(PretrainedConfig):
    """Configuration of the Florence-2 vision tower (``model_type = "davit"``).

    The per-stage sequence arguments (``patch_size``, ``patch_stride``,
    ``patch_padding``, ``patch_prenorm``, ``dim_embed``, ``num_heads``,
    ``num_groups``, ``depths``) and ``image_feature_source`` are stored as
    lists, whatever sequence type was passed in.
    """

    model_type = "davit"
    base_config_key = "vision_config"

    def __init__(
        self,
        drop_path_rate=0.1,
        patch_size=(7, 3, 3, 3),
        patch_stride=(4, 2, 2, 2),
        patch_padding=(3, 1, 1, 1),
        patch_prenorm=(False, True, True, True),
        enable_checkpoint=False,
        dim_embed=(128, 256, 512, 1024),
        num_heads=(4, 8, 16, 32),
        num_groups=(4, 8, 16, 32),
        depths=(1, 1, 9, 1),
        window_size=12,
        projection_dim=768,
        visual_temporal_embedding=None,
        image_pos_embed=None,
        image_feature_source=("spatial_avg_pool", "temporal_avg_pool"),
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.drop_path_rate = drop_path_rate
        # Tuple defaults keep the signature free of mutable defaults; each
        # instance gets its own list copy.
        self.patch_size = list(patch_size)
        self.patch_stride = list(patch_stride)
        self.patch_padding = list(patch_padding)
        self.patch_prenorm = list(patch_prenorm)
        self.enable_checkpoint = enable_checkpoint
        self.dim_embed = list(dim_embed)
        self.num_heads = list(num_heads)
        self.num_groups = list(num_groups)
        self.depths = list(depths)
        self.window_size = window_size
        self.projection_dim = projection_dim
        # `or` means any falsy value (None or an empty dict) selects the default.
        self.visual_temporal_embedding = visual_temporal_embedding or {
            "type": "COSINE",
            "max_temporal_embeddings": 100,
        }
        self.image_pos_embed = image_pos_embed or {"type": "learned_abs_2d", "max_pos_embeddings": 50}
        self.image_feature_source = list(image_feature_source)


class Florence2LanguageConfig(PretrainedConfig):
    """Configuration of the Florence-2 encoder-decoder language model.

    ``attribute_map`` aliases ``num_attention_heads`` to
    ``encoder_attention_heads`` and ``hidden_size`` to ``d_model``.
    """

    model_type = "florence2_language"
    base_config_key = "text_config"
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    def __init__(
        self,
        vocab_size=51289,
        max_position_embeddings=1024,
        encoder_layers=6,
        encoder_ffn_dim=3072,
        encoder_attention_heads=12,
        decoder_layers=6,
        decoder_ffn_dim=3072,
        decoder_attention_heads=12,
        encoder_layerdrop=0.0,
        decoder_layerdrop=0.0,
        activation_function="gelu",
        d_model=768,
        dropout=0.1,
        attention_dropout=0.1,
        activation_dropout=0.1,
        init_std=0.02,
        classifier_dropout=0.0,
        scale_embedding=False,
        use_cache=True,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        decoder_start_token_id=2,
        forced_eos_token_id=2,
        is_encoder_decoder=True,
        **kwargs,
    ):
        # Special-token ids and the encoder-decoder flag are forwarded to the
        # base class rather than stored here.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            decoder_start_token_id=decoder_start_token_id,
            forced_eos_token_id=forced_eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.encoder_layers = encoder_layers
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_layers = decoder_layers
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_attention_heads = decoder_attention_heads
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.activation_function = activation_function
        self.d_model = d_model
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.init_std = init_std
        self.classifier_dropout = classifier_dropout
        self.scale_embedding = scale_embedding
        self.use_cache = use_cache
        # num_hidden_layers mirrors the encoder depth only.
        self.num_hidden_layers = encoder_layers


class Florence2Config(PretrainedConfig):
    """Top-level Florence-2 configuration combining a vision and a text sub-config.

    ``vision_config`` / ``text_config`` may each be a dict (expanded into the
    matching sub-config class), an already-built config object, or a falsy
    value, in which case the sub-config is built with its defaults.
    """

    model_type = "florence2"
    tokenizer_class = "BartTokenizer"
    keys_to_ignore_at_inference = ["past_key_values"]
    sub_configs = {"vision_config": Florence2VisionConfig, "text_config": Florence2LanguageConfig}

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        ignore_index=-100,
        vocab_size=51289,
        projection_dim=768,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        is_encoder_decoder=True,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            **kwargs,
        )
        # dict -> sub-config instance; object -> used as is; falsy -> defaults.
        self.vision_config = (
            Florence2VisionConfig(**vision_config) if isinstance(vision_config, dict) else vision_config
        ) or Florence2VisionConfig()
        self.text_config = (
            Florence2LanguageConfig(**text_config) if isinstance(text_config, dict) else text_config
        ) or Florence2LanguageConfig()
        self.ignore_index = ignore_index
        self.vocab_size = vocab_size
        self.projection_dim = projection_dim
        self.tokenizer_class = "BartTokenizer"
        # Generation-related settings are copied from the text sub-config onto
        # the top-level config.
        self.decoder_start_token_id = self.text_config.decoder_start_token_id
        self.forced_eos_token_id = self.text_config.forced_eos_token_id
        self.use_cache = self.text_config.use_cache
def _expand_mask(mask, dtype, target_length=None):
    """Turn a 2-D padding mask into an additive 4-D attention mask.

    Args:
        mask: tensor indexed as ``[batch, source_length]``; entries greater
            than zero mark positions that may be attended to.
        dtype: floating dtype of the returned mask.
        target_length: size of the query axis; a falsy value (``None`` or 0)
            falls back to the source length.

    Returns:
        Tensor of shape ``[batch, 1, target_length, source_length]`` holding
        0 at allowed positions and ``finfo(dtype).min`` elsewhere.
    """
    target_length = target_length or mask.shape[-1]
    expanded = mask[:, None, None, :].expand([mask.shape[0], 1, target_length, mask.shape[-1]]).astype(dtype)
    return paddle.where(expanded > 0, paddle.zeros_like(expanded), paddle.full_like(expanded, paddle.finfo(dtype).min))


def _causal_mask(batch_size, target_length, past_length, dtype):
    """Build an additive causal mask that accounts for cached positions.

    Query row ``i`` may attend to key columns ``0 .. i + past_length``.

    Returns:
        Tensor of shape ``[batch_size, 1, target_length, target_length + past_length]``
        holding 0 at allowed positions and ``finfo(dtype).min`` elsewhere.
    """
    total_length = target_length + past_length
    rows = paddle.arange(target_length)[:, None] + past_length
    cols = paddle.arange(total_length)[None, :]
    allowed = cols <= rows
    mask = paddle.where(
        allowed,
        paddle.zeros([target_length, total_length], dtype=dtype),
        paddle.full([target_length, total_length], paddle.finfo(dtype).min, dtype=dtype),
    )
    return mask[None, None, :, :].expand([batch_size, 1, target_length, total_length])


def shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id):
    """Shift token ids one position to the right to form decoder inputs.

    Position 0 becomes ``decoder_start_token_id``, the last token of every row
    is dropped, and any ``-100`` (label ignore value) left in the shifted
    result is replaced by ``pad_token_id``.

    Raises:
        ValueError: if ``pad_token_id`` or ``decoder_start_token_id`` is
            ``None``. Both are required, and failing here gives a clear
            message instead of an obscure tensor-op error further down.
    """
    if pad_token_id is None:
        raise ValueError("pad_token_id has to be defined to shift tokens right.")
    if decoder_start_token_id is None:
        raise ValueError("decoder_start_token_id has to be defined to shift tokens right.")
    shifted = paddle.zeros_like(input_ids)
    shifted[:, 1:] = input_ids[:, :-1].clone()
    shifted[:, 0] = decoder_start_token_id
    return paddle.where(shifted == -100, paddle.full_like(shifted, pad_token_id), shifted)
class DropPath(nn.Layer):
    """Stochastic depth: randomly zero whole samples while training.

    Acts as the identity when ``drop_prob`` is 0 or the layer is in eval mode.
    """

    def __init__(self, drop_prob=0.0):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        if self.drop_prob == 0.0 or not self.training:
            return x
        keep_prob = 1.0 - self.drop_prob
        # One random draw per entry of the first axis, broadcast over the rest.
        shape = [x.shape[0]] + [1] * (x.ndim - 1)
        random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
        # floor() turns the draw into a 0/1 keep mask; dividing by keep_prob
        # rescales the kept samples.
        return x / keep_prob * paddle.floor(random_tensor)


class LearnedAbsolutePositionEmbedding2D(nn.Layer):
    """Learned 2-D position embedding built from row and column tables.

    The row table has ``embedding_dim // 2`` channels and the column table has
    the remainder, so the concatenation is ``embedding_dim`` wide.
    """

    def __init__(self, embedding_dim=256, num_pos=50):
        super().__init__()
        self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2)
        self.column_embeddings = nn.Embedding(num_pos, embedding_dim - embedding_dim // 2)

    def forward(self, pixel_values):
        """Return position embeddings of shape ``[batch, height, width, embedding_dim]``.

        Height and width are read from axes 1 and 2 of ``pixel_values``, so
        the input is indexed as ``[batch, height, width, ...]``.
        """
        height, width = pixel_values.shape[1:3]
        x_emb = self.column_embeddings(paddle.arange(width))
        y_emb = self.row_embeddings(paddle.arange(height))
        # Column embedding first, then row embedding, along the last axis.
        pos = paddle.concat(
            [
                x_emb.unsqueeze(0).tile([height, 1, 1]),
                y_emb.unsqueeze(1).tile([1, width, 1]),
            ],
            axis=-1,
        )
        return pos.unsqueeze(0).tile([pixel_values.shape[0], 1, 1, 1])


class PositionalEmbeddingCosine1D(nn.Layer):
    """Fixed sinusoidal 1-D position table (sine on even, cosine on odd channels)."""

    def __init__(self, embed_dim=512, max_seq_len=1024):
        super().__init__()
        denominator = paddle.exp(-math.log(10000) * paddle.arange(0, embed_dim, 2, dtype="float32") / embed_dim)
        frequencies = paddle.arange(max_seq_len, dtype="float32").reshape([max_seq_len, 1]) * denominator
        values = paddle.zeros([max_seq_len, embed_dim])
        values[:, 0::2] = paddle.sin(frequencies)
        values[:, 1::2] = paddle.cos(frequencies)
        # Registered as a persistable buffer, not a parameter.
        self.register_buffer("pos_idx_to_embed", values, persistable=True)

    def forward(self, seq_embeds):
        """Return the first ``seq_embeds.shape[-2]`` rows of the table.

        A leading axis of size 1 is added when ``seq_embeds`` is 3-D.
        """
        values = self.pos_idx_to_embed[: seq_embeds.shape[-2]]
        return values.unsqueeze(0) if seq_embeds.ndim == 3 else values


class PreNorm(nn.Layer):
    """Residual wrapper: ``x + drop_path(fn(norm(x)))``.

    ``norm`` and ``drop_path`` may each be ``None``, in which case that step
    is skipped. ``fn`` must return an ``(output, size)`` pair.
    """

    def __init__(self, norm, fn, drop_path=None):
        super().__init__()
        self.norm = norm
        self.fn = fn
        self.drop_path = drop_path

    def forward(self, x, *args):
        shortcut = x
        x, size = self.fn(self.norm(x) if self.norm is not None else x, *args)
        if self.drop_path is not None:
            x = self.drop_path(x)
        return shortcut + x, size
class MlpNet(nn.Layer):
    """Two-layer feed-forward network: Linear -> GELU -> Linear."""

    def __init__(self, in_features, hidden_features):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, in_features)

    def forward(self, x):
        return self.fc2(self.act(self.fc1(x)))


class Mlp(nn.Layer):
    """Adapter giving ``MlpNet`` the ``(x, size) -> (x, size)`` block interface."""

    def __init__(self, in_features, hidden_features):
        super().__init__()
        self.net = MlpNet(in_features, hidden_features)

    def forward(self, x, size):
        # size is passed through unchanged.
        return self.net(x), size


class DepthWiseConv2d(nn.Layer):
    """Depth-wise 2-D convolution (``groups == dim_in``) applied to a token sequence."""

    def __init__(self, dim_in, kernel_size, padding, stride):
        super().__init__()
        self.dw = nn.Conv2D(dim_in, dim_in, kernel_size, stride=stride, padding=padding, groups=dim_in)

    def forward(self, x, size):
        """Convolve tokens ``[batch, height * width, channels]`` on their 2-D grid.

        Returns the re-flattened tokens and the output ``(height, width)``.
        """
        batch_size, _, channels = x.shape
        height, width = size
        # Tokens -> [batch, channels, height, width] for the convolution.
        x = self.dw(x.transpose([0, 2, 1]).reshape([batch_size, channels, height, width]))
        size = (x.shape[-2], x.shape[-1])
        return x.flatten(2).transpose([0, 2, 1]), size


class ConvEmbed(nn.Layer):
    """Convolutional patch embedding with a LayerNorm before or after the projection.

    With ``pre_norm`` the norm has ``in_chans`` features and runs before the
    convolution; otherwise it has ``embed_dim`` features and runs after it.
    """

    def __init__(self, patch_size, in_chans, embed_dim, stride, padding, pre_norm):
        super().__init__()
        self.proj = nn.Conv2D(in_chans, embed_dim, patch_size, stride=stride, padding=padding)
        self.norm = nn.LayerNorm(in_chans if pre_norm else embed_dim)
        self.pre_norm = pre_norm

    def forward(self, x, size):
        """Embed ``x`` and return ``(tokens, (height, width))``.

        ``x`` is either a 3-D token sequence, reshaped to an image using
        ``size``, or an already image-shaped tensor passed straight to the
        convolution.
        """
        height, width = size
        if x.ndim == 3:
            # The pre-norm only runs on token input; an image-shaped input
            # skips it.
            if self.pre_norm:
                x = self.norm(x)
            x = x.reshape([x.shape[0], height, width, x.shape[-1]]).transpose([0, 3, 1, 2])
        x = self.proj(x)
        height, width = x.shape[-2:]
        x = x.flatten(2).transpose([0, 2, 1])
        if not self.pre_norm:
            x = self.norm(x)
        return x, (height, width)


class ChannelAttention(nn.Layer):
    """Grouped attention over channels: the attention matrix is channel-by-channel within each group."""

    def __init__(self, dim, groups):
        super().__init__()
        self.groups = groups
        self.qkv = nn.Linear(dim, dim * 3)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x, size):
        batch_size, num_tokens, channels = x.shape
        qkv = self.qkv(x).reshape([batch_size, num_tokens, 3, self.groups, channels // self.groups])
        # q, k, v: [batch, groups, num_tokens, channels // groups]
        q, k, v = paddle.unbind(qkv.transpose([2, 0, 3, 1, 4]), axis=0)
        # The scale uses the token count, not the head dimension, because the
        # product is contracted over the token axis.
        attention = paddle.matmul(
            (q * (float(num_tokens) ** -0.5)).transpose([0, 1, 3, 2]),
            k,
        )
        attention = F.softmax(attention, axis=-1)
        x = paddle.matmul(attention, v.transpose([0, 1, 3, 2])).transpose([0, 1, 3, 2])
        x = self.proj(x.transpose([0, 2, 1, 3]).reshape([batch_size, num_tokens, channels]))
        return x, size
def window_partition(x, window_size):
    """Split a ``[batch, height, width, channels]`` tensor into square windows.

    ``height`` and ``width`` are expected to be multiples of ``window_size``.

    Returns:
        Tensor of shape ``[num_windows * batch, window_size, window_size, channels]``,
        ordered by batch, then window row, then window column.
    """
    batch_size, height, width, channels = x.shape
    rows = height // window_size
    cols = width // window_size
    grid = x.reshape([batch_size, rows, window_size, cols, window_size, channels])
    # Bring the two window-index axes together ahead of the in-window axes.
    grid = grid.transpose([0, 1, 3, 2, 4, 5])
    return grid.reshape([-1, window_size, window_size, channels])


def window_reverse(windows, batch_size, window_size, height, width):
    """Inverse of ``window_partition``: reassemble windows into ``[batch, height, width, channels]``."""
    channels = windows.shape[-1]
    rows = height // window_size
    cols = width // window_size
    grid = windows.reshape([batch_size, rows, cols, window_size, window_size, channels])
    # Interleave window rows with in-window rows (and likewise for columns).
    grid = grid.transpose([0, 1, 3, 2, 4, 5])
    return grid.reshape([batch_size, height, width, channels])
class WindowAttention(nn.Layer):
    """Multi-head self-attention computed inside non-overlapping square windows."""

    def __init__(self, dim, num_heads, window_size):
        super().__init__()
        self.window_size = window_size
        self.num_heads = num_heads
        self.scale = float(dim // num_heads) ** -0.5
        self.qkv = nn.Linear(dim, dim * 3)
        self.proj = nn.Linear(dim, dim)

    def forward(self, x, size):
        """Attend within windows over tokens ``[batch, height * width, channels]``.

        Returns the tokens in the same shape together with the unchanged ``size``.
        """
        height, width = size
        batch_size, _, channels = x.shape
        x = x.reshape([batch_size, height, width, channels])
        # Pad on the right/bottom so both spatial sizes are multiples of the
        # window size.
        pad_right = (self.window_size - width % self.window_size) % self.window_size
        pad_bottom = (self.window_size - height % self.window_size) % self.window_size
        if pad_right or pad_bottom:
            # Padding is applied in channels-first layout, then moved back.
            x = F.pad(x.transpose([0, 3, 1, 2]), [0, pad_right, 0, pad_bottom]).transpose([0, 2, 3, 1])
        padded_height, padded_width = x.shape[1:3]
        x = window_partition(x, self.window_size).reshape([-1, self.window_size**2, channels])
        qkv = self.qkv(x).reshape([-1, self.window_size**2, 3, self.num_heads, channels // self.num_heads])
        # q, k, v: [windows, heads, tokens_per_window, head_dim]
        q, k, v = paddle.unbind(qkv.transpose([2, 0, 3, 1, 4]), axis=0)
        attention = F.softmax(paddle.matmul(q * self.scale, k.transpose([0, 1, 3, 2])), axis=-1)
        x = paddle.matmul(attention, v).transpose([0, 2, 1, 3]).reshape([-1, self.window_size**2, channels])
        x = self.proj(x).reshape([-1, self.window_size, self.window_size, channels])
        x = window_reverse(x, batch_size, self.window_size, padded_height, padded_width)
        # Crop the padding away before flattening back to tokens.
        return x[:, :height, :width].reshape([batch_size, height * width, channels]), size


class SpatialBlock(nn.Layer):
    """Depth-wise conv -> window attention -> depth-wise conv -> MLP, each with a residual."""

    def __init__(self, dim, num_heads, window_size, drop_path_rate):
        super().__init__()
        # One DropPath instance is shared by the attention and the MLP branch.
        drop_path = DropPath(drop_path_rate) if drop_path_rate > 0 else None
        self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1))
        self.window_attn = PreNorm(nn.LayerNorm(dim), WindowAttention(dim, num_heads, window_size), drop_path)
        self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1))
        self.ffn = PreNorm(nn.LayerNorm(dim), Mlp(dim, dim * 4), drop_path)

    def forward(self, x, size):
        x, size = self.conv1(x, size)
        x, size = self.window_attn(x, size)
        x, size = self.conv2(x, size)
        return self.ffn(x, size)


class ChannelBlock(nn.Layer):
    """Depth-wise conv -> channel attention -> depth-wise conv -> MLP, each with a residual."""

    def __init__(self, dim, groups, drop_path_rate):
        super().__init__()
        # One DropPath instance is shared by the attention and the MLP branch.
        drop_path = DropPath(drop_path_rate) if drop_path_rate > 0 else None
        self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1))
        self.channel_attn = PreNorm(nn.LayerNorm(dim), ChannelAttention(dim, groups), drop_path)
        self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1))
        self.ffn = PreNorm(nn.LayerNorm(dim), Mlp(dim, dim * 4), drop_path)

    def forward(self, x, size):
        x, size = self.conv1(x, size)
        x, size = self.channel_attn(x, size)
        x, size = self.conv2(x, size)
        return self.ffn(x, size)
class DaViTBlock(nn.Layer):
    """A spatial (window-attention) block followed by a channel-attention block."""

    def __init__(self, dim, num_heads, groups, window_size, spatial_drop, channel_drop):
        super().__init__()
        self.spatial_block = SpatialBlock(dim, num_heads, window_size, spatial_drop)
        self.channel_block = ChannelBlock(dim, groups, channel_drop)

    def forward(self, x, size):
        x, size = self.spatial_block(x, size)
        return self.channel_block(x, size)


class DaViT(nn.Layer):
    """Multi-stage vision backbone: each stage is a ``ConvEmbed`` followed by ``DaViTBlock``s.

    The number of stages and their sizes come from the per-stage lists on
    ``config`` (``depths``, ``dim_embed``, ``patch_*``, ``num_heads``,
    ``num_groups``).
    """

    def __init__(self, config):
        super().__init__()
        # Drop-path rates rise linearly from 0 to config.drop_path_rate; every
        # DaViTBlock consumes two consecutive rates (spatial, then channel).
        dpr = paddle.linspace(0, config.drop_path_rate, sum(config.depths) * 2).tolist()
        self.convs = nn.LayerList()
        self.blocks = nn.LayerList()
        offset = 0
        for index, depth in enumerate(config.depths):
            self.convs.append(
                ConvEmbed(
                    config.patch_size[index],
                    # The first stage takes 3 input channels; later stages
                    # take the previous stage's width.
                    3 if index == 0 else config.dim_embed[index - 1],
                    config.dim_embed[index],
                    config.patch_stride[index],
                    config.patch_padding[index],
                    config.patch_prenorm[index],
                )
            )
            self.blocks.append(
                nn.LayerList(
                    [
                        DaViTBlock(
                            config.dim_embed[index],
                            config.num_heads[index],
                            config.num_groups[index],
                            config.window_size,
                            dpr[offset + layer * 2],
                            dpr[offset + layer * 2 + 1],
                        )
                        for layer in range(depth)
                    ]
                )
            )
            offset += depth * 2

    def forward_features_unpool(self, x):
        """Run all stages and return the final token sequence without pooling.

        The initial spatial size is read from axes 2 and 3 of ``x``.
        """
        size = (x.shape[2], x.shape[3])
        for conv, blocks in zip(self.convs, self.blocks):
            x, size = conv(x, size)
            for block in blocks:
                x, size = block(x, size)
        return x
class Florence2Attention(nn.Layer):
    """Multi-head attention used for self-attention and encoder-decoder cross-attention."""

    def __init__(self, embed_dim, num_heads, dropout=0.0, is_decoder=False):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def _shape(self, tensor, seq_len, batch_size):
        """Reshape ``[batch, seq_len, embed_dim]`` to ``[batch, num_heads, seq_len, head_dim]``."""
        return tensor.reshape([batch_size, seq_len, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3])

    def forward(
        self,
        hidden_states,
        key_value_states=None,
        past_key_value=None,
        attention_mask=None,
        layer_head_mask=None,
        output_attentions=False,
    ):
        """Compute attention for ``hidden_states``.

        Args:
            hidden_states: query input, ``[batch, target_length, embed_dim]``.
            key_value_states: when given, keys/values are projected from it
                (cross-attention) instead of from ``hidden_states``.
            past_key_value: optional cached ``(key, value)`` pair.
            attention_mask: optional additive mask, added to the raw scores.
            layer_head_mask: optional per-head multiplier applied after softmax.
            output_attentions: whether to return the attention weights.

        Returns:
            ``(output, attention_weights_or_None, present)``; ``present`` is
            the ``(key, value)`` pair for caching when ``is_decoder`` is set,
            else ``None``.
        """
        is_cross_attention = key_value_states is not None
        batch_size, target_length, _ = hidden_states.shape
        # Queries are scaled before the head split.
        query_states = self._shape(self.q_proj(hidden_states) * self.scaling, target_length, batch_size)
        if (
            is_cross_attention
            and past_key_value is not None
            and past_key_value[0].shape[2] == key_value_states.shape[1]
        ):
            # Cross-attention cache hit: the cached length matches the encoder
            # length, so the cache is reused.
            key_states, value_states = past_key_value
        elif is_cross_attention:
            key_states = self._shape(self.k_proj(key_value_states), key_value_states.shape[1], batch_size)
            value_states = self._shape(self.v_proj(key_value_states), key_value_states.shape[1], batch_size)
        else:
            key_states = self._shape(self.k_proj(hidden_states), target_length, batch_size)
            value_states = self._shape(self.v_proj(hidden_states), target_length, batch_size)
            if past_key_value is not None:
                # Self-attention with a cache: append new keys/values along
                # the sequence axis.
                key_states = paddle.concat([past_key_value[0], key_states], axis=2)
                value_states = paddle.concat([past_key_value[1], value_states], axis=2)
        present = (key_states, value_states) if self.is_decoder else None
        attn_weights = paddle.matmul(query_states, key_states.transpose([0, 1, 3, 2]))
        if attention_mask is not None:
            attn_weights = attn_weights + attention_mask
        attn_weights = F.softmax(attn_weights, axis=-1)
        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask.reshape([1, -1, 1, 1])
        # The returned weights are the pre-dropout probabilities.
        attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training)
        output = paddle.matmul(attn_probs, value_states).transpose([0, 2, 1, 3])
        output = self.out_proj(output.reshape([batch_size, target_length, self.embed_dim]))
        return output, attn_weights if output_attentions else None, present
class Florence2EncoderLayer(nn.Layer):
    """Post-norm Transformer encoder layer: self-attention and feed-forward, each followed by LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.self_attn = Florence2Attention(config.d_model, config.encoder_attention_heads, config.attention_dropout)
        self.self_attn_layer_norm = nn.LayerNorm(config.d_model)
        self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, config.d_model)
        self.final_layer_norm = nn.LayerNorm(config.d_model)
        self.activation_fn = ACT2FN[config.activation_function]
        self.dropout = config.dropout
        self.activation_dropout = config.activation_dropout

    def forward(self, hidden_states, attention_mask=None, layer_head_mask=None, output_attentions=False):
        """Return ``(hidden_states,)`` or ``(hidden_states, attention_weights)`` when requested."""
        residual = hidden_states
        hidden_states, weights, _ = self.self_attn(
            hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        # The norm is applied after the residual add (post-norm).
        hidden_states = self.self_attn_layer_norm(
            residual + F.dropout(hidden_states, p=self.dropout, training=self.training)
        )
        residual = hidden_states
        hidden_states = self.fc2(
            F.dropout(self.activation_fn(self.fc1(hidden_states)), p=self.activation_dropout, training=self.training)
        )
        hidden_states = self.final_layer_norm(
            residual + F.dropout(hidden_states, p=self.dropout, training=self.training)
        )
        return (hidden_states, weights) if output_attentions else (hidden_states,)
layer_head_mask=None, + cross_attn_layer_head_mask=None, + past_key_value=None, + output_attentions=False, + use_cache=True, + ): + residual = hidden_states + hidden_states, self_weights, present = self.self_attn( + hidden_states, + past_key_value=past_key_value[:2] if past_key_value is not None else None, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = self.self_attn_layer_norm( + residual + F.dropout(hidden_states, p=self.dropout, training=self.training) + ) + cross_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + hidden_states, cross_weights, cross_present = self.encoder_attn( + hidden_states, + key_value_states=encoder_hidden_states, + past_key_value=past_key_value[-2:] if past_key_value is not None else None, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = self.encoder_attn_layer_norm( + residual + F.dropout(hidden_states, p=self.dropout, training=self.training) + ) + present = present + cross_present + residual = hidden_states + hidden_states = self.fc2( + F.dropout(self.activation_fn(self.fc1(hidden_states)), p=self.activation_dropout, training=self.training) + ) + hidden_states = self.final_layer_norm( + residual + F.dropout(hidden_states, p=self.dropout, training=self.training) + ) + outputs = (hidden_states,) + if output_attentions: + outputs += (self_weights, cross_weights) + if use_cache: + outputs += (present,) + return outputs + + +class Florence2LearnedPositionalEmbedding(nn.Embedding): + def __init__(self, num_embeddings, embedding_dim): + self.offset = 2 + super().__init__(num_embeddings + self.offset, embedding_dim) + + def forward(self, input_ids, past_key_values_length=0): + positions = paddle.arange(past_key_values_length, past_key_values_length + input_ids.shape[1], dtype="int64") + return 
super().forward(positions.unsqueeze(0).expand([input_ids.shape[0], -1]) + self.offset) + + +class Florence2Encoder(nn.Layer): + def __init__(self, config, embed_tokens): + super().__init__() + self.config = config + self.embed_tokens = embed_tokens + self.embed_positions = Florence2LearnedPositionalEmbedding(config.max_position_embeddings, config.d_model) + self.layers = nn.LayerList([Florence2EncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layernorm_embedding = nn.LayerNorm(config.d_model) + self.dropout = config.dropout + + def forward( + self, + input_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + **kwargs, + ): + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + position_source = input_ids if input_ids is not None else paddle.zeros(inputs_embeds.shape[:2], dtype="int64") + hidden_states = self.layernorm_embedding(inputs_embeds + self.embed_positions(position_source)) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + expanded_mask = _expand_mask(attention_mask, hidden_states.dtype) if attention_mask is not None else None + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + for index, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + outputs = layer( + hidden_states, + expanded_mask, + head_mask[index] if head_mask is not None else None, + output_attentions, + ) + hidden_states = outputs[0] + if output_attentions: + all_attentions += (outputs[1],) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if not return_dict: + return tuple(x for x in [hidden_states, all_hidden_states, all_attentions] if x is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_attentions, + ) + + +class 
Florence2Decoder(nn.Layer): + def __init__(self, config, embed_tokens): + super().__init__() + self.config = config + self.embed_tokens = embed_tokens + self.embed_positions = Florence2LearnedPositionalEmbedding(config.max_position_embeddings, config.d_model) + self.layers = nn.LayerList([Florence2DecoderLayer(config) for _ in range(config.decoder_layers)]) + self.layernorm_embedding = nn.LayerNorm(config.d_model) + self.dropout = config.dropout + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + inputs_embeds=None, + use_cache=True, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + **kwargs, + ): + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + past_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + position_source = input_ids if input_ids is not None else paddle.zeros(inputs_embeds.shape[:2], dtype="int64") + hidden_states = self.layernorm_embedding(inputs_embeds + self.embed_positions(position_source, past_length)) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + causal_mask = _causal_mask(hidden_states.shape[0], hidden_states.shape[1], past_length, hidden_states.dtype) + if attention_mask is not None: + causal_mask = causal_mask + _expand_mask(attention_mask, hidden_states.dtype, hidden_states.shape[1]) + encoder_mask = ( + _expand_mask(encoder_attention_mask, hidden_states.dtype, hidden_states.shape[1]) + if encoder_attention_mask is not None + else None + ) + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + next_cache = () if use_cache else None + for index, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + outputs = 
layer( + hidden_states, + causal_mask, + encoder_hidden_states, + encoder_mask, + head_mask[index] if head_mask is not None else None, + cross_attn_head_mask[index] if cross_attn_head_mask is not None else None, + past_key_values[index] if past_key_values is not None else None, + output_attentions, + use_cache, + ) + hidden_states = outputs[0] + if use_cache: + next_cache += (outputs[3 if output_attentions else 1],) + if output_attentions: + all_self_attentions += (outputs[1],) + all_cross_attentions += (outputs[2],) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if not return_dict: + return tuple( + x + for x in [hidden_states, next_cache, all_hidden_states, all_self_attentions, all_cross_attentions] + if x is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class Florence2LanguageModel(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.shared = nn.Embedding(config.vocab_size, config.d_model, padding_idx=config.pad_token_id) + self.encoder = Florence2Encoder(config, self.shared) + self.decoder = Florence2Decoder(config, self.shared) + + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_output=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + **kwargs, + ): + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + input_ids, + self.config.pad_token_id, + self.config.decoder_start_token_id, + ) + encoder_outputs = encoder_outputs if encoder_outputs is not None else encoder_output + if encoder_outputs is None: + encoder_outputs = 
self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + encoder_hidden_states = ( + encoder_outputs.last_hidden_state if hasattr(encoder_outputs, "last_hidden_state") else encoder_outputs[0] + ) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + if not return_dict: + return decoder_outputs + encoder_outputs + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_hidden_states, + encoder_hidden_states=getattr(encoder_outputs, "hidden_states", None), + encoder_attentions=getattr(encoder_outputs, "attentions", None), + ) + + +class Florence2LanguagePretrainedModel(PretrainedModel): + config_class = Florence2LanguageConfig + base_model_prefix = "model" + transpose_weight_keys = ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"] + + +class Florence2LanguageForConditionalGeneration(Florence2LanguagePretrainedModel, GenerationMixin): + def __init__(self, config): + super().__init__(config) + self.is_encoder_decoder = True + self.model = Florence2LanguageModel(config) + self.register_buffer("final_logits_bias", paddle.zeros([1, config.vocab_size]), persistable=True) + + def get_encoder(self): + return self.model.encoder + + def get_decoder(self): + return self.model.decoder + + def 
get_input_embeddings(self): + return self.model.shared + + def set_input_embeddings(self, value): + self.model.shared = value + self.model.encoder.embed_tokens = value + self.model.decoder.embed_tokens = value + self.config.vocab_size = value.weight.shape[0] + self.final_logits_bias = paddle.zeros([1, value.weight.shape[0]], dtype=value.weight.dtype) + + def forward(self, labels=None, return_dict=True, **kwargs): + if labels is not None and kwargs.get("decoder_input_ids") is None: + kwargs["decoder_input_ids"] = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + outputs = self.model(return_dict=return_dict, **kwargs) + hidden_states = outputs.last_hidden_state if return_dict else outputs[0] + logits = paddle.matmul(hidden_states, self.model.shared.weight, transpose_y=True) + self.final_logits_bias + loss = None + if labels is not None: + loss = F.cross_entropy( + logits.reshape([-1, self.config.vocab_size]), + labels.reshape([-1]), + ignore_index=-100, + ) + if not return_dict: + return ((loss, logits) if loss is not None else (logits,)) + outputs[1:] + return Seq2SeqLMOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): + if past_key_values is not None: + input_ids = input_ids[:, -1:] + return { + "decoder_input_ids": input_ids, + "past_key_values": past_key_values, + "encoder_output": kwargs.get("encoder_output"), + "attention_mask": kwargs.get("attention_mask"), + "use_cache": kwargs.get("use_cache", True), + "return_dict": True, + } + + def _reorder_cache(self, past_key_values, beam_idx): 
+ return tuple( + tuple(paddle.index_select(state, beam_idx, axis=0) for state in layer) for layer in past_key_values + ) + + +class Florence2PretrainedModel(PretrainedModel): + config_class = Florence2Config + base_model_prefix = "" + _keys_to_ignore_on_load_missing = [ + r"language_model.model.encoder.embed_tokens.weight", + r"language_model.model.decoder.embed_tokens.weight", + ] + transpose_weight_keys = [ + "qkv", + "proj", + "q_proj", + "k_proj", + "v_proj", + "out_proj", + "fc1", + "fc2", + ] + + +class Florence2VisionModel(Florence2PretrainedModel): + main_input_name = "pixel_values" + + def __init__(self, config: Florence2VisionConfig): + super().__init__(config) + self.vision_tower = DaViT(config) + + def forward(self, pixel_values): + return self.vision_tower.forward_features_unpool(pixel_values) + + +class Florence2ForConditionalGeneration(Florence2PretrainedModel, GenerationMixin): + def __init__(self, config): + super().__init__(config) + self.is_encoder_decoder = True + self.vision_tower = DaViT(config.vision_config) + image_dim = config.vision_config.dim_embed[-1] + projection_dim = config.vision_config.projection_dim + self.image_projection = self.create_parameter([image_dim, projection_dim]) + self.image_proj_norm = nn.LayerNorm(projection_dim) + self.image_pos_embed = LearnedAbsolutePositionEmbedding2D( + image_dim, config.vision_config.image_pos_embed["max_pos_embeddings"] + ) + self.visual_temporal_embed = PositionalEmbeddingCosine1D( + image_dim, config.vision_config.visual_temporal_embedding["max_temporal_embeddings"] + ) + self.image_feature_source = config.vision_config.image_feature_source + self.language_model = Florence2LanguageForConditionalGeneration(config.text_config) + + def get_encoder(self): + return self.language_model.get_encoder() + + def get_decoder(self): + return self.language_model.get_decoder() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): 
+ self.language_model.set_input_embeddings(value) + self.config.vocab_size = value.weight.shape[0] + self.config.text_config.vocab_size = value.weight.shape[0] + + def _encode_image(self, pixel_values): + batch_size = pixel_values.shape[0] + x = self.vision_tower.forward_features_unpool(pixel_values) + num_tokens = x.shape[1] + height = width = int(num_tokens**0.5) + x = x.reshape([batch_size, height, width, x.shape[-1]]) + x = x + self.image_pos_embed(x) + x = x.reshape([batch_size, 1, height * width, x.shape[-1]]) + x = x + self.visual_temporal_embed(x[:, :, 0]).reshape([1, 1, 1, x.shape[-1]]) + features = { + "spatial_avg_pool": x.mean(axis=2), + "temporal_avg_pool": x.mean(axis=1), + "last_frame": x[:, -1], + } + x = paddle.concat([features[source] for source in self.image_feature_source], axis=1) + return self.image_proj_norm(paddle.matmul(x, self.image_projection)) + + def _merge_image_features(self, image_features, inputs_embeds, attention_mask=None): + image_mask = paddle.ones(image_features.shape[:2], dtype=inputs_embeds.dtype) + text_mask = ( + attention_mask.astype(inputs_embeds.dtype) + if attention_mask is not None + else paddle.ones(inputs_embeds.shape[:2], dtype=inputs_embeds.dtype) + ) + return paddle.concat([image_features, inputs_embeds], axis=1), paddle.concat([image_mask, text_mask], axis=1) + + def _split_sft_inputs(self, input_ids, labels, attention_mask): + source_rows, label_rows = [], [] + max_source = 1 + max_target = 1 + for row, label_row in zip(input_ids.tolist(), labels.tolist()): + target_start = next((index for index, value in enumerate(label_row) if value != -100), len(row)) + # PaddleFormers SFT labels are shifted left once, so the first + # supervised label predicts the token after this source position. 
+ source = row[: target_start + 1] or [self.config.bos_token_id] + target = [value for value in label_row[target_start:] if value != -100] + source_rows.append(source) + label_rows.append(target or [self.config.eos_token_id]) + max_source = max(max_source, len(source)) + max_target = max(max_target, len(target)) + source_ids = paddle.full([len(source_rows), max_source], self.config.pad_token_id, dtype=input_ids.dtype) + source_mask = paddle.zeros([len(source_rows), max_source], dtype="int64") + decoder_labels = paddle.full([len(label_rows), max_target], -100, dtype=labels.dtype) + for index, (source, target) in enumerate(zip(source_rows, label_rows)): + source_ids[index, : len(source)] = paddle.to_tensor(source, dtype=input_ids.dtype) + source_mask[index, : len(source)] = 1 + decoder_labels[index, : len(target)] = paddle.to_tensor(target, dtype=labels.dtype) + return source_ids, decoder_labels, source_mask + + def forward( + self, + input_ids=None, + pixel_values=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + encoder_output=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + **kwargs, + ): + if labels is not None and input_ids is not None and labels.shape == input_ids.shape: + input_ids, labels, attention_mask = self._split_sft_inputs(input_ids, labels, attention_mask) + image_features = None + if encoder_output is None and encoder_outputs is None and inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + if pixel_values is not None: + image_features = self._encode_image(pixel_values) + inputs_embeds, attention_mask = self._merge_image_features( + image_features, + inputs_embeds, + attention_mask, + ) + outputs = self.language_model( + input_ids=None if inputs_embeds is not None else input_ids, + attention_mask=attention_mask, + 
decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + encoder_output=encoder_output if encoder_output is not None else encoder_outputs, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + return outputs + + def generate(self, input_ids=None, pixel_values=None, inputs_embeds=None, attention_mask=None, **kwargs): + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + if pixel_values is not None: + inputs_embeds, attention_mask = self._merge_image_features( + self._encode_image(pixel_values), inputs_embeds, attention_mask + ) + return self.language_model.generate(inputs_embeds=inputs_embeds, attention_mask=attention_mask, **kwargs) diff --git a/paddleformers/transformers/florence2/processor.py b/paddleformers/transformers/florence2/processor.py new file mode 100644 index 00000000000..907455c7c35 --- /dev/null +++ b/paddleformers/transformers/florence2/processor.py @@ -0,0 +1,219 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); + +import re + +from ..image_processing_utils import BatchFeature +from ..image_utils import ImageInput +from ..processing_utils import ProcessorMixin +from ..tokenizer_utils_base import PreTokenizedInput, TextInput + +__all__ = ["Florence2Processor"] + + +class Florence2Processor(ProcessorMixin): + attributes = ["image_processor", "tokenizer"] + image_processor_class = "AutoImageProcessor" + tokenizer_class = ("BartTokenizer", "BartTokenizerFast") + + @classmethod + def _load_tokenizer_from_pretrained( + cls, + sub_processor_type, + pretrained_model_name_or_path, + subfolder="", + **kwargs, + ): + kwargs.setdefault("tokenizer_type", "bart") + return super()._load_tokenizer_from_pretrained( + sub_processor_type, pretrained_model_name_or_path, subfolder=subfolder, **kwargs + ) + + def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): + if image_processor is None or tokenizer is None: + raise ValueError("Florence2Processor requires both an image processor and a tokenizer.") + + tokens = ( + ["", "", "", ""] + + [f"" for index in range(1000)] + + [ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + ] + ) + tokenizer.add_special_tokens( + {"additional_special_tokens": list(getattr(tokenizer, "additional_special_tokens", [])) + tokens} + ) + self.image_seq_length = getattr(image_processor, "image_seq_length", 577) + self.task_prompts_without_inputs = { + "": "What is the text in the image?", + "": "What is the text in the image, with regions?", + "": "What does the image describe?", + "": "Describe in detail what is shown in the image.", + "": "Describe with a paragraph what is shown in the image.", + "": "Locate the objects with category name in the image.", + "": "Locate the objects in the image, with their descriptions.", + "": "Locate the region proposals in the image.", + } + 
self.task_prompts_with_input = { + "": "Locate the phrases in the caption: {input}", + "": "Locate {input} in the image with mask", + "": "What is the polygon mask of region {input}", + "": "Locate {input} in the image.", + "": "What is the region {input}?", + "": "What does the region {input} describe?", + "": "What text is in the region {input}?", + } + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def _construct_prompts(self, texts): + prompts = [] + for text in texts: + for task, prompt in self.task_prompts_without_inputs.items(): + if task in text: + if text != task: + raise ValueError(f"Task token {task} must be the only token in the prompt.") + text = prompt + break + for task, prompt in self.task_prompts_with_input.items(): + if task in text: + text = prompt.format(input=text.replace(task, "")) + break + prompts.append(text) + return prompts + + def __call__( + self, + images: ImageInput = None, + text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None, + return_tensors="pd", + padding=False, + truncation=None, + max_length=None, + **kwargs, + ): + if images is None: + raise ValueError("`images` must be provided to Florence2Processor.") + texts = text if isinstance(text, list) else [text or ""] + if isinstance(images, list) and len(images) < len(texts): + raise ValueError("Each Florence-2 prompt must have an associated image.") + + image_kwargs = { + key: value + for key, value in kwargs.items() + if key + in { + "do_resize", + "do_normalize", + "image_mean", + "image_std", + "data_format", + "input_data_format", + "resample", + "do_convert_rgb", + "do_rescale", + } + and value is not None + } + image_inputs = self.image_processor(images=images, return_tensors=return_tensors, **image_kwargs) + if max_length is not None: + max_length -= self.image_seq_length + text_inputs = self.tokenizer( + self._construct_prompts(texts), + return_tensors=return_tensors, + padding=padding, + 
truncation=truncation, + max_length=max_length, + return_token_type_ids=False, + ) + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + return list(dict.fromkeys(self.tokenizer.model_input_names + self.image_processor.model_input_names)) + + @staticmethod + def _dequantize(values, image_size): + width, height = image_size + return [ + (value + 0.5) * (width if index % 2 == 0 else height) / 1000 + for index, value in enumerate(values) + ] + + def post_process_generation(self, text, task, image_size): + clean_text = text.replace("", "").replace("", "").replace("", "") + if task in { + "", + "", + "", + "", + "", + "", + "", + }: + return {task: clean_text} + if task == "": + pattern = r"(.+?)" + "".join([r""] * 8) + matches = re.findall(pattern, clean_text) + return { + task: { + "quad_boxes": [ + self._dequantize([int(value) for value in match[1:]], image_size) for match in matches + ], + "labels": [match[0] for match in matches], + } + } + if task in {"", ""}: + polygons, labels = [], [] + pattern = r"([^<]*)(?:)?((?:|)+)(?:)?" 
+ for phrase, encoded_polygons in re.findall(pattern, clean_text): + instance = [] + for encoded_polygon in encoded_polygons.split(""): + values = [int(value) for value in re.findall(r"", encoded_polygon)] + if len(values) >= 6 and len(values) % 2 == 0: + instance.append(self._dequantize(values, image_size)) + if instance: + polygons.append(instance) + labels.append(phrase.strip()) + return {task: {"polygons": polygons, "labels": labels}} + if task == "": + values = [int(value) for value in re.findall(r"", clean_text)] + bboxes = [ + self._dequantize(values[index : index + 4], image_size) + for index in range(0, len(values) - 3, 4) + ] + return {task: {"bboxes": bboxes, "labels": [""] * len(bboxes)}} + + phrase_pattern = r"([^<]+)((?:){4,})" + box_pattern = r"" + bboxes, labels = [], [] + for phrase, encoded_boxes in re.findall(phrase_pattern, clean_text): + for box in re.findall(box_pattern, encoded_boxes): + bboxes.append(self._dequantize([int(value) for value in box], image_size)) + labels.append(phrase.strip()) + return {task: {"bboxes": bboxes, "labels": labels}} diff --git a/tests/transformers/florence2/__init__.py b/tests/transformers/florence2/__init__.py new file mode 100644 index 00000000000..8b137891791 --- /dev/null +++ b/tests/transformers/florence2/__init__.py @@ -0,0 +1 @@ + diff --git a/tests/transformers/florence2/test_modeling.py b/tests/transformers/florence2/test_modeling.py new file mode 100644 index 00000000000..3ba1138cdc5 --- /dev/null +++ b/tests/transformers/florence2/test_modeling.py @@ -0,0 +1,327 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); + +import tempfile +import unittest + +import numpy as np +import paddle + +from paddleformers.generation import BeamSearchScorer, LogitsProcessorList +from paddleformers.transformers import Florence2Config, Florence2ForConditionalGeneration +from paddleformers.transformers.model_outputs import BaseModelOutput +from tests.transformers.test_configuration_common import ConfigTester +from tests.transformers.test_generation_utils import GenerationTesterMixin +from tests.transformers.test_modeling_common import ( + ModelTesterMixin, + ModelTesterPretrainedMixin, + floats_tensor, + ids_tensor, +) + + +class Florence2ModelTester: + def __init__( + self, + parent, + batch_size=2, + seq_length=5, + decoder_seq_length=4, + image_size=32, + vocab_size=100, + hidden_size=32, + encoder_layers=2, + decoder_layers=2, + num_attention_heads=4, + is_training=False, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.decoder_seq_length = decoder_seq_length + self.image_size = image_size + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.encoder_layers = encoder_layers + self.decoder_layers = decoder_layers + self.num_attention_heads = num_attention_heads + self.num_hidden_layers = encoder_layers + self.expected_num_hidden_layers = encoder_layers + 1 + self.is_training = is_training + + self.image_feature_length = 2 + self.encoder_seq_length = seq_length + self.image_feature_length + self.decoder_key_length = decoder_seq_length + + def get_config(self): + return Florence2Config( + vision_config={ + "depths": [1, 1, 1, 1], + "dim_embed": [16, 32, 64, 128], + "num_heads": [2, 4, 8, 16], + "num_groups": [2, 4, 8, 16], + "window_size": 4, + "projection_dim": self.hidden_size, + "drop_path_rate": 0.0, + "image_feature_source": ["spatial_avg_pool", "temporal_avg_pool"], + }, + text_config={ + "vocab_size": self.vocab_size, + "d_model": self.hidden_size, + 
"encoder_layers": self.encoder_layers, + "decoder_layers": self.decoder_layers, + "encoder_attention_heads": self.num_attention_heads, + "decoder_attention_heads": self.num_attention_heads, + "encoder_ffn_dim": self.hidden_size * 2, + "decoder_ffn_dim": self.hidden_size * 2, + "max_position_embeddings": 128, + "dropout": 0.0, + "attention_dropout": 0.0, + "activation_dropout": 0.0, + "use_cache": True, + }, + projection_dim=self.hidden_size, + vocab_size=self.vocab_size, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + ) + + def prepare_config_and_inputs(self): + config = self.get_config() + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size, dtype=paddle.int64) + decoder_input_ids = ids_tensor( + [self.batch_size, self.decoder_seq_length], self.vocab_size, dtype=paddle.int64 + ) + labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size, dtype=paddle.int64) + attention_mask = paddle.ones([self.batch_size, self.seq_length], dtype="int64") + pixel_values = floats_tensor([self.batch_size, 3, self.image_size, self.image_size]) + return config, input_ids, attention_mask, decoder_input_ids, labels, pixel_values + + def prepare_config_and_inputs_for_common(self): + config, input_ids, attention_mask, decoder_input_ids, _, pixel_values = self.prepare_config_and_inputs() + inputs_dict = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "decoder_input_ids": decoder_input_ids, + "pixel_values": pixel_values, + "use_cache": False, + } + return config, inputs_dict + + def create_and_check_model(self, config, input_ids, attention_mask, decoder_input_ids, labels, pixel_values): + model = Florence2ForConditionalGeneration(config) + model.eval() + + with paddle.no_grad(): + outputs = model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + pixel_values=pixel_values, + use_cache=False, + ) + loss_outputs = model( + input_ids=input_ids, + attention_mask=attention_mask, 
+ pixel_values=pixel_values, + labels=labels, + use_cache=False, + ) + + self.parent.assertEqual( + list(outputs.logits.shape), [self.batch_size, self.decoder_seq_length, self.vocab_size] + ) + self.parent.assertEqual(loss_outputs.loss.ndim, 0) + + +class Florence2ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + base_model_class = Florence2ForConditionalGeneration + all_model_classes = (Florence2ForConditionalGeneration,) + all_generative_model_classes = {Florence2ForConditionalGeneration: (None, "florence2")} + is_encoder_decoder = True + has_attentions = False + test_mismatched_shapes = False + + def setUp(self): + self.model_tester = Florence2ModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=Florence2Config, + common_properties=["vocab_size"], + vocab_size=100, + projection_dim=32, + ) + + def test_config(self): + self.config_tester.create_and_test_config_common_properties() + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_classes() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + + def test_florence2_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_save_load(self): + super().test_save_load() + + def test_determinism(self): + super().test_determinism() + + def test_hidden_states_output(self): + super().test_hidden_states_output() + + def test_resize_tokens_embeddings(self): + super().test_resize_tokens_embeddings() + + def _get_generation_inputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + input_ids = inputs_dict["input_ids"][:1].clone() + attention_mask = 
inputs_dict["attention_mask"][:1].clone() + pixel_values = inputs_dict["pixel_values"][:1].clone() + return config, input_ids, attention_mask, pixel_values + + def test_greedy_generate(self): + config, input_ids, attention_mask, pixel_values = self._get_generation_inputs() + model = Florence2ForConditionalGeneration(config) + model.eval() + + with paddle.no_grad(): + generated = model.generate( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + max_new_tokens=3, + decode_strategy="greedy_search", + )[0] + + self.assertEqual(generated.shape[0], input_ids.shape[0]) + self.assertGreaterEqual(generated.shape[1], 1) + + def test_beam_search_generate(self): + config, input_ids, attention_mask, pixel_values = self._get_generation_inputs() + model = Florence2ForConditionalGeneration(config) + model.eval() + + with paddle.no_grad(): + inputs_embeds = model.get_input_embeddings()(input_ids) + image_features = model._encode_image(pixel_values) + inputs_embeds, encoder_attention_mask = model._merge_image_features( + image_features, inputs_embeds, attention_mask + ) + encoder_output = model.get_encoder()( + input_ids=None, + attention_mask=encoder_attention_mask, + inputs_embeds=inputs_embeds, + ).last_hidden_state + num_beams = 2 + decoder_input_ids = paddle.full( + [input_ids.shape[0] * num_beams, 1], + config.decoder_start_token_id, + dtype=input_ids.dtype, + ) + beam_scorer = BeamSearchScorer( + batch_size=input_ids.shape[0], + max_length=4, + num_beams=num_beams, + ) + generated = model.language_model.beam_search( + decoder_input_ids, + beam_scorer, + logits_processors=LogitsProcessorList(), + max_length=4, + diversity_rate=0.0, + pad_token_id=config.pad_token_id, + eos_token_id=config.eos_token_id, + encoder_output=BaseModelOutput( + last_hidden_state=encoder_output.repeat_interleave(num_beams, axis=0) + ), + attention_mask=encoder_attention_mask.repeat_interleave(num_beams, axis=0), + )[0] + + self.assertEqual(generated.shape[0], 
input_ids.shape[0]) + self.assertGreaterEqual(generated.shape[1], 1) + + def test_sample_generate(self): + config, input_ids, attention_mask, pixel_values = self._get_generation_inputs() + model = Florence2ForConditionalGeneration(config) + model.eval() + + paddle.seed(1234) + with paddle.no_grad(): + generated = model.generate( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + max_new_tokens=3, + decode_strategy="sampling", + top_k=10, + )[0] + + self.assertEqual(generated.shape[0], input_ids.shape[0]) + self.assertGreaterEqual(generated.shape[1], 1) + + def test_generate_without_input_ids(self): + # Florence2 needs either image-conditioned embeddings or explicit text ids. + pass + + def test_group_beam_search_generate(self): + # Group beam search coverage is not required for Florence2. + pass + + def test_paddleformers_sft_labels(self): + model = Florence2ForConditionalGeneration(self.model_tester.get_config()) + input_ids = paddle.to_tensor([[10, 11, 12, 20, 21, 2]]) + labels = paddle.to_tensor([[-100, -100, 20, 21, 2, -100]]) + source_ids, decoder_labels, source_mask = model._split_sft_inputs(input_ids, labels, None) + self.assertEqual(source_ids.tolist(), [[10, 11, 12]]) + self.assertEqual(decoder_labels.tolist(), [[20, 21, 2]]) + self.assertEqual(source_mask.tolist(), [[1, 1, 1]]) + + +class Florence2ModelIntegrationTest(ModelTesterPretrainedMixin, unittest.TestCase): + base_model_class = Florence2ForConditionalGeneration + hf_remote_test_model_path = None + paddlehub_remote_test_model_path = None + + @unittest.skip("Florence2 tiny pretrained checkpoint is not available yet.") + def test_model_from_pretrained_paddle_hub(self): + pass + + @unittest.skip("Florence2 tiny pretrained checkpoint is not available yet.") + def test_model_from_config_paddle_hub(self): + pass + + @unittest.skip("Florence2 tiny pretrained checkpoint is not available yet.") + def test_pretrained_save_and_load(self): + pass + + +class 
Florence2ModelLocalPretrainedTest(unittest.TestCase): + def test_local_save_load_consistency(self): + config, inputs_dict = Florence2ModelTester(self).prepare_config_and_inputs_for_common() + model = Florence2ForConditionalGeneration(config) + model.eval() + + with paddle.no_grad(): + expected = model(**inputs_dict).logits + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname, save_to_hf=False, save_checkpoint_format="") + loaded = Florence2ForConditionalGeneration.from_pretrained( + tmpdirname, convert_from_hf=False, load_checkpoint_format="" + ) + loaded.eval() + with paddle.no_grad(): + actual = loaded(**inputs_dict).logits + + self.assertLessEqual(np.max(np.abs(expected.numpy() - actual.numpy())), 1e-5) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/transformers/florence2/test_processor.py b/tests/transformers/florence2/test_processor.py new file mode 100644 index 00000000000..17cc4cfbfe9 --- /dev/null +++ b/tests/transformers/florence2/test_processor.py @@ -0,0 +1,61 @@ +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. 
#
# Licensed under the Apache License, Version 2.0 (the "License");

import unittest

from paddleformers.datasets.template.mm_plugin import get_mm_plugin
from paddleformers.transformers import Florence2Processor


class Florence2ProcessorTest(unittest.TestCase):
    """Checks prompt construction, output post-processing and SFT message handling.

    The processor under test is allocated without running ``__init__``; only the
    attributes the individual tests rely on are attached by hand.

    NOTE(review): several literals below are empty strings (task keys, generated
    text) where a non-empty token would normally be expected. They are kept
    exactly as found -- confirm them against the upstream test file.
    """

    def setUp(self):
        """Create a bare processor and attach the two prompt lookup tables."""
        bare_processor = object.__new__(Florence2Processor)
        bare_processor.task_prompts_without_inputs = {"": "What does the image describe?"}
        bare_processor.task_prompts_with_input = {"": "What does the region {input} describe?"}
        self.processor = bare_processor

    def test_construct_prompts(self):
        """``_construct_prompts`` maps the first entry to the caption prompt."""
        requested = ["", ""]
        built = self.processor._construct_prompts(requested)

        self.assertEqual(built[0], "What does the image describe?")
        self.assertIn("", built[1])

    def test_post_process_detection(self):
        """Detection output exposes one label and a four-value bounding box."""
        parsed = self.processor.post_process_generation("cat", "", (100, 200))
        task_entry = parsed[""]

        self.assertEqual(task_entry["labels"], ["cat"])
        self.assertEqual(len(task_entry["bboxes"][0]), 4)

    def test_post_process_segmentation(self):
        """Segmentation output exposes one label and a six-value polygon."""
        parsed = self.processor.post_process_generation("cat", "", (100, 200))
        task_entry = parsed[""]

        self.assertEqual(task_entry["labels"], ["cat"])
        self.assertEqual(len(task_entry["polygons"][0][0]), 6)

    def test_post_process_region_proposal(self):
        """Region-proposal output contains exactly one bounding box."""
        parsed = self.processor.post_process_generation("", "", (100, 200))
        proposed_boxes = parsed[""]["bboxes"]

        self.assertEqual(len(proposed_boxes), 1)

    def test_sft_message_format(self):
        """The ``florence2`` plugin rewrites the user turn and rejects image-less input."""
        # A placeholder object is enough here; the test only needs the attribute to exist.
        self.processor.image_processor = object()
        mm_plugin = get_mm_plugin("florence2", image_token="")
        conversation = [
            {"role": "user", "content": ""},
            {"role": "assistant", "content": "A solid color image."},
        ]

        rendered = mm_plugin.process_messages(conversation, ["image.jpg"], [], [], {}, self.processor)

        expected_contents = ("What does the image describe?", "A solid color image.")
        for position, expected in enumerate(expected_contents):
            self.assertEqual(rendered[position]["content"], expected)

        # Calling with no messages and no images must be rejected.
        with self.assertRaisesRegex(ValueError, "exactly one image"):
            mm_plugin.process_messages([], [], [], [], {}, self.processor)


if __name__ == "__main__":
    unittest.main()