diff --git a/.github/workflows/model-unittest-gpu-ce-develop.yml b/.github/workflows/model-unittest-gpu-ce-develop.yml
index 8bdf6fcd7ce..a8f4e4b595e 100644
--- a/.github/workflows/model-unittest-gpu-ce-develop.yml
+++ b/.github/workflows/model-unittest-gpu-ce-develop.yml
@@ -25,6 +25,7 @@ on:
           - 'qwen2'
           - 'gemma3_text'
           - 'paddleocr_vl'
+          - 'florence2'
       FLAGS_enable_CE:
         required: false
         default: 'CE_Develop_cu130_py312'
@@ -455,4 +456,4 @@ jobs:
           echo "| Workflow | ${{ github.workflow }} |" >> $GITHUB_STEP_SUMMARY
           echo "| CE Mode | $MODE_$FLAGS_enable_CE |" >> $GITHUB_STEP_SUMMARY
           echo "| Time | $(date +%Y%m%d) |" >> $GITHUB_STEP_SUMMARY
-          echo "| Report | [Open Report](${PAGES_URL}) |" >> $GITHUB_STEP_SUMMARY
\ No newline at end of file
+          echo "| Report | [Open Report](${PAGES_URL}) |" >> $GITHUB_STEP_SUMMARY
diff --git a/.github/workflows/model-unittest-gpu-ce-release.yml b/.github/workflows/model-unittest-gpu-ce-release.yml
index 96f5d597efe..d3ff170cf84 100644
--- a/.github/workflows/model-unittest-gpu-ce-release.yml
+++ b/.github/workflows/model-unittest-gpu-ce-release.yml
@@ -30,6 +30,7 @@ on:
           - 'qwen2'
           - 'gemma3_text'
           - 'paddleocr_vl'
+          - 'florence2'
       FLAGS_enable_CE:
         required: false
         default: 'CE_Release_cu129_py312_nightly'
@@ -483,4 +484,4 @@ jobs:
           echo "| Workflow | ${{ github.workflow }} |" >> $GITHUB_STEP_SUMMARY
           echo "| CE Mode | $MODE_$FLAGS_enable_CE |" >> $GITHUB_STEP_SUMMARY
           echo "| Time | $(date +%Y%m%d) |" >> $GITHUB_STEP_SUMMARY
-          echo "| Report | [Open Report](${PAGES_URL}) |" >> $GITHUB_STEP_SUMMARY
\ No newline at end of file
+          echo "| Report | [Open Report](${PAGES_URL}) |" >> $GITHUB_STEP_SUMMARY
diff --git a/.github/workflows/model-unittest-gpu.yml b/.github/workflows/model-unittest-gpu.yml
index a736c480446..4d1fdf281bb 100644
--- a/.github/workflows/model-unittest-gpu.yml
+++ b/.github/workflows/model-unittest-gpu.yml
@@ -23,6 +23,7 @@ on:
           - 'qwen2'
           - 'gemma3_text'
           - 'paddleocr_vl'
+          - 'florence2'
           - 'qwen2_moe'
           - 'qwen3_vl'
           - 'qwen3_vl_moe'
@@ -488,4 +489,4 @@ jobs:
           else:
               res = gh("POST", f"{base}/issues/{pr_number}/comments", {"body": comment})
               print(f"Created comment: {res.get('html_url')}")
-          PYEOF
\ No newline at end of file
+          PYEOF
diff --git a/examples/config/sft-vl/florence2_full_300_steps.yaml b/examples/config/sft-vl/florence2_full_300_steps.yaml
new file mode 100644
index 00000000000..47325e580a2
--- /dev/null
+++ b/examples/config/sft-vl/florence2_full_300_steps.yaml
@@ -0,0 +1,48 @@
+### data
+# JSONL example:
+# {"messages":[{"role":"user","content":"<image><CAPTION>"},{"role":"assistant","content":"A cat."}],
+#  "images":["/path/to/image.jpg"]}
+train_dataset_type: messages
+eval_dataset_type: messages
+train_dataset_path: ./florence2_train.jsonl
+train_dataset_prob: "1.0"
+eval_dataset_path: ./florence2_train.jsonl
+eval_dataset_prob: "1.0"
+max_seq_len: 1024
+packing: false
+mix_strategy: concat
+template_backend: custom
+template: florence2
+
+### model
+model_name_or_path: microsoft/Florence-2-base  # Hub model ID, or the path of a local checkpoint directory
+continue_training: true
+
+### finetuning
+stage: VL-SFT
+fine_tuning: full
+seed: 23
+do_train: true
+do_eval: false
+per_device_train_batch_size: 1
+max_steps: 300
+save_strategy: "no"
+logging_steps: 1
+gradient_accumulation_steps: 1
+output_dir: ./checkpoints/florence2-sft-full
+disable_tqdm: true
+
+### train
+warmup_steps: 0
+learning_rate: 1.0e-5
+
+### performance
+tensor_model_parallel_size: 1
+pipeline_model_parallel_size: 1
+bf16: true
+fp16: false
+fp16_opt_level: O1
+convert_from_hf: true
+unified_checkpoint: false
+save_checkpoint_format: "flex_checkpoint"
+load_checkpoint_format: ""
diff --git a/paddleformers/datasets/template/mm_plugin.py b/paddleformers/datasets/template/mm_plugin.py
index 3b5c124237f..89c83ddbd0e 100644
--- a/paddleformers/datasets/template/mm_plugin.py
+++ b/paddleformers/datasets/template/mm_plugin.py
@@ -392,6 +392,34 @@ def get_mm_inputs(
         return self._get_mm_inputs(images, videos, audios, processor, **kwargs)
 
 
+@dataclass
+class Florence2Plugin(BasePlugin):
+    """Multimodal plugin for Florence-2: image-only inputs, exactly one image per sample."""
+
+    @override
+    def process_messages(self, messages, images, videos, audios, mm_inputs, processor):
+        """Return a deep copy of ``messages`` with the image placeholder stripped from every turn.
+
+        User turns are additionally expanded through ``processor._construct_prompts``
+        when the processor provides that method. Raises ``ValueError`` when video or
+        audio inputs are present, or when the number of images is not exactly one.
+        """
+        self._validate_input(processor, images, videos, audios)
+        self._validate_messages(messages, images, videos, audios)
+        if videos or audios:
+            raise ValueError("Florence-2 only supports image inputs.")
+        if len(images) != 1:
+            raise ValueError("Florence-2 supports exactly one image per prompt.")
+
+        rewritten = deepcopy(messages)
+        for turn in rewritten:
+            text = turn["content"].replace(IMAGE_PLACEHOLDER, "").strip()
+            if turn["role"] == "user" and hasattr(processor, "_construct_prompts"):
+                text = processor._construct_prompts([text])[0]
+            turn["content"] = text
+        return rewritten
+
+
 @dataclass
 class PaddleOCRVLPlugin(BasePlugin):
     image_bos_token: str = "<|IMAGE_START|>"
@@ -1496,6 +1524,7 @@ def process_messages(
 
 PLUGINS = {
     "base": BasePlugin,
+    "florence2": Florence2Plugin,
     "ernie_vl": ErnieVLPlugin,
     "qwen2_vl": Qwen2VLPlugin,
     "paddleocr_vl": PaddleOCRVLPlugin,
diff --git a/paddleformers/datasets/template/template.py b/paddleformers/datasets/template/template.py
index 6d0d42cd39c..1abae2fd8f7 100644
--- a/paddleformers/datasets/template/template.py
+++ b/paddleformers/datasets/template/template.py
@@ -648,6 +648,16 @@ def get_template_and_fix_tokenizer(dataset_config) -> "Template":
     mm_plugin=get_mm_plugin(name="paddleocr_vl", image_token="<|IMAGE_PLACEHOLDER|>"),
 )
 
+register_template(
+    name="florence2",  # every formatter below emits the bare content: no role markers are wrapped around a turn
+    format_user=StringFormatter(slots=["{{content}}"]),
+    format_assistant=StringFormatter(slots=["{{content}}"]),
+    format_system=StringFormatter(slots=["{{content}}"]),
+    format_prefix=EmptyFormatter(slots=[]),
+    suffix=["</s>"],
+    mm_plugin=get_mm_plugin(name="florence2", image_token="<image>"),
+)
+
 # copied from chatml template
 register_template(
     name="qwen",
diff --git a/paddleformers/transformers/__init__.py b/paddleformers/transformers/__init__.py
index ae1c4f20c73..aff47efddf1 100644
--- a/paddleformers/transformers/__init__.py
+++ b/paddleformers/transformers/__init__.py
@@ -257,6 +257,14 @@
     "qwen2_vl.processor": ["Qwen2VLProcessor"],
     "qwen2_vl.video_processor": ["Qwen2VLVideoProcessor"],
     "qwen2_vl.vision_process": ["process_vision_info"],
+    "florence2.configuration": ["Florence2Config", "Florence2LanguageConfig", "Florence2VisionConfig"],
+    "florence2.image_processor": ["Florence2ImageProcessor"],
+    "florence2.modeling": [
+        "Florence2ForConditionalGeneration",
+        "Florence2LanguageForConditionalGeneration",
+        "Florence2VisionModel",
+    ],
+    "florence2.processor": ["Florence2Processor"],
     "qwen3.configuration": ["Qwen3Config"],
     "qwen3.modeling": [
         "Qwen3Model",
@@ -290,6 +298,7 @@
     "llama": [],
     "qwen2": [],
     "glm_ocr": [],
+    "florence2": [],
     "qwen3": [],
     "deepseek_v3": [],
     "ernie4_5": ["Ernie4_5DecoderLayer", "Ernie4_5Model", "Ernie4_5_ForCausalLM"],
diff --git a/paddleformers/transformers/auto/configuration.py b/paddleformers/transformers/auto/configuration.py
index c04e1f34a5a..bb0a6385446 100644
--- a/paddleformers/transformers/auto/configuration.py
+++ b/paddleformers/transformers/auto/configuration.py
@@ -59,6 +59,7 @@
         ("gpt_oss", "GptOssConfig"),
         ("phi3", "Phi3Config"),
         ("gemma3_text", "Gemma3TextConfig"),
+        ("florence2", "Florence2Config"),
         ("glm4v_moe", "Glm4vMoeConfig"),
         ("glm_ocr", "GlmOcrConfig"),
         ("qwen3_5", "Qwen3_5Config"),
@@ -88,6 +89,7 @@
         ("qwen3_vl_text", "Qwen3VL"),
         ("qwen3_vl_moe", "Qwen3VLMoe"),
         ("qwen3_vl_moe_text", "Qwen3VLMoeText"),
+        ("florence2", "Florence2ForConditionalGeneration"),
         ("glm_ocr", "GlmOcrForConditionalGeneration"),
         ("qwen3_5_moe", "Qwen3_5MoEForConditionalGeneration"),
         ("qwen3_5", "Qwen3_5ForConditionalGeneration"),
diff --git a/paddleformers/transformers/auto/image_processing.py b/paddleformers/transformers/auto/image_processing.py
index 4244259c0e1..3a8b137e85d 100644
--- a/paddleformers/transformers/auto/image_processing.py
+++ b/paddleformers/transformers/auto/image_processing.py
@@ -55,6 +55,7 @@
         "glm4v_moe": ("Glm4vImageProcessor", "Glm4vImageProcessorFast"),
         "kimi_k25": ("KimiK25VisionProcessor"),
         "paddleocr_vl": ("PaddleOCRVLImageProcessor"),
+        "florence2": ("Florence2ImageProcessor"),
         "qwen2_5_vl": ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast"),
         "qwen2_vl": ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast"),
         "qwen3_vl": ("Qwen3VLImageProcessor", "Qwen3VLImageProcessorFast"),
diff --git a/paddleformers/transformers/auto/modeling.py b/paddleformers/transformers/auto/modeling.py
index 11321baba1f..ed1bbb10bad 100644
--- a/paddleformers/transformers/auto/modeling.py
+++ b/paddleformers/transformers/auto/modeling.py
@@ -78,6 +78,7 @@
         ("GptOss", "gpt_oss"),
         ("Phi3", "phi3"),
         ("Gemma3", "gemma3_text"),
+        ("Florence2", "florence2"),
         ("Glm4vMoe", "glm4v_moe"),
         ("GlmOcr", "glm_ocr"),
     ]
diff --git a/paddleformers/transformers/auto/processing.py b/paddleformers/transformers/auto/processing.py
index bca898e350d..d8e401c7232 100644
--- a/paddleformers/transformers/auto/processing.py
+++ b/paddleformers/transformers/auto/processing.py
@@ -54,6 +54,7 @@
         ("qwen2_vl", "Qwen2VLProcessor"),
         ("qwen3_omni_moe", "Qwen3OmniMoeProcessor"),
         ("paddleocr_vl", "PaddleOCRVLProcessor"),
+        ("florence2", "Florence2Processor"),
         ("ernie4_5_moe_vl", "Ernie4_5_VLProcessor"),
         ("glm4v_moe", "Glm4vProcessor"),
         ("glm_ocr", "Glm46VProcessor"),
diff --git a/paddleformers/transformers/florence2/__init__.py b/paddleformers/transformers/florence2/__init__.py
new file mode 100644
index 00000000000..48ad2e78dda
--- /dev/null
+++ b/paddleformers/transformers/florence2/__init__.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+from typing import TYPE_CHECKING
+
+from ...utils.lazy_import import _LazyModule
+
+# Public names exported by each submodule; handed to `_LazyModule` below.
+import_structure = {
+    "configuration": ["Florence2Config", "Florence2LanguageConfig", "Florence2VisionConfig"],
+    "image_processor": ["Florence2ImageProcessor"],
+    "modeling": [
+        "Florence2ForConditionalGeneration",
+        "Florence2LanguageForConditionalGeneration",
+        "Florence2VisionModel",
+    ],
+    "processor": ["Florence2Processor"],
+}
+
+if TYPE_CHECKING:
+    from .configuration import *
+    from .image_processor import *
+    from .modeling import *
+    from .processor import *
+else:
+    # Replace this package's `sys.modules` entry with a `_LazyModule` built from `import_structure`.
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], import_structure, module_spec=__spec__)
diff --git a/paddleformers/transformers/florence2/configuration.py b/paddleformers/transformers/florence2/configuration.py
new file mode 100644
index 00000000000..4e983636b95
--- /dev/null
+++ b/paddleformers/transformers/florence2/configuration.py
@@ -0,0 +1,167 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..configuration_utils import PretrainedConfig
+
+__all__ = ["Florence2Config", "Florence2LanguageConfig", "Florence2VisionConfig"]
+
+
+class Florence2VisionConfig(PretrainedConfig):
+    """Settings for the DaViT image encoder of Florence-2 (stored under the ``vision_config`` key)."""
+
+    model_type = "davit"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        drop_path_rate=0.1,
+        patch_size=(7, 3, 3, 3),
+        patch_stride=(4, 2, 2, 2),
+        patch_padding=(3, 1, 1, 1),
+        patch_prenorm=(False, True, True, True),
+        enable_checkpoint=False,
+        dim_embed=(128, 256, 512, 1024),
+        num_heads=(4, 8, 16, 32),
+        num_groups=(4, 8, 16, 32),
+        depths=(1, 1, 9, 1),
+        window_size=12,
+        projection_dim=768,
+        visual_temporal_embedding=None,
+        image_pos_embed=None,
+        image_feature_source=("spatial_avg_pool", "temporal_avg_pool"),
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.drop_path_rate = drop_path_rate
+        self.patch_size = list(patch_size)  # sequence arguments are copied into plain lists
+        self.patch_stride = list(patch_stride)
+        self.patch_padding = list(patch_padding)
+        self.patch_prenorm = list(patch_prenorm)
+        self.enable_checkpoint = enable_checkpoint
+        self.dim_embed = list(dim_embed)
+        self.num_heads = list(num_heads)
+        self.num_groups = list(num_groups)
+        self.depths = list(depths)
+        self.window_size = window_size
+        self.projection_dim = projection_dim
+        temporal_default = {"type": "COSINE", "max_temporal_embeddings": 100}
+        self.visual_temporal_embedding = visual_temporal_embedding or temporal_default
+        self.image_pos_embed = image_pos_embed or {"type": "learned_abs_2d", "max_pos_embeddings": 50}
+        self.image_feature_source = list(image_feature_source)
+
+
+class Florence2LanguageConfig(PretrainedConfig):  # settings of the encoder-decoder language model ("text_config")
+    model_type = "florence2_language"
+    base_config_key = "text_config"
+    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}  # generic aliases
+
+    def __init__(
+        self,
+        vocab_size=51289,
+        max_position_embeddings=1024,
+        encoder_layers=6,
+        encoder_ffn_dim=3072,
+        encoder_attention_heads=12,
+        decoder_layers=6,
+        decoder_ffn_dim=3072,
+        decoder_attention_heads=12,
+        encoder_layerdrop=0.0,
+        decoder_layerdrop=0.0,
+        activation_function="gelu",
+        d_model=768,
+        dropout=0.1,
+        attention_dropout=0.1,
+        activation_dropout=0.1,
+        init_std=0.02,
+        classifier_dropout=0.0,
+        scale_embedding=False,
+        use_cache=True,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        decoder_start_token_id=2,
+        forced_eos_token_id=2,
+        is_encoder_decoder=True,
+        **kwargs,
+    ):
+        super().__init__(  # special-token ids and the encoder-decoder flag are handled by the base class
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            decoder_start_token_id=decoder_start_token_id,
+            forced_eos_token_id=forced_eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            **kwargs,
+        )
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.encoder_layers = encoder_layers
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_layers = decoder_layers
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_attention_heads = decoder_attention_heads
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.activation_function = activation_function
+        self.d_model = d_model
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.init_std = init_std
+        self.classifier_dropout = classifier_dropout
+        self.scale_embedding = scale_embedding
+        self.use_cache = use_cache
+        self.num_hidden_layers = encoder_layers  # mirrors the encoder depth; decoder_layers is not consulted
+
+
+class Florence2Config(PretrainedConfig):  # composite config: one vision sub-config plus one text sub-config
+    model_type = "florence2"
+    tokenizer_class = "BartTokenizer"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    sub_configs = {"vision_config": Florence2VisionConfig, "text_config": Florence2LanguageConfig}
+
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        ignore_index=-100,
+        vocab_size=51289,
+        projection_dim=768,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        is_encoder_decoder=True,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            **kwargs,
+        )
+        if isinstance(vision_config, dict):  # plain dicts are promoted to config objects
+            vision_config = Florence2VisionConfig(**vision_config)
+        self.vision_config = vision_config or Florence2VisionConfig()  # a missing sub-config falls back to defaults
+        if isinstance(text_config, dict):
+            text_config = Florence2LanguageConfig(**text_config)
+        self.text_config = text_config or Florence2LanguageConfig()
+        self.ignore_index = ignore_index
+        self.vocab_size = vocab_size
+        self.projection_dim = projection_dim
+        self.tokenizer_class = "BartTokenizer"
+        self.decoder_start_token_id = self.text_config.decoder_start_token_id  # copied from the text sub-config
+        self.forced_eos_token_id = self.text_config.forced_eos_token_id
+        self.use_cache = self.text_config.use_cache
diff --git a/paddleformers/transformers/florence2/image_processor.py b/paddleformers/transformers/florence2/image_processor.py
new file mode 100644
index 00000000000..99420932fdc
--- /dev/null
+++ b/paddleformers/transformers/florence2/image_processor.py
@@ -0,0 +1,22 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import transformers as hf
+
+from ..image_processing_utils import warp_base_image_processor
+
+# Florence-2 uses the Hugging Face CLIP image processor, wrapped via `warp_base_image_processor`.
+Florence2ImageProcessor = warp_base_image_processor(hf.CLIPImageProcessor)
+
+__all__ = ["Florence2ImageProcessor"]
diff --git a/paddleformers/transformers/florence2/modeling.py b/paddleformers/transformers/florence2/modeling.py
new file mode 100644
index 00000000000..bfe4b537f42
--- /dev/null
+++ b/paddleformers/transformers/florence2/modeling.py
@@ -0,0 +1,939 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+
+import math
+from typing import Optional
+
+import paddle
+import paddle.nn.functional as F
+from paddle import nn
+
+from ...generation import GenerationMixin
+from ..activations import ACT2FN
+from ..model_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    Seq2SeqLMOutput,
+    Seq2SeqModelOutput,
+)
+from ..model_utils import PretrainedModel
+from .configuration import Florence2Config, Florence2LanguageConfig, Florence2VisionConfig
+
+__all__ = [
+    "Florence2ForConditionalGeneration",
+    "Florence2LanguageForConditionalGeneration",
+    "Florence2VisionModel",
+]
+
+
+def _expand_mask(mask, dtype, target_length=None):  # [batch, src] keep-mask -> additive [batch, 1, tgt, src] mask
+    batch, src_len = mask.shape[0], mask.shape[-1]
+    keep = mask[:, None, None, :].expand([batch, 1, target_length or src_len, src_len]).astype(dtype)
+    return paddle.where(keep > 0, paddle.zeros_like(keep), paddle.full_like(keep, paddle.finfo(dtype).min))
+
+
+def _causal_mask(batch_size, target_length, past_length, dtype):
+    """Build an additive causal mask [batch, 1, target, past + target]: 0 if visible, else finfo(dtype).min."""
+    total_length = target_length + past_length
+    query_pos = paddle.arange(target_length)[:, None] + past_length
+    key_pos = paddle.arange(total_length)[None, :]
+    visible = paddle.zeros([target_length, total_length], dtype=dtype)
+    blocked = paddle.full([target_length, total_length], paddle.finfo(dtype).min, dtype=dtype)
+    # A query at absolute position p may attend to every key at position <= p.
+    mask = paddle.where(key_pos <= query_pos, visible, blocked)
+    return mask[None, None, :, :].expand([batch_size, 1, target_length, total_length])
+
+
+def shift_tokens_right(input_ids, pad_token_id, decoder_start_token_id):
+    """Shift ids right by one, put ``decoder_start_token_id`` first and replace -100 with ``pad_token_id``."""
+    shifted = paddle.zeros_like(input_ids)
+    shifted[:, 0], shifted[:, 1:] = decoder_start_token_id, input_ids[:, :-1].clone()
+    return paddle.where(shifted == -100, paddle.full_like(shifted, pad_token_id), shifted)
+
+
+class DropPath(nn.Layer):
+    """Stochastic depth: in training, zero whole samples with probability ``drop_prob`` and rescale the rest."""
+
+    def __init__(self, drop_prob=0.0):
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, x):
+        if not self.training or self.drop_prob == 0.0:
+            return x
+        survive = 1.0 - self.drop_prob
+        return x / survive * paddle.floor(survive + paddle.rand([x.shape[0]] + [1] * (x.ndim - 1), dtype=x.dtype))
+
+
+class LearnedAbsolutePositionEmbedding2D(nn.Layer):
+    """Learned 2D position embedding built from separate row and column lookup tables."""
+
+    def __init__(self, embedding_dim=256, num_pos=50):
+        super().__init__()
+        # Rows get floor(embedding_dim / 2) channels and columns the remainder, so the two sum to embedding_dim.
+        row_dim = embedding_dim // 2
+        self.row_embeddings = nn.Embedding(num_pos, row_dim)
+        self.column_embeddings = nn.Embedding(num_pos, embedding_dim - row_dim)
+
+    def forward(self, pixel_values):
+        # Axes 1 and 2 of the input are read as (height, width); only its shape is used.
+        batch_size, height, width = pixel_values.shape[:3]
+        col_grid = self.column_embeddings(paddle.arange(width)).unsqueeze(0).tile([height, 1, 1])
+        row_grid = self.row_embeddings(paddle.arange(height)).unsqueeze(1).tile([1, width, 1])
+        # Column channels come first, then row channels: the result is [batch, height, width, embedding_dim].
+        grid = paddle.concat([col_grid, row_grid], axis=-1)
+        return grid.unsqueeze(0).tile([batch_size, 1, 1, 1])
+
+
+class PositionalEmbeddingCosine1D(nn.Layer):  # sin/cos position table kept as a buffer, not a parameter
+    def __init__(self, embed_dim=512, max_seq_len=1024):
+        super().__init__()
+        denominator = paddle.exp(-math.log(10000) * paddle.arange(0, embed_dim, 2, dtype="float32") / embed_dim)
+        frequencies = paddle.arange(max_seq_len, dtype="float32").reshape([max_seq_len, 1]) * denominator
+        values = paddle.zeros([max_seq_len, embed_dim])  # one row per position
+        values[:, 0::2] = paddle.sin(frequencies)  # even channels: sine
+        values[:, 1::2] = paddle.cos(frequencies)  # odd channels: cosine
+        self.register_buffer("pos_idx_to_embed", values, persistable=True)
+
+    def forward(self, seq_embeds):  # only the shape of seq_embeds is used
+        values = self.pos_idx_to_embed[: seq_embeds.shape[-2]]  # first shape[-2] rows of the table
+        return values.unsqueeze(0) if seq_embeds.ndim == 3 else values  # leading axis added for 3-D input
+
+
+class PreNorm(nn.Layer):
+    """Residual wrapper computing ``x + drop_path(fn(norm(x)))``; ``norm`` and ``drop_path`` are optional."""
+
+    def __init__(self, norm, fn, drop_path=None):
+        super().__init__()
+        self.norm, self.fn, self.drop_path = norm, fn, drop_path
+
+    def forward(self, x, *args):
+        normed = x if self.norm is None else self.norm(x)
+        branch, size = self.fn(normed, *args)
+        if self.drop_path is not None:
+            branch = self.drop_path(branch)
+        return x + branch, size
+
+
+class MlpNet(nn.Layer):  # two-layer feed-forward network: Linear -> GELU -> Linear
+    def __init__(self, in_features, hidden_features):
+        super().__init__()
+        self.fc1 = nn.Linear(in_features, hidden_features)  # expand to the hidden width
+        self.act = nn.GELU()
+        self.fc2 = nn.Linear(hidden_features, in_features)  # project back to the input width
+
+    def forward(self, x):
+        return self.fc2(self.act(self.fc1(x)))
+
+
+class Mlp(nn.Layer):  # wraps MlpNet so it takes and returns a (tokens, size) pair
+    def __init__(self, in_features, hidden_features):
+        super().__init__()
+        self.net = MlpNet(in_features, hidden_features)
+
+    def forward(self, x, size):
+        return self.net(x), size  # `size` is passed through unchanged
+
+
+class DepthWiseConv2d(nn.Layer):  # grouped conv (groups == channels) over tokens laid out on a 2D grid
+    def __init__(self, dim_in, kernel_size, padding, stride):
+        super().__init__()
+        self.dw = nn.Conv2D(dim_in, dim_in, kernel_size, stride=stride, padding=padding, groups=dim_in)
+
+    def forward(self, x, size):
+        batch_size, _, channels = x.shape  # x: [batch, tokens, channels]
+        height, width = size  # grid the tokens are laid out on
+        x = self.dw(x.transpose([0, 2, 1]).reshape([batch_size, channels, height, width]))
+        size = (x.shape[-2], x.shape[-1])  # re-read the grid size from the conv output
+        return x.flatten(2).transpose([0, 2, 1]), size  # back to [batch, tokens, channels]
+
+
+class ConvEmbed(nn.Layer):  # conv patch embedding with a LayerNorm before or after the projection
+    def __init__(self, patch_size, in_chans, embed_dim, stride, padding, pre_norm):
+        super().__init__()
+        self.proj = nn.Conv2D(in_chans, embed_dim, patch_size, stride=stride, padding=padding)
+        self.norm = nn.LayerNorm(in_chans if pre_norm else embed_dim)  # sized for the side it is applied on
+        self.pre_norm = pre_norm
+
+    def forward(self, x, size):
+        height, width = size
+        if x.ndim == 3:  # token sequence [batch, tokens, channels]; other inputs go straight to the conv
+            if self.pre_norm:  # NOTE: not reached for non-3-D input, so no norm is applied there
+                x = self.norm(x)
+            x = x.reshape([x.shape[0], height, width, x.shape[-1]]).transpose([0, 3, 1, 2])
+        x = self.proj(x)
+        height, width = x.shape[-2:]  # grid size after the convolution
+        x = x.flatten(2).transpose([0, 2, 1])  # back to [batch, tokens, channels]
+        if not self.pre_norm:
+            x = self.norm(x)
+        return x, (height, width)
+
+
+class ChannelAttention(nn.Layer):  # attention across channels: the score matrix is [head_dim, head_dim] per group
+    def __init__(self, dim, groups):
+        super().__init__()
+        self.groups = groups
+        self.qkv = nn.Linear(dim, dim * 3)
+        self.proj = nn.Linear(dim, dim)
+
+    def forward(self, x, size):
+        batch_size, num_tokens, channels = x.shape
+        qkv = self.qkv(x).reshape([batch_size, num_tokens, 3, self.groups, channels // self.groups])
+        q, k, v = paddle.unbind(qkv.transpose([2, 0, 3, 1, 4]), axis=0)  # each: [batch, groups, tokens, head_dim]
+        attention = paddle.matmul(
+            (q * (float(num_tokens) ** -0.5)).transpose([0, 1, 3, 2]),  # scaled by tokens**-0.5, not head_dim
+            k,
+        )
+        attention = F.softmax(attention, axis=-1)
+        x = paddle.matmul(attention, v.transpose([0, 1, 3, 2])).transpose([0, 1, 3, 2])
+        x = self.proj(x.transpose([0, 2, 1, 3]).reshape([batch_size, num_tokens, channels]))
+        return x, size  # `size` is passed through unchanged
+
+
+def window_partition(x, window_size):  # [B, H, W, C] -> [B * num_windows, window_size, window_size, C]
+    batch_size, height, width, channels = x.shape
+    tiled = [batch_size, height // window_size, window_size, width // window_size, window_size, channels]
+    return x.reshape(tiled).transpose([0, 1, 3, 2, 4, 5]).reshape([-1, window_size, window_size, channels])
+
+
+def window_reverse(windows, batch_size, window_size, height, width):
+    """Stitch [num_windows, window_size, window_size, C] windows back into a [batch, height, width, C] map."""
+    channels = windows.shape[-1]
+    x = windows.reshape([batch_size, height // window_size, width // window_size, window_size, window_size, channels])
+    return x.transpose([0, 1, 3, 2, 4, 5]).reshape([batch_size, height, width, channels])
+
+
+class WindowAttention(nn.Layer):  # multi-head self-attention inside non-overlapping square windows
+    def __init__(self, dim, num_heads, window_size):
+        super().__init__()
+        self.window_size = window_size
+        self.num_heads = num_heads
+        self.scale = float(dim // num_heads) ** -0.5  # head_dim ** -0.5
+        self.qkv = nn.Linear(dim, dim * 3)
+        self.proj = nn.Linear(dim, dim)
+
+    def forward(self, x, size):
+        height, width = size
+        batch_size, _, channels = x.shape
+        x = x.reshape([batch_size, height, width, channels])
+        pad_right = (self.window_size - width % self.window_size) % self.window_size  # 0 when already a multiple
+        pad_bottom = (self.window_size - height % self.window_size) % self.window_size
+        if pad_right or pad_bottom:  # NOTE(review): no attention mask is built for the padded positions
+            x = F.pad(x.transpose([0, 3, 1, 2]), [0, pad_right, 0, pad_bottom]).transpose([0, 2, 3, 1])
+        padded_height, padded_width = x.shape[1:3]
+        x = window_partition(x, self.window_size).reshape([-1, self.window_size**2, channels])
+        qkv = self.qkv(x).reshape([-1, self.window_size**2, 3, self.num_heads, channels // self.num_heads])
+        q, k, v = paddle.unbind(qkv.transpose([2, 0, 3, 1, 4]), axis=0)
+        attention = F.softmax(paddle.matmul(q * self.scale, k.transpose([0, 1, 3, 2])), axis=-1)
+        x = paddle.matmul(attention, v).transpose([0, 2, 1, 3]).reshape([-1, self.window_size**2, channels])
+        x = self.proj(x).reshape([-1, self.window_size, self.window_size, channels])
+        x = window_reverse(x, batch_size, self.window_size, padded_height, padded_width)
+        return x[:, :height, :width].reshape([batch_size, height * width, channels]), size  # crop the padding
+
+
+class SpatialBlock(nn.Layer):  # residual stages: depth-wise conv -> window attention -> depth-wise conv -> MLP
+    def __init__(self, dim, num_heads, window_size, drop_path_rate):
+        super().__init__()
+        shared_drop = DropPath(drop_path_rate) if drop_path_rate > 0 else None
+        self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1))
+        self.window_attn = PreNorm(nn.LayerNorm(dim), WindowAttention(dim, num_heads, window_size), shared_drop)
+        self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1))
+        self.ffn = PreNorm(nn.LayerNorm(dim), Mlp(dim, dim * 4), shared_drop)
+
+    def forward(self, x, size):
+        # Each stage is a PreNorm wrapper that takes and returns (tokens, size).
+        for stage in (self.conv1, self.window_attn, self.conv2, self.ffn):
+            x, size = stage(x, size)
+        return x, size
+
+
+class ChannelBlock(nn.Layer):  # residual stages: depth-wise conv -> channel attention -> depth-wise conv -> MLP
+    def __init__(self, dim, groups, drop_path_rate):
+        super().__init__()
+        shared_drop = DropPath(drop_path_rate) if drop_path_rate > 0 else None
+        self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1))
+        self.channel_attn = PreNorm(nn.LayerNorm(dim), ChannelAttention(dim, groups), shared_drop)
+        self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1))
+        self.ffn = PreNorm(nn.LayerNorm(dim), Mlp(dim, dim * 4), shared_drop)
+
+    def forward(self, x, size):
+        # Each stage is a PreNorm wrapper that takes and returns (tokens, size).
+        for stage in (self.conv1, self.channel_attn, self.conv2, self.ffn):
+            x, size = stage(x, size)
+        return x, size
+
+
+class DaViTBlock(nn.Layer):  # one SpatialBlock followed by one ChannelBlock
+    def __init__(self, dim, num_heads, groups, window_size, spatial_drop, channel_drop):
+        super().__init__()
+        self.spatial_block = SpatialBlock(dim, num_heads, window_size, spatial_drop)
+        self.channel_block = ChannelBlock(dim, groups, channel_drop)
+
+    def forward(self, x, size):
+        # The spatial block's (tokens, size) result is fed straight into the channel block.
+        return self.channel_block(*self.spatial_block(x, size))
+
+
+class DaViT(nn.Layer):  # backbone: per stage, one ConvEmbed followed by depths[i] DaViTBlocks
+    def __init__(self, config):
+        super().__init__()
+        dpr = paddle.linspace(0, config.drop_path_rate, sum(config.depths) * 2).tolist()  # two rates per block
+        self.convs = nn.LayerList()
+        self.blocks = nn.LayerList()
+        offset = 0  # index in dpr of the current stage's first rate
+        for index, depth in enumerate(config.depths):
+            self.convs.append(
+                ConvEmbed(
+                    config.patch_size[index],
+                    3 if index == 0 else config.dim_embed[index - 1],  # stage 0 takes 3 input channels
+                    config.dim_embed[index],
+                    config.patch_stride[index],
+                    config.patch_padding[index],
+                    config.patch_prenorm[index],
+                )
+            )
+            self.blocks.append(
+                nn.LayerList(
+                    [
+                        DaViTBlock(
+                            config.dim_embed[index],
+                            config.num_heads[index],
+                            config.num_groups[index],
+                            config.window_size,
+                            dpr[offset + layer * 2],  # drop rate of the spatial block
+                            dpr[offset + layer * 2 + 1],  # drop rate of the channel block
+                        )
+                        for layer in range(depth)
+                    ]
+                )
+            )
+            offset += depth * 2
+
+    def forward_features_unpool(self, x):
+        size = (x.shape[2], x.shape[3])  # axes 2 and 3 of the input are used as the initial grid size
+        for conv, blocks in zip(self.convs, self.blocks):
+            x, size = conv(x, size)
+            for block in blocks:
+                x, size = block(x, size)
+        return x  # tokens of the last stage; the final grid size is discarded
+
+
+class Florence2Attention(nn.Layer):
+    """Multi-head attention; acts as cross-attention when ``key_value_states`` is supplied."""
+
+    def __init__(self, embed_dim, num_heads, dropout=0.0, is_decoder=False):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        # Queries are multiplied by head_dim ** -0.5 before the dot product.
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+        self.k_proj = nn.Linear(embed_dim, embed_dim)
+        self.v_proj = nn.Linear(embed_dim, embed_dim)
+        self.q_proj = nn.Linear(embed_dim, embed_dim)
+        self.out_proj = nn.Linear(embed_dim, embed_dim)
+
+    def _shape(self, tensor, seq_len, batch_size):
+        """Reshape to [batch, seq, heads, head_dim] and move the head axis to position 1."""
+        per_head = tensor.reshape([batch_size, seq_len, self.num_heads, self.head_dim])
+        return per_head.transpose([0, 2, 1, 3])
+
+    def forward(
+        self,
+        hidden_states,
+        key_value_states=None,
+        past_key_value=None,
+        attention_mask=None,
+        layer_head_mask=None,
+        output_attentions=False,
+    ):
+        """Attend from ``hidden_states`` to itself or to ``key_value_states``.
+
+        Returns a 3-tuple ``(output, attention weights or None, present)``, where
+        ``present`` is the ``(key, value)`` pair when ``is_decoder`` is set and ``None`` otherwise.
+        """
+        batch_size, target_length, _ = hidden_states.shape
+        scaled_query = self.q_proj(hidden_states) * self.scaling
+        query_states = self._shape(scaled_query, target_length, batch_size)
+
+        if key_value_states is None:
+            # Self-attention: project the current tokens and extend any cached keys/values.
+            key_states = self._shape(self.k_proj(hidden_states), target_length, batch_size)
+            value_states = self._shape(self.v_proj(hidden_states), target_length, batch_size)
+            if past_key_value is not None:
+                cached_keys, cached_values = past_key_value[0], past_key_value[1]
+                key_states = paddle.concat([cached_keys, key_states], axis=2)
+                value_states = paddle.concat([cached_values, value_states], axis=2)
+        elif past_key_value is not None and past_key_value[0].shape[2] == key_value_states.shape[1]:
+            # Cross-attention with a cache whose length matches the source: reuse it as is.
+            key_states, value_states = past_key_value
+        else:
+            # Cross-attention without a usable cache: project the source states.
+            source_length = key_value_states.shape[1]
+            key_states = self._shape(self.k_proj(key_value_states), source_length, batch_size)
+            value_states = self._shape(self.v_proj(key_value_states), key_value_states.shape[1], batch_size)
+
+        present = None
+        if self.is_decoder:
+            present = (key_states, value_states)
+
+        scores = paddle.matmul(query_states, key_states.transpose([0, 1, 3, 2]))
+        if attention_mask is not None:
+            scores = scores + attention_mask
+        attn_weights = F.softmax(scores, axis=-1)
+        if layer_head_mask is not None:
+            # One scalar per head, broadcast over batch and both sequence axes.
+            attn_weights = attn_weights * layer_head_mask.reshape([1, -1, 1, 1])
+        attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        context = paddle.matmul(attn_probs, value_states).transpose([0, 2, 1, 3])
+        context = context.reshape([batch_size, target_length, self.embed_dim])
+        output = self.out_proj(context)
+        # The reported weights are taken after head masking but before dropout.
+        reported_weights = attn_weights if output_attentions else None
+        return output, reported_weights, present
+
+
+class Florence2EncoderLayer(nn.Layer):
+    """Encoder layer: self-attention and a feed-forward block, each followed by residual add and LayerNorm."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.self_attn = Florence2Attention(config.d_model, config.encoder_attention_heads, config.attention_dropout)
+        self.self_attn_layer_norm = nn.LayerNorm(config.d_model)
+        self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, config.d_model)
+        self.final_layer_norm = nn.LayerNorm(config.d_model)
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.dropout = config.dropout
+        self.activation_dropout = config.activation_dropout
+
+    def forward(self, hidden_states, attention_mask=None, layer_head_mask=None, output_attentions=False):
+        """Return ``(hidden_states,)``, or ``(hidden_states, weights)`` when ``output_attentions`` is set."""
+        # Self-attention sub-layer.
+        attn_output, weights, _ = self.self_attn(
+            hidden_states,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        attn_output = F.dropout(attn_output, p=self.dropout, training=self.training)
+        hidden_states = self.self_attn_layer_norm(hidden_states + attn_output)
+
+        # Feed-forward sub-layer.
+        ffn_output = self.activation_fn(self.fc1(hidden_states))
+        ffn_output = F.dropout(ffn_output, p=self.activation_dropout, training=self.training)
+        ffn_output = self.fc2(ffn_output)
+        ffn_output = F.dropout(ffn_output, p=self.dropout, training=self.training)
+        hidden_states = self.final_layer_norm(hidden_states + ffn_output)
+
+        if output_attentions:
+            return (hidden_states, weights)
+        return (hidden_states,)
+
+
+class Florence2DecoderLayer(nn.Layer):
+    """Decoder layer: self-attention, optional cross-attention over encoder states, then a feed-forward block."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.self_attn = Florence2Attention(
+            config.d_model, config.decoder_attention_heads, config.attention_dropout, is_decoder=True
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(config.d_model)
+        self.encoder_attn = Florence2Attention(
+            config.d_model, config.decoder_attention_heads, config.attention_dropout, is_decoder=True
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(config.d_model)
+        self.fc1 = nn.Linear(config.d_model, config.decoder_ffn_dim)
+        self.fc2 = nn.Linear(config.decoder_ffn_dim, config.d_model)
+        self.final_layer_norm = nn.LayerNorm(config.d_model)
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.dropout = config.dropout
+        self.activation_dropout = config.activation_dropout
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        layer_head_mask=None,
+        cross_attn_layer_head_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+        use_cache=True,
+    ):
+        """Run one decoder layer.
+
+        Returns a tuple that starts with the new hidden states, followed by the self- and
+        cross-attention weights when ``output_attentions`` is set, followed by the key/value
+        cache when ``use_cache`` is set.
+        """
+        # The first two cache entries belong to self-attention, the last two to cross-attention.
+        self_cache = None if past_key_value is None else past_key_value[:2]
+        attn_output, self_weights, present = self.self_attn(
+            hidden_states,
+            past_key_value=self_cache,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        attn_output = F.dropout(attn_output, p=self.dropout, training=self.training)
+        hidden_states = self.self_attn_layer_norm(hidden_states + attn_output)
+
+        cross_weights = None
+        if encoder_hidden_states is not None:
+            cross_cache = None if past_key_value is None else past_key_value[-2:]
+            cross_output, cross_weights, cross_present = self.encoder_attn(
+                hidden_states,
+                key_value_states=encoder_hidden_states,
+                past_key_value=cross_cache,
+                attention_mask=encoder_attention_mask,
+                layer_head_mask=cross_attn_layer_head_mask,
+                output_attentions=output_attentions,
+            )
+            cross_output = F.dropout(cross_output, p=self.dropout, training=self.training)
+            hidden_states = self.encoder_attn_layer_norm(hidden_states + cross_output)
+            # Append the cross-attention key/value pair after the self-attention pair.
+            present = present + cross_present
+
+        ffn_output = self.activation_fn(self.fc1(hidden_states))
+        ffn_output = F.dropout(ffn_output, p=self.activation_dropout, training=self.training)
+        ffn_output = self.fc2(ffn_output)
+        ffn_output = F.dropout(ffn_output, p=self.dropout, training=self.training)
+        hidden_states = self.final_layer_norm(hidden_states + ffn_output)
+
+        results = [hidden_states]
+        if output_attentions:
+            results.extend([self_weights, cross_weights])
+        if use_cache:
+            results.append(present)
+        return tuple(results)
+
+
+class Florence2LearnedPositionalEmbedding(nn.Embedding):
+    """Learned position table whose lookups are shifted by a fixed offset of 2."""
+
+    def __init__(self, num_embeddings, embedding_dim):
+        # The table is enlarged by the offset so that shifted indices stay in range.
+        self.offset = 2
+        super().__init__(num_embeddings + self.offset, embedding_dim)
+
+    def forward(self, input_ids, past_key_values_length=0):
+        """Return position embeddings for ``input_ids``; only the first two axes of its shape are used."""
+        batch_size = input_ids.shape[0]
+        first = past_key_values_length
+        last = past_key_values_length + input_ids.shape[1]
+        positions = paddle.arange(first, last, dtype="int64")
+        positions = positions.unsqueeze(0).expand([batch_size, -1])
+        return super().forward(positions + self.offset)
+
+
+class Florence2Encoder(nn.Layer):
+    """Stack of ``Florence2EncoderLayer`` applied to token (or pre-computed) embeddings plus positions."""
+
+    def __init__(self, config, embed_tokens):
+        super().__init__()
+        self.config = config
+        self.embed_tokens = embed_tokens
+        self.embed_positions = Florence2LearnedPositionalEmbedding(config.max_position_embeddings, config.d_model)
+        self.layers = nn.LayerList([Florence2EncoderLayer(config) for _ in range(config.encoder_layers)])
+        self.layernorm_embedding = nn.LayerNorm(config.d_model)
+        self.dropout = config.dropout
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+        **kwargs,
+    ):
+        """Encode the inputs.
+
+        Returns a ``BaseModelOutput`` when ``return_dict`` is truthy; otherwise a tuple of the
+        last hidden state followed by whichever of the hidden-state and attention tuples were requested.
+        """
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # The position embedding only needs a [batch, seq] shaped tensor.
+        if input_ids is not None:
+            position_source = input_ids
+        else:
+            position_source = paddle.zeros(inputs_embeds.shape[:2], dtype="int64")
+        hidden_states = inputs_embeds + self.embed_positions(position_source)
+        hidden_states = self.layernorm_embedding(hidden_states)
+        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        expanded_mask = None
+        if attention_mask is not None:
+            expanded_mask = _expand_mask(attention_mask, hidden_states.dtype)
+
+        hidden_trace = []
+        attention_trace = []
+        for index, layer in enumerate(self.layers):
+            if output_hidden_states:
+                hidden_trace.append(hidden_states)
+            layer_outputs = layer(
+                hidden_states,
+                attention_mask=expanded_mask,
+                layer_head_mask=None if head_mask is None else head_mask[index],
+                output_attentions=output_attentions,
+            )
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                attention_trace.append(layer_outputs[1])
+        if output_hidden_states:
+            hidden_trace.append(hidden_states)
+
+        all_hidden_states = tuple(hidden_trace) if output_hidden_states else None
+        all_attentions = tuple(attention_trace) if output_attentions else None
+        if return_dict:
+            return BaseModelOutput(
+                last_hidden_state=hidden_states,
+                hidden_states=all_hidden_states,
+                attentions=all_attentions,
+            )
+        candidates = [hidden_states, all_hidden_states, all_attentions]
+        return tuple(item for item in candidates if item is not None)
+
+
+class Florence2Decoder(nn.Layer):
+    """Stack of ``Florence2DecoderLayer`` with causal masking and an optional key/value cache."""
+
+    def __init__(self, config, embed_tokens):
+        super().__init__()
+        self.config = config
+        self.embed_tokens = embed_tokens
+        self.embed_positions = Florence2LearnedPositionalEmbedding(config.max_position_embeddings, config.d_model)
+        self.layers = nn.LayerList([Florence2DecoderLayer(config) for _ in range(config.decoder_layers)])
+        self.layernorm_embedding = nn.LayerNorm(config.d_model)
+        self.dropout = config.dropout
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        head_mask=None,
+        cross_attn_head_mask=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        use_cache=True,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+        **kwargs,
+    ):
+        """Decode the inputs.
+
+        Returns a ``BaseModelOutputWithPastAndCrossAttentions`` when ``return_dict`` is truthy;
+        otherwise a tuple of the last hidden state followed by the cache, hidden states,
+        self-attentions and cross-attentions that were requested.
+        """
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # Number of positions already cached, read from axis 2 of the first cached key.
+        past_length = 0 if past_key_values is None else past_key_values[0][0].shape[2]
+        if input_ids is not None:
+            position_source = input_ids
+        else:
+            position_source = paddle.zeros(inputs_embeds.shape[:2], dtype="int64")
+        hidden_states = inputs_embeds + self.embed_positions(position_source, past_length)
+        hidden_states = self.layernorm_embedding(hidden_states)
+        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        batch_size, target_length = hidden_states.shape[0], hidden_states.shape[1]
+        mask_dtype = hidden_states.dtype
+        causal_mask = _causal_mask(batch_size, target_length, past_length, mask_dtype)
+        if attention_mask is not None:
+            causal_mask = causal_mask + _expand_mask(attention_mask, mask_dtype, target_length)
+        encoder_mask = None
+        if encoder_attention_mask is not None:
+            encoder_mask = _expand_mask(encoder_attention_mask, mask_dtype, target_length)
+
+        hidden_trace = []
+        self_attention_trace = []
+        cross_attention_trace = []
+        cache_trace = []
+        for index, layer in enumerate(self.layers):
+            if output_hidden_states:
+                hidden_trace.append(hidden_states)
+            layer_outputs = layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_mask,
+                layer_head_mask=None if head_mask is None else head_mask[index],
+                cross_attn_layer_head_mask=None if cross_attn_head_mask is None else cross_attn_head_mask[index],
+                past_key_value=None if past_key_values is None else past_key_values[index],
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                # When caching is on, the layer puts the cache at the end of its output tuple.
+                cache_trace.append(layer_outputs[-1])
+            if output_attentions:
+                self_attention_trace.append(layer_outputs[1])
+                cross_attention_trace.append(layer_outputs[2])
+        if output_hidden_states:
+            hidden_trace.append(hidden_states)
+
+        next_cache = tuple(cache_trace) if use_cache else None
+        all_hidden_states = tuple(hidden_trace) if output_hidden_states else None
+        all_self_attentions = tuple(self_attention_trace) if output_attentions else None
+        all_cross_attentions = tuple(cross_attention_trace) if output_attentions else None
+        if return_dict:
+            return BaseModelOutputWithPastAndCrossAttentions(
+                last_hidden_state=hidden_states,
+                past_key_values=next_cache,
+                hidden_states=all_hidden_states,
+                attentions=all_self_attentions,
+                cross_attentions=all_cross_attentions,
+            )
+        candidates = [hidden_states, next_cache, all_hidden_states, all_self_attentions, all_cross_attentions]
+        return tuple(item for item in candidates if item is not None)
+
+
+class Florence2LanguageModel(nn.Layer):
+    """Encoder-decoder text model whose encoder and decoder share one token embedding table."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.shared = nn.Embedding(config.vocab_size, config.d_model, padding_idx=config.pad_token_id)
+        self.encoder = Florence2Encoder(config, self.shared)
+        self.decoder = Florence2Decoder(config, self.shared)
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        decoder_input_ids=None,
+        decoder_attention_mask=None,
+        encoder_output=None,
+        encoder_outputs=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        decoder_inputs_embeds=None,
+        use_cache=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+        **kwargs,
+    ):
+        """Run the encoder (unless its output is supplied) and then the decoder.
+
+        ``encoder_output`` and ``encoder_outputs`` are aliases; ``encoder_outputs`` wins when
+        both are given. The value may be a model-output object, a tuple/list whose first
+        item is the last hidden state, or the last-hidden-state tensor itself.
+
+        Returns a ``Seq2SeqModelOutput`` when ``return_dict`` is truthy, otherwise the decoder
+        output tuple followed by the encoder output tuple.
+
+        Raises:
+            ValueError: if no decoder inputs are given and ``input_ids`` is ``None``, so
+                decoder inputs cannot be derived by shifting.
+        """
+        if decoder_input_ids is None and decoder_inputs_embeds is None:
+            if input_ids is None:
+                raise ValueError(
+                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`."
+                )
+            decoder_input_ids = shift_tokens_right(
+                input_ids,
+                self.config.pad_token_id,
+                self.config.decoder_start_token_id,
+            )
+        encoder_outputs = encoder_outputs if encoder_outputs is not None else encoder_output
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        elif isinstance(encoder_outputs, paddle.Tensor):
+            # Indexing a bare tensor with [0] would drop the batch axis; wrap it so the
+            # generic "first item is the last hidden state" path below applies.
+            encoder_outputs = (encoder_outputs,)
+        encoder_hidden_states = (
+            encoder_outputs.last_hidden_state if hasattr(encoder_outputs, "last_hidden_state") else encoder_outputs[0]
+        )
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        if not return_dict:
+            # Caller-supplied encoder outputs may be a model-output object or a list, neither
+            # of which can be concatenated to a tuple directly.
+            if hasattr(encoder_outputs, "to_tuple"):
+                encoder_outputs = encoder_outputs.to_tuple()
+            return decoder_outputs + tuple(encoder_outputs)
+        return Seq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_hidden_states,
+            encoder_hidden_states=getattr(encoder_outputs, "hidden_states", None),
+            encoder_attentions=getattr(encoder_outputs, "attentions", None),
+        )
+
+
+class Florence2LanguagePretrainedModel(PretrainedModel):
+    """Base class for the Florence-2 language models, bound to ``Florence2LanguageConfig``."""
+
+    config_class = Florence2LanguageConfig
+    base_model_prefix = "model"
+    # Names of the linear layers used by the attention and feed-forward blocks.
+    # NOTE(review): presumably consumed by the PretrainedModel weight loader to transpose
+    # checkpoint matrices -- confirm against the base class.
+    transpose_weight_keys = ["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"]
+
+
+class Florence2LanguageForConditionalGeneration(Florence2LanguagePretrainedModel, GenerationMixin):
+    """Language model with an output head tied to the shared embedding table, plus a logits bias buffer."""
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.is_encoder_decoder = True
+        self.model = Florence2LanguageModel(config)
+        self.register_buffer("final_logits_bias", paddle.zeros([1, config.vocab_size]), persistable=True)
+
+    def get_encoder(self):
+        """Return the text encoder."""
+        return self.model.encoder
+
+    def get_decoder(self):
+        """Return the text decoder."""
+        return self.model.decoder
+
+    def get_input_embeddings(self):
+        """Return the embedding table shared by encoder and decoder."""
+        return self.model.shared
+
+    def set_input_embeddings(self, value):
+        """Install ``value`` as the shared embedding and reset the vocabulary-sized state to match."""
+        text_model = self.model
+        text_model.shared = value
+        text_model.encoder.embed_tokens = value
+        text_model.decoder.embed_tokens = value
+        vocab_size = value.weight.shape[0]
+        self.config.vocab_size = vocab_size
+        # The bias is re-created as zeros with the new vocabulary size.
+        self.final_logits_bias = paddle.zeros([1, vocab_size], dtype=value.weight.dtype)
+
+    def forward(self, labels=None, return_dict=True, **kwargs):
+        """Compute logits and, when ``labels`` is given, the cross-entropy loss.
+
+        When ``labels`` is given without ``decoder_input_ids``, the decoder inputs are
+        derived from the labels with ``shift_tokens_right``.
+        """
+        derive_decoder_inputs = labels is not None and kwargs.get("decoder_input_ids") is None
+        if derive_decoder_inputs:
+            kwargs["decoder_input_ids"] = shift_tokens_right(
+                labels, self.config.pad_token_id, self.config.decoder_start_token_id
+            )
+        outputs = self.model(return_dict=return_dict, **kwargs)
+        hidden_states = outputs.last_hidden_state if return_dict else outputs[0]
+        # The output projection reuses the shared embedding weights (transposed).
+        projected = paddle.matmul(hidden_states, self.model.shared.weight, transpose_y=True)
+        logits = projected + self.final_logits_bias
+
+        loss = None
+        if labels is not None:
+            flat_logits = logits.reshape([-1, self.config.vocab_size])
+            flat_labels = labels.reshape([-1])
+            loss = F.cross_entropy(flat_logits, flat_labels, ignore_index=-100)
+
+        if return_dict:
+            return Seq2SeqLMOutput(
+                loss=loss,
+                logits=logits,
+                past_key_values=outputs.past_key_values,
+                decoder_hidden_states=outputs.decoder_hidden_states,
+                decoder_attentions=outputs.decoder_attentions,
+                cross_attentions=outputs.cross_attentions,
+                encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+                encoder_hidden_states=outputs.encoder_hidden_states,
+                encoder_attentions=outputs.encoder_attentions,
+            )
+        head = (logits,) if loss is None else (loss, logits)
+        return head + outputs[1:]
+
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
+        """Build the keyword arguments for one generation step."""
+        # With a cache, only the newest token has to be fed to the decoder.
+        decoder_ids = input_ids if past_key_values is None else input_ids[:, -1:]
+        return {
+            "decoder_input_ids": decoder_ids,
+            "past_key_values": past_key_values,
+            "encoder_output": kwargs.get("encoder_output"),
+            "attention_mask": kwargs.get("attention_mask"),
+            "use_cache": kwargs.get("use_cache", True),
+            "return_dict": True,
+        }
+
+    def _reorder_cache(self, past_key_values, beam_idx):
+        """Select rows ``beam_idx`` along axis 0 of every cached tensor, keeping the nested tuple layout."""
+        reordered = []
+        for layer_cache in past_key_values:
+            reordered.append(tuple(paddle.index_select(state, beam_idx, axis=0) for state in layer_cache))
+        return tuple(reordered)
+
+
+class Florence2PretrainedModel(PretrainedModel):
+    """Base class for the full Florence-2 models, bound to ``Florence2Config``."""
+
+    config_class = Florence2Config
+    base_model_prefix = ""
+    # The encoder and decoder ``embed_tokens`` are the same object as the shared embedding,
+    # so these two keys are listed as ignorable when missing.
+    # NOTE(review): presumably the checkpoint only stores the shared table -- confirm.
+    _keys_to_ignore_on_load_missing = [
+        r"language_model.model.encoder.embed_tokens.weight",
+        r"language_model.model.decoder.embed_tokens.weight",
+    ]
+    # Layer-name fragments for the vision tower ("qkv", "proj") and the language model.
+    # NOTE(review): presumably consumed by the PretrainedModel weight loader to transpose
+    # checkpoint matrices -- confirm against the base class.
+    transpose_weight_keys = [
+        "qkv",
+        "proj",
+        "q_proj",
+        "k_proj",
+        "v_proj",
+        "out_proj",
+        "fc1",
+        "fc2",
+    ]
+
+
+class Florence2VisionModel(Florence2PretrainedModel):
+    """Standalone wrapper that exposes only the DaViT vision tower."""
+
+    main_input_name = "pixel_values"
+
+    def __init__(self, config: Florence2VisionConfig):
+        super().__init__(config)
+        self.vision_tower = DaViT(config)
+
+    def forward(self, pixel_values):
+        """Return the unpooled features the vision tower computes for ``pixel_values``."""
+        tower = self.vision_tower
+        return tower.forward_features_unpool(pixel_values)
+
+
+class Florence2ForConditionalGeneration(Florence2PretrainedModel, GenerationMixin):
+    """Florence-2 vision-language model.
+
+    Image features from the DaViT tower are projected and prepended to the text embeddings;
+    the combined sequence is fed to the encoder-decoder language model.
+    """
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.is_encoder_decoder = True
+        self.vision_tower = DaViT(config.vision_config)
+        image_dim = config.vision_config.dim_embed[-1]
+        projection_dim = config.vision_config.projection_dim
+        self.image_projection = self.create_parameter([image_dim, projection_dim])
+        self.image_proj_norm = nn.LayerNorm(projection_dim)
+        self.image_pos_embed = LearnedAbsolutePositionEmbedding2D(
+            image_dim, config.vision_config.image_pos_embed["max_pos_embeddings"]
+        )
+        self.visual_temporal_embed = PositionalEmbeddingCosine1D(
+            image_dim, config.vision_config.visual_temporal_embedding["max_temporal_embeddings"]
+        )
+        self.image_feature_source = config.vision_config.image_feature_source
+        self.language_model = Florence2LanguageForConditionalGeneration(config.text_config)
+
+    def get_encoder(self):
+        """Return the language model's encoder."""
+        return self.language_model.get_encoder()
+
+    def get_decoder(self):
+        """Return the language model's decoder."""
+        return self.language_model.get_decoder()
+
+    def get_input_embeddings(self):
+        """Return the language model's shared token embedding."""
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        """Install ``value`` as the token embedding and record its vocabulary size in both configs."""
+        self.language_model.set_input_embeddings(value)
+        self.config.vocab_size = value.weight.shape[0]
+        self.config.text_config.vocab_size = value.weight.shape[0]
+
+    def _encode_image(self, pixel_values):
+        """Turn ``pixel_values`` into projected, normalised image tokens.
+
+        Raises:
+            ValueError: if the vision tower does not produce a square number of tokens, or
+                if ``image_feature_source`` names an unsupported feature.
+        """
+        batch_size = pixel_values.shape[0]
+        x = self.vision_tower.forward_features_unpool(pixel_values)
+        num_tokens, channels = x.shape[1], x.shape[-1]
+        # The tokens are laid out on a square grid for the 2D position embedding; fail
+        # with a clear message instead of a reshape error when that does not hold.
+        height = width = int(round(num_tokens**0.5))
+        if height * width != num_tokens:
+            raise ValueError(f"Expected a square grid of image tokens, but got {num_tokens} tokens.")
+        x = x.reshape([batch_size, height, width, channels])
+        x = x + self.image_pos_embed(x)
+        # Insert a length-1 axis at position 1 (treated as the temporal axis by the pooling below).
+        x = x.reshape([batch_size, 1, num_tokens, channels])
+        x = x + self.visual_temporal_embed(x[:, :, 0]).reshape([1, 1, 1, channels])
+        # Only compute the features that were actually requested.
+        selected = []
+        for source in self.image_feature_source:
+            if source == "spatial_avg_pool":
+                selected.append(x.mean(axis=2))
+            elif source == "temporal_avg_pool":
+                selected.append(x.mean(axis=1))
+            elif source == "last_frame":
+                selected.append(x[:, -1])
+            else:
+                raise ValueError(f"Unsupported image feature source: {source!r}")
+        x = paddle.concat(selected, axis=1)
+        return self.image_proj_norm(paddle.matmul(x, self.image_projection))
+
+    def _merge_image_features(self, image_features, inputs_embeds, attention_mask=None):
+        """Prepend image tokens to the text embeddings and build the matching attention mask.
+
+        Image positions are always marked as attended; text positions use ``attention_mask``
+        when given and are all ones otherwise.
+        """
+        image_mask = paddle.ones(image_features.shape[:2], dtype=inputs_embeds.dtype)
+        text_mask = (
+            attention_mask.astype(inputs_embeds.dtype)
+            if attention_mask is not None
+            else paddle.ones(inputs_embeds.shape[:2], dtype=inputs_embeds.dtype)
+        )
+        return paddle.concat([image_features, inputs_embeds], axis=1), paddle.concat([image_mask, text_mask], axis=1)
+
+    def _split_sft_inputs(self, input_ids, labels, attention_mask):
+        """Split packed SFT rows into encoder inputs and decoder labels.
+
+        Returns ``(source_ids, decoder_labels, source_mask)``, each padded to the longest row.
+        ``attention_mask`` is accepted for call-site symmetry but is not used; the source
+        mask is rebuilt from the source lengths.
+        """
+        source_rows, label_rows = [], []
+        max_source = 1
+        max_target = 1
+        for row, label_row in zip(input_ids.tolist(), labels.tolist()):
+            target_start = next((index for index, value in enumerate(label_row) if value != -100), len(row))
+            # PaddleFormers SFT labels are shifted left once, so the first
+            # supervised label predicts the token after this source position.
+            source = row[: target_start + 1] or [self.config.bos_token_id]
+            # Rows without any supervised label fall back to a single EOS target.
+            target = [value for value in label_row[target_start:] if value != -100] or [self.config.eos_token_id]
+            source_rows.append(source)
+            label_rows.append(target)
+            max_source = max(max_source, len(source))
+            max_target = max(max_target, len(target))
+        source_ids = paddle.full([len(source_rows), max_source], self.config.pad_token_id, dtype=input_ids.dtype)
+        source_mask = paddle.zeros([len(source_rows), max_source], dtype="int64")
+        decoder_labels = paddle.full([len(label_rows), max_target], -100, dtype=labels.dtype)
+        for index, (source, target) in enumerate(zip(source_rows, label_rows)):
+            source_ids[index, : len(source)] = paddle.to_tensor(source, dtype=input_ids.dtype)
+            source_mask[index, : len(source)] = 1
+            decoder_labels[index, : len(target)] = paddle.to_tensor(target, dtype=labels.dtype)
+        return source_ids, decoder_labels, source_mask
+
+    def forward(
+        self,
+        input_ids=None,
+        pixel_values=None,
+        attention_mask=None,
+        decoder_input_ids=None,
+        decoder_attention_mask=None,
+        encoder_output=None,
+        encoder_outputs=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        decoder_inputs_embeds=None,
+        labels=None,
+        use_cache=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+        **kwargs,
+    ):
+        """Run the full model and return the language model's output.
+
+        When ``labels`` has the same shape as ``input_ids`` the pair is treated as a packed
+        SFT sample and split into encoder inputs and decoder labels first. ``pixel_values``
+        is only consumed when the text embeddings are computed here, i.e. when neither
+        ``inputs_embeds`` nor encoder outputs are supplied.
+
+        Raises:
+            ValueError: if none of ``input_ids``, ``inputs_embeds`` and encoder outputs is given.
+        """
+        if labels is not None and input_ids is not None and labels.shape == input_ids.shape:
+            input_ids, labels, attention_mask = self._split_sft_inputs(input_ids, labels, attention_mask)
+        # ``encoder_output`` and ``encoder_outputs`` are aliases; the former wins when both are set.
+        if encoder_output is None:
+            encoder_output = encoder_outputs
+        if encoder_output is None and inputs_embeds is None:
+            if input_ids is None:
+                raise ValueError("You have to specify either `input_ids` or `inputs_embeds`.")
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+            if pixel_values is not None:
+                inputs_embeds, attention_mask = self._merge_image_features(
+                    self._encode_image(pixel_values),
+                    inputs_embeds,
+                    attention_mask,
+                )
+        return self.language_model(
+            input_ids=None if inputs_embeds is not None else input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            encoder_output=encoder_output,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            labels=labels,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+    def generate(self, input_ids=None, pixel_values=None, inputs_embeds=None, attention_mask=None, **kwargs):
+        """Embed the prompt (merging image features when given) and delegate to the language model.
+
+        Raises:
+            ValueError: if neither ``input_ids`` nor ``inputs_embeds`` is given.
+        """
+        if inputs_embeds is None:
+            if input_ids is None:
+                raise ValueError("You have to specify either `input_ids` or `inputs_embeds`.")
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+            if pixel_values is not None:
+                inputs_embeds, attention_mask = self._merge_image_features(
+                    self._encode_image(pixel_values), inputs_embeds, attention_mask
+                )
+        return self.language_model.generate(inputs_embeds=inputs_embeds, attention_mask=attention_mask, **kwargs)
diff --git a/paddleformers/transformers/florence2/processor.py b/paddleformers/transformers/florence2/processor.py
new file mode 100644
index 00000000000..907455c7c35
--- /dev/null
+++ b/paddleformers/transformers/florence2/processor.py
@@ -0,0 +1,219 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+
+import re
+
+from ..image_processing_utils import BatchFeature
+from ..image_utils import ImageInput
+from ..processing_utils import ProcessorMixin
+from ..tokenizer_utils_base import PreTokenizedInput, TextInput
+
+__all__ = ["Florence2Processor"]
+
+
+class Florence2Processor(ProcessorMixin):
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = ("BartTokenizer", "BartTokenizerFast")
+
+    @classmethod
+    def _load_tokenizer_from_pretrained(
+        cls,
+        sub_processor_type,
+        pretrained_model_name_or_path,
+        subfolder="",
+        **kwargs,
+    ):
+        """Load the tokenizer via the parent loader, defaulting ``tokenizer_type`` to ``"bart"``."""
+        kwargs = {"tokenizer_type": "bart", **kwargs}  # an explicit caller value still wins
+        parent_loader = super()._load_tokenizer_from_pretrained
+        return parent_loader(sub_processor_type, pretrained_model_name_or_path, subfolder=subfolder, **kwargs)
+
+    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
+        """Add the Florence-2 special tokens to ``tokenizer`` and build the task-token prompt tables."""
+        if image_processor is None or tokenizer is None:
+            raise ValueError("Florence2Processor requires both an image processor and a tokenizer.")
+        tokens = (
+            ["<od>", "</od>", "<ocr>", "</ocr>"]
+            + [f"<loc_{index}>" for index in range(1000)]
+            + [
+                "<cap>",
+                "</cap>",
+                "<ncap>",
+                "</ncap>",
+                "<dcap>",
+                "</dcap>",
+                "<grounding>",
+                "</grounding>",
+                "<seg>",
+                "</seg>",
+                "<sep>",
+                "<region_cap>",
+                "</region_cap>",
+                "<region_to_desciption>",  # sic; NOTE(review): presumably matches the pretrained vocab, confirm
+                "</region_to_desciption>",
+                "<proposal>",
+                "</proposal>",
+                "<poly>",
+                "</poly>",
+                "<and>",
+            ]
+        )
+        existing = list(getattr(tokenizer, "additional_special_tokens", None) or [])
+        # Order-preserving de-dup: tokens the tokenizer already lists are not appended a second time.
+        tokenizer.add_special_tokens({"additional_special_tokens": list(dict.fromkeys(existing + tokens))})
+        self.image_seq_length = getattr(image_processor, "image_seq_length", 577)
+        self.task_prompts_without_inputs = {
+            "<OCR>": "What is the text in the image?",
+            "<OCR_WITH_REGION>": "What is the text in the image, with regions?",
+            "<CAPTION>": "What does the image describe?",
+            "<DETAILED_CAPTION>": "Describe in detail what is shown in the image.",
+            "<MORE_DETAILED_CAPTION>": "Describe with a paragraph what is shown in the image.",
+            "<OD>": "Locate the objects with category name in the image.",
+            "<DENSE_REGION_CAPTION>": "Locate the objects in the image, with their descriptions.",
+            "<REGION_PROPOSAL>": "Locate the region proposals in the image.",
+        }
+        self.task_prompts_with_input = {
+            "<CAPTION_TO_PHRASE_GROUNDING>": "Locate the phrases in the caption: {input}",
+            "<REFERRING_EXPRESSION_SEGMENTATION>": "Locate {input} in the image with mask",
+            "<REGION_TO_SEGMENTATION>": "What is the polygon mask of region {input}",
+            "<OPEN_VOCABULARY_DETECTION>": "Locate {input} in the image.",
+            "<REGION_TO_CATEGORY>": "What is the region {input}?",
+            "<REGION_TO_DESCRIPTION>": "What does the region {input} describe?",
+            "<REGION_TO_OCR>": "What text is in the region {input}?",
+        }
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    def _construct_prompts(self, texts):
+        """Expand the Florence-2 task token found in each text into its natural-language prompt."""
+        prompts = []
+        for text in texts:
+            task = next((name for name in self.task_prompts_without_inputs if name in text), None)
+            if task is not None:
+                if text != task:
+                    raise ValueError(f"Task token {task} must be the only token in the prompt.")
+                text = self.task_prompts_without_inputs[task]
+            # For tasks that take input, whatever surrounds the token fills the ``{input}`` slot.
+            templated = next((name for name in self.task_prompts_with_input if name in text), None)
+            if templated is not None:
+                text = self.task_prompts_with_input[templated].format(input=text.replace(templated, ""))
+            prompts.append(text)
+        return prompts
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
+        return_tensors="pd",
+        padding=False,
+        truncation=None,
+        max_length=None,
+        **kwargs,
+    ):
+        """Build a ``BatchFeature`` from ``images`` and Florence-2 prompt ``text``.
+
+        Task tokens in ``text`` are expanded by ``_construct_prompts`` before tokenization. When
+        ``max_length`` is given, ``self.image_seq_length`` is subtracted from it before it reaches
+        the tokenizer. Of ``kwargs``, only the names in ``image_keys`` with a non-``None`` value are
+        passed to the image processor; everything else is ignored.
+
+        Raises:
+            ValueError: If ``images`` is ``None``, or is a list shorter than the list of prompts.
+        """
+        if images is None:
+            raise ValueError("`images` must be provided to Florence2Processor.")
+        texts = text if isinstance(text, list) else [text or ""]
+        if isinstance(images, list) and len(images) < len(texts):
+            raise ValueError("Each Florence-2 prompt must have an associated image.")
+
+        # Allow-list of keyword arguments that are forwarded to the image processor.
+        image_keys = {"do_resize", "do_normalize", "image_mean", "image_std", "data_format"}
+        image_keys |= {"input_data_format", "resample", "do_convert_rgb", "do_rescale"}
+        image_kwargs = {key: value for key, value in kwargs.items() if value is not None and key in image_keys}
+        image_inputs = self.image_processor(images=images, return_tensors=return_tensors, **image_kwargs)
+
+        text_max_length = None if max_length is None else max_length - self.image_seq_length
+        text_inputs = self.tokenizer(
+            self._construct_prompts(texts),
+            return_tensors=return_tensors,
+            padding=padding,
+            truncation=truncation,
+            max_length=text_max_length,
+            return_token_type_ids=False,
+        )
+
+        # Image entries are unpacked last, so they take precedence on any key shared with the text entries.
+        merged = {**text_inputs, **image_inputs}
+        return BatchFeature(data=merged, tensor_type=return_tensors)
+
+    def batch_decode(self, *args, **kwargs):  # forwards every argument to the tokenizer's batch_decode
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):  # forwards every argument to the tokenizer's decode
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def model_input_names(self):  # tokenizer names, then image-processor names, de-duplicated in order
+        return list(dict.fromkeys(self.tokenizer.model_input_names + self.image_processor.model_input_names))
+
+    @staticmethod
+    def _dequantize(values, image_size):
+        """Map alternating x/y bin indices (1000 bins per axis) to bin-center coordinates in ``image_size``."""
+        width, height = image_size
+        # Even positions are scaled by the width, odd positions by the height.
+        extents = (width, height)
+        return [(value + 0.5) * extents[index % 2] / 1000 for index, value in enumerate(values)]
+
+    def post_process_generation(self, text, task, image_size):
+        """Parse generated ``text`` for ``task`` and return the result as ``{task: parsed}``.
+
+        Text-only tasks yield the cleaned string; region tasks yield labels plus ``quad_boxes``,
+        ``polygons`` or ``bboxes`` rescaled to ``image_size`` by ``_dequantize``.
+        """
+        cleaned = text.replace("<s>", "").replace("</s>", "").replace("<pad>", "")
+        loc = r"<loc_(\d+)>"
+        text_only = {
+            "<OCR>",
+            "<CAPTION>",
+            "<DETAILED_CAPTION>",
+            "<MORE_DETAILED_CAPTION>",
+            "<REGION_TO_CATEGORY>",
+            "<REGION_TO_DESCRIPTION>",
+            "<REGION_TO_OCR>",
+        }
+        if task in text_only:
+            return {task: cleaned}
+
+        if task == "<OCR_WITH_REGION>":
+            # Each hit is the recognized text followed by eight location values.
+            hits = re.findall(r"(.+?)" + loc * 8, cleaned)
+            quads = [self._dequantize([int(value) for value in hit[1:]], image_size) for hit in hits]
+            return {task: {"quad_boxes": quads, "labels": [hit[0] for hit in hits]}}
+
+        if task in {"<REFERRING_EXPRESSION_SEGMENTATION>", "<REGION_TO_SEGMENTATION>"}:
+            polygons, labels = [], []
+            pattern = r"([^<]*)(?:<poly>)?((?:<loc_\d+>|<sep>)+)(?:</poly>)?"
+            for phrase, encoded in re.findall(pattern, cleaned):
+                rings = [[int(value) for value in re.findall(loc, part)] for part in encoded.split("<sep>")]
+                # Keep polygons with an even number of values and at least three points.
+                usable = [ring for ring in rings if len(ring) >= 6 and len(ring) % 2 == 0]
+                if usable:
+                    polygons.append([self._dequantize(ring, image_size) for ring in usable])
+                    labels.append(phrase.strip())
+            return {task: {"polygons": polygons, "labels": labels}}
+
+        if task == "<REGION_PROPOSAL>":
+            coords = [int(value) for value in re.findall(loc, cleaned)]
+            # Take values four at a time; an incomplete trailing group is dropped.
+            starts = range(0, len(coords) - 3, 4)
+            bboxes = [self._dequantize(coords[start : start + 4], image_size) for start in starts]
+            return {task: {"bboxes": bboxes, "labels": [""] * len(bboxes)}}
+
+        # Any other task is parsed as a phrase followed by one or more four-value boxes.
+        bboxes, labels = [], []
+        for phrase, encoded in re.findall(r"([^<]+)((?:<loc_\d+>){4,})", cleaned):
+            for box in re.findall(loc * 4, encoded):
+                bboxes.append(self._dequantize([int(value) for value in box], image_size))
+                labels.append(phrase.strip())
+        return {task: {"bboxes": bboxes, "labels": labels}}
diff --git a/tests/transformers/florence2/__init__.py b/tests/transformers/florence2/__init__.py
new file mode 100644
index 00000000000..8b137891791
--- /dev/null
+++ b/tests/transformers/florence2/__init__.py
@@ -0,0 +1 @@
+
diff --git a/tests/transformers/florence2/test_modeling.py b/tests/transformers/florence2/test_modeling.py
new file mode 100644
index 00000000000..3ba1138cdc5
--- /dev/null
+++ b/tests/transformers/florence2/test_modeling.py
@@ -0,0 +1,327 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+
+import tempfile
+import unittest
+
+import numpy as np
+import paddle
+
+from paddleformers.generation import BeamSearchScorer, LogitsProcessorList
+from paddleformers.transformers import Florence2Config, Florence2ForConditionalGeneration
+from paddleformers.transformers.model_outputs import BaseModelOutput
+from tests.transformers.test_configuration_common import ConfigTester
+from tests.transformers.test_generation_utils import GenerationTesterMixin
+from tests.transformers.test_modeling_common import (
+    ModelTesterMixin,
+    ModelTesterPretrainedMixin,
+    floats_tensor,
+    ids_tensor,
+)
+
+
+class Florence2ModelTester:  # builds a small Florence2 config and random inputs for the tests in this file
+    def __init__(
+        self,
+        parent,
+        batch_size=2,
+        seq_length=5,
+        decoder_seq_length=4,
+        image_size=32,
+        vocab_size=100,
+        hidden_size=32,
+        encoder_layers=2,
+        decoder_layers=2,
+        num_attention_heads=4,
+        is_training=False,
+    ):
+        self.parent = parent  # the TestCase whose assert* methods are used in create_and_check_model
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.decoder_seq_length = decoder_seq_length
+        self.image_size = image_size
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.encoder_layers = encoder_layers
+        self.decoder_layers = decoder_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_hidden_layers = encoder_layers
+        self.expected_num_hidden_layers = encoder_layers + 1
+        self.is_training = is_training
+
+        self.image_feature_length = 2  # NOTE(review): presumably one per image_feature_source entry; confirm
+        self.encoder_seq_length = seq_length + self.image_feature_length
+        self.decoder_key_length = decoder_seq_length
+
+    def get_config(self):  # returns a Florence2Config sized from the attributes set in __init__
+        return Florence2Config(
+            vision_config={
+                "depths": [1, 1, 1, 1],
+                "dim_embed": [16, 32, 64, 128],
+                "num_heads": [2, 4, 8, 16],
+                "num_groups": [2, 4, 8, 16],
+                "window_size": 4,
+                "projection_dim": self.hidden_size,
+                "drop_path_rate": 0.0,
+                "image_feature_source": ["spatial_avg_pool", "temporal_avg_pool"],
+            },
+            text_config={
+                "vocab_size": self.vocab_size,
+                "d_model": self.hidden_size,
+                "encoder_layers": self.encoder_layers,
+                "decoder_layers": self.decoder_layers,
+                "encoder_attention_heads": self.num_attention_heads,
+                "decoder_attention_heads": self.num_attention_heads,
+                "encoder_ffn_dim": self.hidden_size * 2,
+                "decoder_ffn_dim": self.hidden_size * 2,
+                "max_position_embeddings": 128,
+                "dropout": 0.0,
+                "attention_dropout": 0.0,
+                "activation_dropout": 0.0,
+                "use_cache": True,
+            },
+            projection_dim=self.hidden_size,
+            vocab_size=self.vocab_size,
+            pad_token_id=1,
+            bos_token_id=0,
+            eos_token_id=2,
+        )
+
+    def prepare_config_and_inputs(self):  # returns (config, input_ids, mask, decoder ids, labels, pixels)
+        config = self.get_config()
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size, dtype=paddle.int64)
+        decoder_input_ids = ids_tensor(
+            [self.batch_size, self.decoder_seq_length], self.vocab_size, dtype=paddle.int64
+        )
+        labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size, dtype=paddle.int64)
+        attention_mask = paddle.ones([self.batch_size, self.seq_length], dtype="int64")
+        pixel_values = floats_tensor([self.batch_size, 3, self.image_size, self.image_size])
+        return config, input_ids, attention_mask, decoder_input_ids, labels, pixel_values
+
+    def prepare_config_and_inputs_for_common(self):  # same inputs as a kwargs dict, labels dropped
+        config, input_ids, attention_mask, decoder_input_ids, _, pixel_values = self.prepare_config_and_inputs()
+        inputs_dict = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "decoder_input_ids": decoder_input_ids,
+            "pixel_values": pixel_values,
+            "use_cache": False,
+        }
+        return config, inputs_dict
+
+    def create_and_check_model(self, config, input_ids, attention_mask, decoder_input_ids, labels, pixel_values):
+        model = Florence2ForConditionalGeneration(config)
+        model.eval()
+
+        with paddle.no_grad():
+            outputs = model(  # forward pass with explicit decoder_input_ids, no labels
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                decoder_input_ids=decoder_input_ids,
+                pixel_values=pixel_values,
+                use_cache=False,
+            )
+            loss_outputs = model(  # forward pass with labels and no decoder_input_ids
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                pixel_values=pixel_values,
+                labels=labels,
+                use_cache=False,
+            )
+
+        self.parent.assertEqual(
+            list(outputs.logits.shape), [self.batch_size, self.decoder_seq_length, self.vocab_size]
+        )
+        self.parent.assertEqual(loss_outputs.loss.ndim, 0)  # the loss is expected to be a 0-d tensor
+
+
+class Florence2ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+    base_model_class = Florence2ForConditionalGeneration
+    all_model_classes = (Florence2ForConditionalGeneration,)
+    all_generative_model_classes = {Florence2ForConditionalGeneration: (None, "florence2")}
+    is_encoder_decoder = True
+    has_attentions = False
+    test_mismatched_shapes = False
+
+    def setUp(self):
+        self.model_tester = Florence2ModelTester(self)
+        self.config_tester = ConfigTester(
+            self,
+            config_class=Florence2Config,
+            common_properties=["vocab_size"],
+            vocab_size=100,
+            projection_dim=32,
+        )
+
+    def test_config(self):
+        self.config_tester.create_and_test_config_common_properties()
+        self.config_tester.create_and_test_config_to_json_string()
+        self.config_tester.create_and_test_config_to_json_file()
+        self.config_tester.create_and_test_config_from_and_save_pretrained()
+        self.config_tester.create_and_test_config_with_num_classes()
+        self.config_tester.create_and_test_config_with_num_labels()
+        self.config_tester.check_config_can_be_init_without_params()
+
+    def test_florence2_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    def test_save_load(self):  # runs the inherited check unchanged
+        super().test_save_load()
+
+    def test_determinism(self):  # runs the inherited check unchanged
+        super().test_determinism()
+
+    def test_hidden_states_output(self):  # runs the inherited check unchanged
+        super().test_hidden_states_output()
+
+    def test_resize_tokens_embeddings(self):  # runs the inherited check unchanged
+        super().test_resize_tokens_embeddings()
+
+    def _get_generation_inputs(self):  # first sample of the common inputs, i.e. batch size 1
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        input_ids = inputs_dict["input_ids"][:1].clone()
+        attention_mask = inputs_dict["attention_mask"][:1].clone()
+        pixel_values = inputs_dict["pixel_values"][:1].clone()
+        return config, input_ids, attention_mask, pixel_values
+
+    def test_greedy_generate(self):
+        config, input_ids, attention_mask, pixel_values = self._get_generation_inputs()
+        model = Florence2ForConditionalGeneration(config)
+        model.eval()
+
+        with paddle.no_grad():
+            generated = model.generate(
+                input_ids=input_ids,
+                pixel_values=pixel_values,
+                attention_mask=attention_mask,
+                max_new_tokens=3,
+                decode_strategy="greedy_search",
+            )[0]
+
+        self.assertEqual(generated.shape[0], input_ids.shape[0])
+        self.assertGreaterEqual(generated.shape[1], 1)
+
+    def test_beam_search_generate(self):
+        config, input_ids, attention_mask, pixel_values = self._get_generation_inputs()
+        model = Florence2ForConditionalGeneration(config)
+        model.eval()
+
+        with paddle.no_grad():
+            inputs_embeds = model.get_input_embeddings()(input_ids)
+            image_features = model._encode_image(pixel_values)
+            inputs_embeds, encoder_attention_mask = model._merge_image_features(
+                image_features, inputs_embeds, attention_mask
+            )
+            encoder_output = model.get_encoder()(
+                input_ids=None,
+                attention_mask=encoder_attention_mask,
+                inputs_embeds=inputs_embeds,
+            ).last_hidden_state
+            num_beams = 2
+            decoder_input_ids = paddle.full(
+                [input_ids.shape[0] * num_beams, 1],
+                config.decoder_start_token_id,
+                dtype=input_ids.dtype,
+            )
+            beam_scorer = BeamSearchScorer(
+                batch_size=input_ids.shape[0],
+                max_length=4,
+                num_beams=num_beams,
+            )
+            generated = model.language_model.beam_search(
+                decoder_input_ids,
+                beam_scorer,
+                logits_processors=LogitsProcessorList(),
+                max_length=4,
+                diversity_rate=0.0,
+                pad_token_id=config.pad_token_id,
+                eos_token_id=config.eos_token_id,
+                encoder_output=BaseModelOutput(
+                    last_hidden_state=encoder_output.repeat_interleave(num_beams, axis=0)
+                ),
+                attention_mask=encoder_attention_mask.repeat_interleave(num_beams, axis=0),
+            )[0]
+
+        self.assertEqual(generated.shape[0], input_ids.shape[0])
+        self.assertGreaterEqual(generated.shape[1], 1)
+
+    def test_sample_generate(self):
+        config, input_ids, attention_mask, pixel_values = self._get_generation_inputs()
+        model = Florence2ForConditionalGeneration(config)
+        model.eval()
+
+        paddle.seed(1234)
+        with paddle.no_grad():
+            generated = model.generate(
+                input_ids=input_ids,
+                pixel_values=pixel_values,
+                attention_mask=attention_mask,
+                max_new_tokens=3,
+                decode_strategy="sampling",
+                top_k=10,
+            )[0]
+
+        self.assertEqual(generated.shape[0], input_ids.shape[0])
+        self.assertGreaterEqual(generated.shape[1], 1)
+
+    def test_generate_without_input_ids(self):
+        # Reported as skipped rather than passed, so the disabled check stays visible in test output.
+        self.skipTest("Florence2 needs either image-conditioned embeddings or explicit text ids.")
+
+    def test_group_beam_search_generate(self):
+        # Reported as skipped rather than passed, so the disabled check stays visible in test output.
+        self.skipTest("Group beam search coverage is not required for Florence2.")
+
+    def test_paddleformers_sft_labels(self):
+        model = Florence2ForConditionalGeneration(self.model_tester.get_config())
+        input_ids = paddle.to_tensor([[10, 11, 12, 20, 21, 2]])
+        labels = paddle.to_tensor([[-100, -100, 20, 21, 2, -100]])
+        source_ids, decoder_labels, source_mask = model._split_sft_inputs(input_ids, labels, None)
+        self.assertEqual(source_ids.tolist(), [[10, 11, 12]])
+        self.assertEqual(decoder_labels.tolist(), [[20, 21, 2]])
+        self.assertEqual(source_mask.tolist(), [[1, 1, 1]])
+
+
+class Florence2ModelIntegrationTest(ModelTesterPretrainedMixin, unittest.TestCase):
+    base_model_class = Florence2ForConditionalGeneration
+    hf_remote_test_model_path = None  # no remote checkpoint path is configured
+    paddlehub_remote_test_model_path = None  # no remote checkpoint path is configured
+
+    @unittest.skip("Florence2 tiny pretrained checkpoint is not available yet.")
+    def test_model_from_pretrained_paddle_hub(self):
+        pass
+
+    @unittest.skip("Florence2 tiny pretrained checkpoint is not available yet.")
+    def test_model_from_config_paddle_hub(self):
+        pass
+
+    @unittest.skip("Florence2 tiny pretrained checkpoint is not available yet.")
+    def test_pretrained_save_and_load(self):
+        pass
+
+
+class Florence2ModelLocalPretrainedTest(unittest.TestCase):
+    def test_local_save_load_consistency(self):  # logits must match after a save/load round trip on disk
+        config, inputs_dict = Florence2ModelTester(self).prepare_config_and_inputs_for_common()
+        model = Florence2ForConditionalGeneration(config)
+        model.eval()
+
+        with paddle.no_grad():
+            expected = model(**inputs_dict).logits
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_pretrained(tmpdirname, save_to_hf=False, save_checkpoint_format="")
+            loaded = Florence2ForConditionalGeneration.from_pretrained(
+                tmpdirname, convert_from_hf=False, load_checkpoint_format=""
+            )
+            loaded.eval()
+            with paddle.no_grad():
+                actual = loaded(**inputs_dict).logits
+
+        self.assertLessEqual(np.max(np.abs(expected.numpy() - actual.numpy())), 1e-5)  # max abs difference
+
+
+if __name__ == "__main__":  # lets this test file be run directly as a script
+    unittest.main()
diff --git a/tests/transformers/florence2/test_processor.py b/tests/transformers/florence2/test_processor.py
new file mode 100644
index 00000000000..17cc4cfbfe9
--- /dev/null
+++ b/tests/transformers/florence2/test_processor.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+
+import unittest
+
+from paddleformers.datasets.template.mm_plugin import get_mm_plugin
+from paddleformers.transformers import Florence2Processor
+
+
+class Florence2ProcessorTest(unittest.TestCase):
+    def setUp(self):
+        self.processor = object.__new__(Florence2Processor)  # bare instance: __init__ is not run
+        self.processor.task_prompts_without_inputs = {"<CAPTION>": "What does the image describe?"}
+        self.processor.task_prompts_with_input = {"<REGION_TO_DESCRIPTION>": "What does the region {input} describe?"}
+
+    def test_construct_prompts(self):
+        prompts = self.processor._construct_prompts(["<CAPTION>", "<REGION_TO_DESCRIPTION><loc_1><loc_2>"])
+        self.assertEqual(prompts[0], "What does the image describe?")
+        self.assertIn("<loc_1><loc_2>", prompts[1])
+
+    def test_post_process_detection(self):
+        result = self.processor.post_process_generation(
+            "cat<loc_0><loc_1><loc_998><loc_999>", "<OD>", (100, 200)
+        )
+        self.assertEqual(result["<OD>"]["labels"], ["cat"])
+        self.assertEqual(len(result["<OD>"]["bboxes"][0]), 4)
+
+    def test_post_process_segmentation(self):
+        result = self.processor.post_process_generation(
+            "cat<poly><loc_0><loc_1><loc_500><loc_501><loc_998><loc_999></poly>",
+            "<REFERRING_EXPRESSION_SEGMENTATION>",
+            (100, 200),
+        )
+        self.assertEqual(result["<REFERRING_EXPRESSION_SEGMENTATION>"]["labels"], ["cat"])
+        self.assertEqual(len(result["<REFERRING_EXPRESSION_SEGMENTATION>"]["polygons"][0][0]), 6)
+
+    def test_post_process_region_proposal(self):
+        result = self.processor.post_process_generation(
+            "<loc_0><loc_1><loc_998><loc_999>",
+            "<REGION_PROPOSAL>",
+            (100, 200),
+        )
+        self.assertEqual(len(result["<REGION_PROPOSAL>"]["bboxes"]), 1)
+
+    def test_sft_message_format(self):
+        self.processor.image_processor = object()  # placeholder object standing in for an image processor
+        plugin = get_mm_plugin("florence2", image_token="<image>")
+        messages = [
+            {"role": "user", "content": "<image><CAPTION>"},
+            {"role": "assistant", "content": "A solid color image."},
+        ]
+        processed = plugin.process_messages(messages, ["image.jpg"], [], [], {}, self.processor)
+        self.assertEqual(processed[0]["content"], "What does the image describe?")
+        self.assertEqual(processed[1]["content"], "A solid color image.")
+        with self.assertRaisesRegex(ValueError, "exactly one image"):  # no messages and no images
+            plugin.process_messages([], [], [], [], {}, self.processor)
+
+
+if __name__ == "__main__":  # lets this test file be run directly as a script
+    unittest.main()