Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/model-unittest-gpu-ce-develop.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ on:
- 'qwen2'
- 'gemma3_text'
- 'paddleocr_vl'
- 'florence2'
FLAGS_enable_CE:
required: false
default: 'CE_Develop_cu130_py312'
Expand Down Expand Up @@ -455,4 +456,4 @@ jobs:
echo "| Workflow | ${{ github.workflow }} |" >> $GITHUB_STEP_SUMMARY
echo "| CE Mode | $MODE_$FLAGS_enable_CE |" >> $GITHUB_STEP_SUMMARY
echo "| Time | $(date +%Y%m%d) |" >> $GITHUB_STEP_SUMMARY
echo "| Report | [Open Report](${PAGES_URL}) |" >> $GITHUB_STEP_SUMMARY
echo "| Report | [Open Report](${PAGES_URL}) |" >> $GITHUB_STEP_SUMMARY
3 changes: 2 additions & 1 deletion .github/workflows/model-unittest-gpu-ce-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ on:
- 'qwen2'
- 'gemma3_text'
- 'paddleocr_vl'
- 'florence2'
FLAGS_enable_CE:
required: false
default: 'CE_Release_cu129_py312_nightly'
Expand Down Expand Up @@ -483,4 +484,4 @@ jobs:
echo "| Workflow | ${{ github.workflow }} |" >> $GITHUB_STEP_SUMMARY
echo "| CE Mode | $MODE_$FLAGS_enable_CE |" >> $GITHUB_STEP_SUMMARY
echo "| Time | $(date +%Y%m%d) |" >> $GITHUB_STEP_SUMMARY
echo "| Report | [Open Report](${PAGES_URL}) |" >> $GITHUB_STEP_SUMMARY
echo "| Report | [Open Report](${PAGES_URL}) |" >> $GITHUB_STEP_SUMMARY
3 changes: 2 additions & 1 deletion .github/workflows/model-unittest-gpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ on:
- 'qwen2'
- 'gemma3_text'
- 'paddleocr_vl'
- 'florence2'
- 'qwen2_moe'
- 'qwen3_vl'
- 'qwen3_vl_moe'
Expand Down Expand Up @@ -488,4 +489,4 @@ jobs:
else:
res = gh("POST", f"{base}/issues/{pr_number}/comments", {"body": comment})
print(f"Created comment: {res.get('html_url')}")
PYEOF
PYEOF
48 changes: 48 additions & 0 deletions examples/config/sft-vl/florence2_full_300_steps.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Florence-2 full-parameter VL-SFT example: a short 300-step run on a single
# device with no intermediate checkpoints (save_strategy is "no").

### data
# JSONL example:
# {"messages":[{"role":"user","content":"<image><CAPTION>"},{"role":"assistant","content":"A cat."}],
# "images":["/path/to/image.jpg"]}
train_dataset_type: messages
eval_dataset_type: messages
train_dataset_path: ./florence2_train.jsonl
train_dataset_prob: "1.0"
# Evaluation is disabled below (do_eval: false); the eval path simply reuses the training file.
eval_dataset_path: ./florence2_train.jsonl
eval_dataset_prob: "1.0"
max_seq_len: 1024
packing: false
mix_strategy: concat
template_backend: custom
template: florence2

### model
# Hub model id; replace with a local directory containing the Florence-2
# checkpoint if the weights are already downloaded.
model_name_or_path: microsoft/Florence-2-base
continue_training: true

### finetuning
stage: VL-SFT
fine_tuning: full
seed: 23
do_train: true
do_eval: false
per_device_train_batch_size: 1
max_steps: 300
save_strategy: "no"
logging_steps: 1
gradient_accumulation_steps: 1
output_dir: ./checkpoints/florence2-sft-full
disable_tqdm: true

### train
warmup_steps: 0
learning_rate: 1.0e-5

### performance
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
bf16: true
fp16: false
fp16_opt_level: O1
convert_from_hf: true
unified_checkpoint: false
save_checkpoint_format: "flex_checkpoint"
load_checkpoint_format: ""
29 changes: 29 additions & 0 deletions paddleformers/datasets/template/mm_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,34 @@ def get_mm_inputs(
return self._get_mm_inputs(images, videos, audios, processor, **kwargs)


@dataclass
class Florence2Plugin(BasePlugin):
    """Multimodal plugin for Florence-2 prompts (one image, no video or audio)."""

    @override
    def process_messages(
        self,
        messages,
        images,
        videos,
        audios,
        mm_inputs,
        processor,
    ):
        """Return a copy of ``messages`` with image placeholders removed.

        The inputs are first checked with the base-class validators. Every
        message then has ``IMAGE_PLACEHOLDER`` removed and surrounding
        whitespace stripped. User messages are additionally passed through
        ``processor._construct_prompts`` when the processor provides it
        (presumably this expands task tokens such as ``<CAPTION>`` into the
        full task prompt -- confirm against the Florence-2 processor).

        ``mm_inputs`` is accepted for interface compatibility and is not used.

        Raises:
            ValueError: if any video or audio is supplied, or if the number of
                images is not exactly one.
        """
        self._validate_input(processor, images, videos, audios)
        self._validate_messages(messages, images, videos, audios)

        if videos or audios:
            raise ValueError("Florence-2 only supports image inputs.")
        if len(images) != 1:
            raise ValueError("Florence-2 supports exactly one image per prompt.")

        # Work on a deep copy so the caller's message dicts are left untouched.
        rewritten = deepcopy(messages)
        for turn in rewritten:
            text = turn["content"].replace(IMAGE_PLACEHOLDER, "")
            text = text.strip()
            is_user_turn = turn["role"] == "user"
            if is_user_turn and hasattr(processor, "_construct_prompts"):
                # The processor works on batches: wrap the text, take the single result.
                text = processor._construct_prompts([text])[0]
            turn["content"] = text
        return rewritten


@dataclass
class PaddleOCRVLPlugin(BasePlugin):
image_bos_token: str = "<|IMAGE_START|>"
Expand Down Expand Up @@ -1496,6 +1524,7 @@ def process_messages(

PLUGINS = {
"base": BasePlugin,
"florence2": Florence2Plugin,
"ernie_vl": ErnieVLPlugin,
"qwen2_vl": Qwen2VLPlugin,
"paddleocr_vl": PaddleOCRVLPlugin,
Expand Down
10 changes: 10 additions & 0 deletions paddleformers/datasets/template/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,16 @@ def get_template_and_fix_tokenizer(dataset_config) -> "Template":
mm_plugin=get_mm_plugin(name="paddleocr_vl", image_token="<|IMAGE_PLACEHOLDER|>"),
)

# Florence-2 template: user, assistant and system content are emitted verbatim
# (the only slot is "{{content}}", with no role markers around it) and the
# prefix formatter has no slots. Image handling is delegated to the "florence2"
# mm plugin, with "<image>" as its image token.
# NOTE(review): "</s>" is presumably the end-of-sequence marker appended after
# the assistant reply -- confirm against how `suffix` is consumed.
register_template(
name="florence2",
format_user=StringFormatter(slots=["{{content}}"]),
format_assistant=StringFormatter(slots=["{{content}}"]),
format_system=StringFormatter(slots=["{{content}}"]),
format_prefix=EmptyFormatter(slots=[]),
suffix=["</s>"],
mm_plugin=get_mm_plugin(name="florence2", image_token="<image>"),
)

# copied from chatml template
register_template(
name="qwen",
Expand Down
9 changes: 9 additions & 0 deletions paddleformers/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,14 @@
"qwen2_vl.processor": ["Qwen2VLProcessor"],
"qwen2_vl.video_processor": ["Qwen2VLVideoProcessor"],
"qwen2_vl.vision_process": ["process_vision_info"],
"florence2.configuration": ["Florence2Config", "Florence2LanguageConfig", "Florence2VisionConfig"],
"florence2.image_processor": ["Florence2ImageProcessor"],
"florence2.modeling": [
"Florence2ForConditionalGeneration",
"Florence2LanguageForConditionalGeneration",
"Florence2VisionModel",
],
"florence2.processor": ["Florence2Processor"],
"qwen3.configuration": ["Qwen3Config"],
"qwen3.modeling": [
"Qwen3Model",
Expand Down Expand Up @@ -290,6 +298,7 @@
"llama": [],
"qwen2": [],
"glm_ocr": [],
"florence2": [],
"qwen3": [],
"deepseek_v3": [],
"ernie4_5": ["Ernie4_5DecoderLayer", "Ernie4_5Model", "Ernie4_5_ForCausalLM"],
Expand Down
2 changes: 2 additions & 0 deletions paddleformers/transformers/auto/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
("gpt_oss", "GptOssConfig"),
("phi3", "Phi3Config"),
("gemma3_text", "Gemma3TextConfig"),
("florence2", "Florence2Config"),
("glm4v_moe", "Glm4vMoeConfig"),
("glm_ocr", "GlmOcrConfig"),
("qwen3_5", "Qwen3_5Config"),
Expand Down Expand Up @@ -88,6 +89,7 @@
("qwen3_vl_text", "Qwen3VL"),
("qwen3_vl_moe", "Qwen3VLMoe"),
("qwen3_vl_moe_text", "Qwen3VLMoeText"),
("florence2", "Florence2ForConditionalGeneration"),
("glm_ocr", "GlmOcrForConditionalGeneration"),
("qwen3_5_moe", "Qwen3_5MoEForConditionalGeneration"),
("qwen3_5", "Qwen3_5ForConditionalGeneration"),
Expand Down
1 change: 1 addition & 0 deletions paddleformers/transformers/auto/image_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
"glm4v_moe": ("Glm4vImageProcessor", "Glm4vImageProcessorFast"),
"kimi_k25": ("KimiK25VisionProcessor"),
"paddleocr_vl": ("PaddleOCRVLImageProcessor"),
"florence2": ("Florence2ImageProcessor"),
"qwen2_5_vl": ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast"),
"qwen2_vl": ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast"),
"qwen3_vl": ("Qwen3VLImageProcessor", "Qwen3VLImageProcessorFast"),
Expand Down
1 change: 1 addition & 0 deletions paddleformers/transformers/auto/modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@
("GptOss", "gpt_oss"),
("Phi3", "phi3"),
("Gemma3", "gemma3_text"),
("Florence2", "florence2"),
("Glm4vMoe", "glm4v_moe"),
("GlmOcr", "glm_ocr"),
]
Expand Down
1 change: 1 addition & 0 deletions paddleformers/transformers/auto/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
("qwen2_vl", "Qwen2VLProcessor"),
("qwen3_omni_moe", "Qwen3OmniMoeProcessor"),
("paddleocr_vl", "PaddleOCRVLProcessor"),
("florence2", "Florence2Processor"),
("ernie4_5_moe_vl", "Ernie4_5_VLProcessor"),
("glm4v_moe", "Glm4vProcessor"),
("glm_ocr", "Glm46VProcessor"),
Expand Down
28 changes: 28 additions & 0 deletions paddleformers/transformers/florence2/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");

import sys
from typing import TYPE_CHECKING

from ...utils.lazy_import import _LazyModule

# Maps each submodule of this package to the public names it provides; passed
# to _LazyModule below.
import_structure = {
"configuration": ["Florence2Config", "Florence2LanguageConfig", "Florence2VisionConfig"],
"image_processor": ["Florence2ImageProcessor"],
"modeling": [
"Florence2ForConditionalGeneration",
"Florence2LanguageForConditionalGeneration",
"Florence2VisionModel",
],
"processor": ["Florence2Processor"],
}

if TYPE_CHECKING:
    # Static type checkers take this branch and see the real symbols.
    from .configuration import *
    from .image_processor import *
    from .modeling import *
    from .processor import *
else:
    # At runtime, replace this module's entry in sys.modules with a _LazyModule
    # built from import_structure.
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], import_structure, module_spec=__spec__)

Loading
Loading