Skip to content
Open

first #4681

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ PaddleFormers 是基于百度深度学习框架 PaddlePaddle 搭建的 Transform
<tbody>
<!-- LLM 分类 - 跨行合并开始 -->
<tr>
<td rowspan="10" style="vertical-align: top;">LLM</td>
<td rowspan="11" style="vertical-align: top;">LLM</td>
<td>DeepSeekv3</td>
<td>deepseek-ai/DeepSeek-V3-Base、deepseek-ai/DeepSeek-V3、deepseek-ai/DeepSeek-V3-0324</td>
<td>deepseek3</td>
Expand All @@ -75,6 +75,11 @@ PaddleFormers 是基于百度深度学习框架 PaddlePaddle 搭建的 Transform
<td>openai/gpt-oss-20b、openai/gpt-oss-120b</td>
<td>gpt</td>
</tr>
<tr>
<td>Seed-OSS</td>
<td>ByteDance-Seed/Seed-OSS-36B-Base、ByteDance-Seed/Seed-OSS-36B-Instruct</td>
<td>seed_oss</td>
</tr>
<tr>
<td>Llama-3</td>
<td>meta-llama/Meta-Llama-3-8B、meta-llama/Meta-Llama-3-8B-Instruct、meta-llama/Meta-Llama-3-70B、meta-llama/Meta-Llama-3-70B-Instruct、meta-llama/Llama-3.1-8B、meta-llama/Llama-3.1-8B-Instruct、meta-llama/Llama-3.1-70B、meta-llama/Llama-3.1-70B-Instruct、meta-llama/Llama-3.1-405B、meta-llama/Llama-3.1-405B-Instruct、meta-llama/Llama-3.2-1B、meta-llama/Llama-3.2-1B-Instruct、meta-llama/Llama-3.2-3B、meta-llama/Llama-3.2-3B-Instruct、meta-llama/Llama-3.3-70B-Instruct</td>
Expand Down
2 changes: 2 additions & 0 deletions docs/zh/model_capability.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
|Gemma3|✓|✓|✓|✓|✓|
|GLM-4.5|✓|✓|✓|✓|✓|
|GPT-OSS|✓|✓|✓|x|x|
|Seed-OSS|✓|✓|✓|x|x|
|LLaMA3|✓|✓|✓|✓|✓|
|Phi4|✓|✓|✓|✓|✓|
|Qwen2|✓|✓|✓|✓|✓|
Expand All @@ -24,6 +25,7 @@
|Gemma3|x|✓|-|x|✓|✓|
|GLM-4.5|✓|✓|✓|✓|✓|✓|
|GPT-OSS|✓|✓|x|x|✓|✓|
|Seed-OSS|✓|✓|-|x|✓|✓|
|LLaMA3|✓|✓|-|x|✓|✓|
|Phi4|✓|✓|-|x|✓|✓|
|Qwen2|✓|✓|x|x|✓|✓|
Expand Down
10 changes: 10 additions & 0 deletions paddleformers/cli/utils/llm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,16 @@ def get_lora_target_modules(model):
".*up_proj.*",
".*down_proj.*",
]
elif model.config.model_type == "seed_oss":
target_modules = [
".*q_proj.*",
".*k_proj.*",
".*v_proj.*",
".*o_proj.*",
".*gate_proj.*",
".*up_proj.*",
".*down_proj.*",
]
elif model.config.model_type == "gemma3_text":
target_modules = [
".*q_proj.*",
Expand Down
20 changes: 20 additions & 0 deletions paddleformers/datasets/template/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -922,6 +922,26 @@ def _get_gpt_oss_prefix():
template_class=Template,
)

# Register the chat template used under the name "seed_oss".
# Every non-assistant turn is rendered as
#   <|start_of_role|>ROLE<|end_of_role|>CONTENT<|end_of_text|>\n
# and the user/observation formatters additionally open the assistant turn.
# NOTE(review): these role/end markers look like another model family's chat
# format; confirm they match the special tokens and chat template shipped with
# the Seed-OSS tokenizer before relying on this template.
register_template(
name="seed_oss",
format_user=StringFormatter(
slots=[
# The two adjacent literals are concatenated by Python into ONE slot:
# the user turn followed by the opening assistant role header.
"<|start_of_role|>user<|end_of_role|>{{content}}<|end_of_text|>\n"
"<|start_of_role|>assistant<|end_of_role|>"
]
),
# Assistant content is emitted verbatim; its role header was already written
# by the preceding user/observation slot.
format_assistant=StringFormatter(slots=["{{content}}"]),
format_system=StringFormatter(slots=["<|start_of_role|>system<|end_of_role|>{{content}}<|end_of_text|>\n"]),
format_observation=StringFormatter(
slots=[
# Single concatenated slot: tool response turn plus the opening assistant
# role header, mirroring format_user.
"<|start_of_role|>tool_response<|end_of_role|>{{content}}<|end_of_text|>\n"
"<|start_of_role|>assistant<|end_of_role|>"
]
),
chat_sep="<|end_of_text|>\n",
suffix=["<|end_of_text|>"],
)

register_template(
name="llama3",
format_user=StringFormatter(
Expand Down
11 changes: 11 additions & 0 deletions paddleformers/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,16 @@
"paddleocr_vl.processor": ["PaddleOCRVLProcessor"],
"gpt_oss.configuration": ["GptOssConfig"],
"gpt_oss.modeling": ["GptOssModel", "GptOssForCausalLM", "GptOssForCausalLMPipe"],
"seed_oss.configuration": ["SeedOssConfig"],
"seed_oss.modeling": [
"SeedOssModel",
"SeedOssPretrainedModel",
"SeedOssForCausalLM",
"SeedOssForCausalLMPipe",
"SeedOssForSequenceClassification",
"SeedOssForTokenClassification",
"SeedOssForQuestionAnswering",
],
"kimi_k25.vision_processor": ["KimiK25VisionProcessor"],
"kimi_k25.processor": ["KimiK25Processor"],
"kimi_k25.tokenizer": ["TikTokenTokenizer"],
Expand Down Expand Up @@ -415,6 +425,7 @@
from .minimax_m2 import *
from .deepseek_v4 import *
from .gpt_oss import *
from .seed_oss import *
from .phi3 import *
from .gemma3_text import *
from .glm_ocr import *
Expand Down
2 changes: 2 additions & 0 deletions paddleformers/transformers/auto/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
("minimax_m2", "MiniMaxM2Config"),
("deepseek_v4", "DeepseekV4Config"),
("gpt_oss", "GptOssConfig"),
("seed_oss", "SeedOssConfig"),
("phi3", "Phi3Config"),
("gemma3_text", "Gemma3TextConfig"),
("glm4v_moe", "Glm4vMoeConfig"),
Expand Down Expand Up @@ -88,6 +89,7 @@
("qwen3_vl_text", "Qwen3VL"),
("qwen3_vl_moe", "Qwen3VLMoe"),
("qwen3_vl_moe_text", "Qwen3VLMoeText"),
("seed_oss", "SeedOssModel"),
("glm_ocr", "GlmOcrForConditionalGeneration"),
("qwen3_5_moe", "Qwen3_5MoEForConditionalGeneration"),
("qwen3_5", "Qwen3_5ForConditionalGeneration"),
Expand Down
1 change: 1 addition & 0 deletions paddleformers/transformers/auto/modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
("MiniMaxM2", "minimax_m2"),
("DeepseekV4", "deepseek_v4"),
("GptOss", "gpt_oss"),
("SeedOss", "seed_oss"),
("Phi3", "phi3"),
("Gemma3", "gemma3_text"),
("Glm4vMoe", "glm4v_moe"),
Expand Down
42 changes: 42 additions & 0 deletions paddleformers/transformers/seed_oss/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
from typing import TYPE_CHECKING

from ...utils.lazy_import import _LazyModule

# Table of submodule name -> public names exported from that submodule.
# It is handed to _LazyModule below.
import_structure = dict(
    configuration=["SeedOssConfig"],
    modeling=[
        "SeedOssModel",
        "SeedOssPretrainedModel",
        "SeedOssForCausalLM",
        "SeedOssForCausalLMPipe",
        "SeedOssForSequenceClassification",
        "SeedOssForTokenClassification",
        "SeedOssForQuestionAnswering",
    ],
)

if not TYPE_CHECKING:
    # At runtime, swap this package's entry in sys.modules for a _LazyModule
    # built from the export table above.
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        import_structure,
        module_spec=__spec__,
    )
else:
    # Static type checkers see the real symbols through eager imports.
    from .configuration import *
    from .modeling import *
121 changes: 121 additions & 0 deletions paddleformers/transformers/seed_oss/configuration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""SeedOss model configuration."""

from ..configuration_utils import PretrainedConfig
from ..modeling_rope_utils import rope_config_validation, standardize_rope_params


class SeedOssConfig(PretrainedConfig):
    """Configuration for Seed-OSS models (``model_type = "seed_oss"``).

    Unless noted below, each constructor argument is stored unchanged as an
    attribute of the same name.

    Args:
        num_key_value_heads: Falls back to ``num_attention_heads`` when ``None``.
        head_dim: Falls back to ``hidden_size // num_attention_heads`` when
            ``None``.
        rope_parameters: RoPE settings dict. When ``None``, the ``rope_scaling``
            keyword (if supplied) is used instead; when both are absent,
            ``{"rope_type": "default", "rope_theta": rope_theta}`` is used. If
            the chosen dict has no ``"rope_theta"`` key, one is added to a copy.
        rope_theta: Value used for ``rope_parameters["rope_theta"]`` when that
            key is not already present.
        pad_token_id, bos_token_id, eos_token_id: Forwarded to
            ``PretrainedConfig.__init__``.
        tie_word_embeddings: Stored on the config and also forwarded to
            ``PretrainedConfig.__init__``.
        ignored_index, pp_seg_method, dpo_config, kto_config: Stored on the
            config and passed to ``register_unsavable_keys`` (presumably so
            they are left out when the config is saved -- confirm against
            ``PretrainedConfig``).
        **kwargs: ``rope_scaling`` is consumed here; everything else is
            forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "seed_oss"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=155136,
        hidden_size=4096,
        intermediate_size=27648,
        num_hidden_layers=64,
        num_attention_heads=80,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=524288,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_parameters=None,
        rope_theta=10000000.0,
        attention_bias=True,
        attention_out_bias=False,
        attention_dropout=0.1,
        residual_dropout=0.1,
        mlp_bias=False,
        head_dim=None,
        use_bias=False,
        fuse_rms_norm=False,
        ignored_index=-100,
        pp_seg_method="layer:SeedOssDecoderLayer",
        dpo_config=None,
        kto_config=None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_attention_heads if num_key_value_heads is None else num_key_value_heads
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.pretraining_tp = pretraining_tp
        self.tie_word_embeddings = tie_word_embeddings
        self.attention_bias = attention_bias
        self.attention_out_bias = attention_out_bias
        self.attention_dropout = attention_dropout
        self.residual_dropout = residual_dropout
        self.mlp_bias = mlp_bias
        # NOTE(review): with the default hidden_size (4096) and
        # num_attention_heads (80) this fallback evaluates to 51; released
        # checkpoints presumably set head_dim explicitly -- confirm.
        self.head_dim = head_dim if head_dim is not None else hidden_size // num_attention_heads
        self.use_bias = use_bias
        self.fuse_rms_norm = fuse_rms_norm
        self.ignored_index = ignored_index
        self.pp_seg_method = pp_seg_method
        self.dpo_config = dpo_config
        self.kto_config = kto_config

        # "rope_scaling" is taken out of kwargs so it is not forwarded to the
        # base class. If it carries a "type" key, mirror it under "rope_type".
        rope_scaling = kwargs.pop("rope_scaling", None)
        if rope_scaling is not None and "type" in rope_scaling:
            # Work on a copy so the caller's dict is not modified in place,
            # matching how rope_parameters is copied before being edited below.
            rope_scaling = dict(rope_scaling)
            rope_scaling["rope_type"] = rope_scaling["type"]
        self.rope_scaling = rope_scaling

        # Resolution order: explicit rope_parameters, then rope_scaling, then
        # a default entry built from rope_theta.
        if rope_parameters is None:
            rope_parameters = self.rope_scaling
        if rope_parameters is None:
            rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
        elif "rope_theta" not in rope_parameters:
            # Copy before inserting so the supplied dict stays untouched.
            rope_parameters = dict(rope_parameters)
            rope_parameters["rope_theta"] = rope_theta

        self.rope_parameters = rope_parameters
        self.rope_theta = rope_parameters.get("rope_theta", rope_theta)

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

        self.register_unsavable_keys(
            [
                "ignored_index",
                "pp_seg_method",
                "dpo_config",
                "kto_config",
            ]
        )

        # Normalise and validate the RoPE settings after the base class has
        # been initialised.
        standardize_rope_params(self, rope_theta=self.rope_theta)
        rope_config_validation(self)


# Public API of this module: only the configuration class is exported.
__all__ = ["SeedOssConfig"]
Loading
Loading