PaddlePaddle · Minestar6 · Jun 17, 2026
diff --git a/README.md b/README.md
@@ -50,7 +50,7 @@ PaddleFormers 是基于百度深度学习框架 PaddlePaddle 搭建的 Transform
   <tbody>
     <!-- LLM 分类 - 跨行合并开始 -->
     <tr>
-      <td rowspan="10" style="vertical-align: top;">LLM</td>
+      <td rowspan="11" style="vertical-align: top;">LLM</td>
       <td>DeepSeekv3</td>
       <td>deepseek-ai/DeepSeek-V3-Base、deepseek-ai/DeepSeek-V3、deepseek-ai/DeepSeek-V3-0324</td>
       <td>deepseek3</td>
@@ -75,6 +75,11 @@ PaddleFormers 是基于百度深度学习框架 PaddlePaddle 搭建的 Transform
       <td>openai/gpt-oss-20b、openai/gpt-oss-120b</td>
       <td>gpt</td>
     </tr>
+    <tr>
+      <td>Seed-OSS</td>
+      <td>ByteDance-Seed/Seed-OSS-36B-Base、ByteDance-Seed/Seed-OSS-36B-Instruct</td>
+      <td>seed_oss</td>
+    </tr>
     <tr>
       <td>Llama-3</td>
       <td>meta-llama/Meta-Llama-3-8B、meta-llama/Meta-Llama-3-8B-Instruct、meta-llama/Meta-Llama-3-70B、meta-llama/Meta-Llama-3-70B-Instruct、meta-llama/Llama-3.1-8B、meta-llama/Llama-3.1-8B-Instruct、meta-llama/Llama-3.1-70B、meta-llama/Llama-3.1-70B-Instruct、meta-llama/Llama-3.1-405B、meta-llama/Llama-3.1-405B-Instruct、meta-llama/Llama-3.2-1B、meta-llama/Llama-3.2-1B-Instruct、meta-llama/Llama-3.2-3B、meta-llama/Llama-3.2-3B-Instruct、meta-llama/Llama-3.3-70B-Instruct</td>

diff --git a/docs/zh/model_capability.md b/docs/zh/model_capability.md
@@ -6,6 +6,7 @@
 |Gemma3|✓|✓|✓|✓|✓|
 |GLM-4.5|✓|✓|✓|✓|✓|
 |GPT-OSS|✓|✓|✓|x|x|
+|Seed-OSS|✓|✓|✓|x|x|
 |LLaMA3|✓|✓|✓|✓|✓|
 |Phi4|✓|✓|✓|✓|✓|
 |Qwen2|✓|✓|✓|✓|✓|
@@ -24,6 +25,7 @@
 |Gemma3|x|✓|-|x|✓|✓|
 |GLM-4.5|✓|✓|✓|✓|✓|✓|
 |GPT-OSS|✓|✓|x|x|✓|✓|
+|Seed-OSS|✓|✓|-|x|✓|✓|
 |LLaMA3|✓|✓|-|x|✓|✓|
 |Phi4|✓|✓|-|x|✓|✓|
 |Qwen2|✓|✓|x|x|✓|✓|

diff --git a/paddleformers/cli/utils/llm_utils.py b/paddleformers/cli/utils/llm_utils.py
@@ -288,6 +288,16 @@ def get_lora_target_modules(model):
             ".*up_proj.*",
             ".*down_proj.*",
         ]
+    elif model.config.model_type == "seed_oss":
+        target_modules = [
+            ".*q_proj.*",
+            ".*k_proj.*",
+            ".*v_proj.*",
+            ".*o_proj.*",
+            ".*gate_proj.*",
+            ".*up_proj.*",
+            ".*down_proj.*",
+        ]
     elif model.config.model_type == "gemma3_text":
         target_modules = [
             ".*q_proj.*",

diff --git a/paddleformers/datasets/template/template.py b/paddleformers/datasets/template/template.py
@@ -922,6 +922,26 @@ def _get_gpt_oss_prefix():
     template_class=Template,
 )
 
+register_template(
+    name="seed_oss",
+    format_user=StringFormatter(
+        slots=[
+            "<|start_of_role|>user<|end_of_role|>{{content}}<|end_of_text|>\n"
+            "<|start_of_role|>assistant<|end_of_role|>"
+        ]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}"]),
+    format_system=StringFormatter(slots=["<|start_of_role|>system<|end_of_role|>{{content}}<|end_of_text|>\n"]),
+    format_observation=StringFormatter(
+        slots=[
+            "<|start_of_role|>tool_response<|end_of_role|>{{content}}<|end_of_text|>\n"
+            "<|start_of_role|>assistant<|end_of_role|>"
+        ]
+    ),
+    chat_sep="<|end_of_text|>\n",
+    suffix=["<|end_of_text|>"],
+)
+
 register_template(
     name="llama3",
     format_user=StringFormatter(

diff --git a/paddleformers/transformers/__init__.py b/paddleformers/transformers/__init__.py
@@ -163,6 +163,16 @@
     "paddleocr_vl.processor": ["PaddleOCRVLProcessor"],
     "gpt_oss.configuration": ["GptOssConfig"],
     "gpt_oss.modeling": ["GptOssModel", "GptOssForCausalLM", "GptOssForCausalLMPipe"],
+    "seed_oss.configuration": ["SeedOssConfig"],
+    "seed_oss.modeling": [
+        "SeedOssModel",
+        "SeedOssPretrainedModel",
+        "SeedOssForCausalLM",
+        "SeedOssForCausalLMPipe",
+        "SeedOssForSequenceClassification",
+        "SeedOssForTokenClassification",
+        "SeedOssForQuestionAnswering",
+    ],
     "kimi_k25.vision_processor": ["KimiK25VisionProcessor"],
     "kimi_k25.processor": ["KimiK25Processor"],
     "kimi_k25.tokenizer": ["TikTokenTokenizer"],
@@ -415,6 +425,7 @@
     from .minimax_m2 import *
     from .deepseek_v4 import *
     from .gpt_oss import *
+    from .seed_oss import *
     from .phi3 import *
     from .gemma3_text import *
     from .glm_ocr import *

diff --git a/paddleformers/transformers/auto/configuration.py b/paddleformers/transformers/auto/configuration.py
@@ -57,6 +57,7 @@
         ("minimax_m2", "MiniMaxM2Config"),
         ("deepseek_v4", "DeepseekV4Config"),
         ("gpt_oss", "GptOssConfig"),
+        ("seed_oss", "SeedOssConfig"),
         ("phi3", "Phi3Config"),
         ("gemma3_text", "Gemma3TextConfig"),
         ("glm4v_moe", "Glm4vMoeConfig"),
@@ -88,6 +89,7 @@
         ("qwen3_vl_text", "Qwen3VL"),
         ("qwen3_vl_moe", "Qwen3VLMoe"),
         ("qwen3_vl_moe_text", "Qwen3VLMoeText"),
+        ("seed_oss", "SeedOssModel"),
         ("glm_ocr", "GlmOcrForConditionalGeneration"),
         ("qwen3_5_moe", "Qwen3_5MoEForConditionalGeneration"),
         ("qwen3_5", "Qwen3_5ForConditionalGeneration"),

diff --git a/paddleformers/transformers/auto/modeling.py b/paddleformers/transformers/auto/modeling.py
@@ -76,6 +76,7 @@
         ("MiniMaxM2", "minimax_m2"),
         ("DeepseekV4", "deepseek_v4"),
         ("GptOss", "gpt_oss"),
+        ("SeedOss", "seed_oss"),
         ("Phi3", "phi3"),
         ("Gemma3", "gemma3_text"),
         ("Glm4vMoe", "glm4v_moe"),

diff --git a/paddleformers/transformers/seed_oss/__init__.py b/paddleformers/transformers/seed_oss/__init__.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+from typing import TYPE_CHECKING
+
+from ...utils.lazy_import import _LazyModule
+
+import_structure = {  # submodule name -> public names it provides; handed to _LazyModule below
+    "configuration": ["SeedOssConfig"],
+    "modeling": [
+        "SeedOssModel",
+        "SeedOssPretrainedModel",
+        "SeedOssForCausalLM",
+        "SeedOssForCausalLMPipe",
+        "SeedOssForSequenceClassification",
+        "SeedOssForTokenClassification",
+        "SeedOssForQuestionAnswering",
+    ],
+}
+
+if TYPE_CHECKING:  # False at runtime, so only static analysers take the eager star-imports
+    from .configuration import *
+    from .modeling import *
+else:  # runtime: replace this package's sys.modules entry with a _LazyModule
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        import_structure,  # presumably resolved on first attribute access - confirm in lazy_import
+        module_spec=__spec__,
+    )
diff --git a/paddleformers/transformers/seed_oss/configuration.py b/paddleformers/transformers/seed_oss/configuration.py
@@ -0,0 +1,121 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""SeedOss model configuration."""
+
+from ..configuration_utils import PretrainedConfig
+from ..modeling_rope_utils import rope_config_validation, standardize_rope_params
+
+
+class SeedOssConfig(PretrainedConfig):
+    model_type = "seed_oss"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=155136,
+        hidden_size=4096,
+        intermediate_size=27648,
+        num_hidden_layers=64,
+        num_attention_heads=80,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=524288,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_parameters=None,
+        rope_theta=10000000.0,
+        attention_bias=True,
+        attention_out_bias=False,
+        attention_dropout=0.1,
+        residual_dropout=0.1,
+        mlp_bias=False,
+        head_dim=None,
+        use_bias=False,
+        fuse_rms_norm=False,
+        ignored_index=-100,
+        pp_seg_method="layer:SeedOssDecoderLayer",
+        dpo_config=None,
+        kto_config=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_attention_heads if num_key_value_heads is None else num_key_value_heads
+        self.hidden_act = hidden_act
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.pretraining_tp = pretraining_tp
+        self.tie_word_embeddings = tie_word_embeddings
+        self.attention_bias = attention_bias
+        self.attention_out_bias = attention_out_bias
+        self.attention_dropout = attention_dropout
+        self.residual_dropout = residual_dropout
+        self.mlp_bias = mlp_bias
+        self.head_dim = head_dim if head_dim is not None else hidden_size // num_attention_heads
+        self.use_bias = use_bias
+        self.fuse_rms_norm = fuse_rms_norm
+        self.ignored_index = ignored_index
+        self.pp_seg_method = pp_seg_method
+        self.dpo_config = dpo_config
+        self.kto_config = kto_config
+
+        self.rope_scaling = kwargs.pop("rope_scaling", None)
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+
+        if rope_parameters is None:
+            rope_parameters = self.rope_scaling
+        if rope_parameters is None:
+            rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
+        elif "rope_theta" not in rope_parameters:
+            rope_parameters = dict(rope_parameters)
+            rope_parameters["rope_theta"] = rope_theta
+
+        self.rope_parameters = rope_parameters
+        self.rope_theta = rope_parameters.get("rope_theta", rope_theta)
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+        self.register_unsavable_keys(
+            [
+                "ignored_index",
+                "pp_seg_method",
+                "dpo_config",
+                "kto_config",
+            ]
+        )
+
+        standardize_rope_params(self, rope_theta=self.rope_theta)
+        rope_config_validation(self)
+
+
+__all__ = ["SeedOssConfig"]  # names exported by a star-import of this module