PaddlePaddle · yicycyc · Jun 12, 2026
diff --git a/README.md b/README.md
@@ -26,7 +26,7 @@ PaddleFormers 是基于百度深度学习框架 PaddlePaddle 搭建的 Transform
 结合业界主流优化方法与飞桨在业务实践中积累的高效特性，PaddleFormers 致力于打造**高性能、低资源占用**的训练体验，帮助用户高效便捷地完成大模型训练，而无需关注底层复杂的优化细节。
 
 ## 🆕最新更新
-* 2026.03.31 - PaddleFormers v1.1 正式发布！在这个版本中我们支持了 GLM-4.5 系列模型的单步与多步 MTP 训练能力。依托 MTP 架构优势，开发者可显著提升推理效率；同时针对 MTP 模块训练场景，我们新增主干网络冻结开关，灵活满足各类模型精细化调优需求。此外，我们对视觉理解类模型进行了深度优化，Qwen3-VL 30B-A3B 模型性能相比上个版本提升48%，领先Megatron-LM 6%。
+* 2026.03.31 - PaddleFormers v1.1 正式发布！在这个版本中我们支持了 GLM-4.5 系列模型的单步与多步 MTP 训练能力。依托 MTP 架构优势，开发者可显著提升推理效率；同时针对 MTP 模块训练场景，我们新增主干网络冻结开关，灵活满足各类模型精细化调优需求。此外，我们对视觉理解类模型进行了深度优化，Qwen3-VL 30B-A3B 模型性能相比上个版本提升48%，领先 Megatron-LM 6%。
 * 2026.01.21 - PaddleFomers v1.0版本发布啦！我们提供了针对 LLM 和 VLM 等模型的训练能力，针对 DeepSeek-V3模型和 GLM-4.5-Air 等重点模型，我们实现了极致性能优化（训练性能明显超越 Megatron-LM ）。针对 PaddleOCR-VL，我们在昆仑芯 P800、天数天垓150等国产计算芯片上进行了适配，更好的满足国内用户需求。
 
 ## ✨特性
@@ -50,7 +50,7 @@ PaddleFormers 是基于百度深度学习框架 PaddlePaddle 搭建的 Transform
   <tbody>
     <!-- LLM 分类 - 跨行合并开始 -->
     <tr>
-      <td rowspan="10" style="vertical-align: top;">LLM</td>
+      <td rowspan="11" style="vertical-align: top;">LLM</td>
       <td>DeepSeekv3</td>
       <td>deepseek-ai/DeepSeek-V3-Base、deepseek-ai/DeepSeek-V3、deepseek-ai/DeepSeek-V3-0324</td>
       <td>deepseek3</td>
@@ -80,6 +80,11 @@ PaddleFormers 是基于百度深度学习框架 PaddlePaddle 搭建的 Transform
       <td>meta-llama/Meta-Llama-3-8B、meta-llama/Meta-Llama-3-8B-Instruct、meta-llama/Meta-Llama-3-70B、meta-llama/Meta-Llama-3-70B-Instruct、meta-llama/Llama-3.1-8B、meta-llama/Llama-3.1-8B-Instruct、meta-llama/Llama-3.1-70B、meta-llama/Llama-3.1-70B-Instruct、meta-llama/Llama-3.1-405B、meta-llama/Llama-3.1-405B-Instruct、meta-llama/Llama-3.2-1B、meta-llama/Llama-3.2-1B-Instruct、meta-llama/Llama-3.2-3B、meta-llama/Llama-3.2-3B-Instruct、meta-llama/Llama-3.3-70B-Instruct</td>
       <td>llama3</td>
     </tr>
+    <tr>
+      <td>MiniMax-Text-01</td>
+      <td>MiniMaxAI/MiniMax-Text-01</td>
+      <td>minimax</td>
+    </tr>
     <tr>
       <td>phi-4</td>
       <td>microsoft/phi-4</td>

diff --git a/docs/zh/model_capability.md b/docs/zh/model_capability.md
@@ -7,6 +7,7 @@
 |GLM-4.5|✓|✓|✓|✓|✓|
 |GPT-OSS|✓|✓|✓|x|x|
 |LLaMA3|✓|✓|✓|✓|✓|
+|MiniMax-Text-01|✓|✓|✓|x|x|
 |Phi4|✓|✓|✓|✓|✓|
 |Qwen2|✓|✓|✓|✓|✓|
 |Qwen3|✓|✓|✓|✓|✓|
@@ -25,6 +26,7 @@
 |GLM-4.5|✓|✓|✓|✓|✓|✓|
 |GPT-OSS|✓|✓|x|x|✓|✓|
 |LLaMA3|✓|✓|-|x|✓|✓|
+|MiniMax-Text-01|x|x|x|x|✓|✓|
 |Phi4|✓|✓|-|x|✓|✓|
 |Qwen2|✓|✓|x|x|✓|✓|
 |Qwen3|✓|✓|✓|✓|✓|✓|

diff --git a/paddleformers/datasets/template/template.py b/paddleformers/datasets/template/template.py
@@ -638,6 +638,22 @@ def get_template_and_fix_tokenizer(dataset_config) -> "Template":
     thought_words=("<think>\n", "\n</think>\n\n"),
 )
 
+register_template(
+    name="minimax",
+    format_user=StringFormatter(
+        slots=[
+            "<beginning_of_sentence>user name=user\n{{content}}<end_of_sentence>\n"
+            "<beginning_of_sentence>ai name=assistant\n"
+        ]
+    ),
+    format_assistant=StringFormatter(slots=["{{content}}<end_of_sentence>"]),
+    format_system=StringFormatter(
+        slots=["<beginning_of_sentence>system ai_setting=assistant\n{{content}}<end_of_sentence>\n"]
+    ),
+    chat_sep="<end_of_sentence>\n",
+    suffix=["<end_of_sentence>"],
+)
+
 register_template(
     name="paddleocr_vl",
     format_user=StringFormatter(slots=["User: {{content}}\nAssistant: "]),

diff --git a/paddleformers/transformers/__init__.py b/paddleformers/transformers/__init__.py
@@ -313,6 +313,8 @@
     "minimax_m2": ["MiniMaxM2ForCausalLMPipe", "MiniMaxM2ForCausalLM"],
     "deepseek_v4.configuration": ["DeepseekV4Config"],
     "deepseek_v4": ["DeepseekV4ForCausalLMPipe", "DeepseekV4ForCausalLM"],
+    "minimax.configuration": ["MiniMaxConfig"],
+    "minimax": ["MiniMaxModel", "MiniMaxForCausalLM", "MiniMaxForCausalLMPipe"],
     "glm4v_moe.image_processor": ["Glm4vImageProcessor"],
     "glm4v_moe.image_processor_fast": ["Glm4vImageProcessorFast"],
     "auto": ["AutoModelForCausalLM"],
@@ -414,6 +416,7 @@
     from .glm_moe_dsa import *
     from .minimax_m2 import *
     from .deepseek_v4 import *
+    from .minimax import *
     from .gpt_oss import *
     from .phi3 import *
     from .gemma3_text import *

diff --git a/paddleformers/transformers/auto/configuration.py b/paddleformers/transformers/auto/configuration.py
@@ -54,6 +54,7 @@
         ("qwen3_vl_moe_text", "Qwen3VLMoeTextConfig"),
         ("glm4_moe", "Glm4MoeConfig"),
         ("glm_moe_dsa", "GlmMoeDsaConfig"),
+        ("minimax", "MiniMaxConfig"),
         ("minimax_m2", "MiniMaxM2Config"),
         ("deepseek_v4", "DeepseekV4Config"),
         ("gpt_oss", "GptOssConfig"),
@@ -89,6 +90,7 @@
         ("qwen3_vl_moe", "Qwen3VLMoe"),
         ("qwen3_vl_moe_text", "Qwen3VLMoeText"),
         ("glm_ocr", "GlmOcrForConditionalGeneration"),
+        ("minimax", "MiniMaxForCausalLM"),
         ("qwen3_5_moe", "Qwen3_5MoEForConditionalGeneration"),
         ("qwen3_5", "Qwen3_5ForConditionalGeneration"),
     ]

diff --git a/paddleformers/transformers/auto/modeling.py b/paddleformers/transformers/auto/modeling.py
@@ -73,6 +73,7 @@
         ("Qwen3_5", "qwen3_5"),
         ("Glm4Moe", "glm4_moe"),
         ("GlmMoeDsa", "glm_moe_dsa"),
+        ("MiniMax", "minimax"),
         ("MiniMaxM2", "minimax_m2"),
         ("DeepseekV4", "deepseek_v4"),
         ("GptOss", "gpt_oss"),

diff --git a/paddleformers/transformers/minimax/__init__.py b/paddleformers/transformers/minimax/__init__.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Package"""
+import sys
+from typing import TYPE_CHECKING
+
+from ...utils.lazy_import import _LazyModule
+
+import_structure = {
+    "configuration": ["MiniMaxConfig"],
+    "modeling": [
+        "MiniMaxPretrainedModel",
+        "MiniMaxModel",
+        "MiniMaxForCausalLM",
+        "MiniMaxForCausalLMPipe",
+    ],
+}
+
+if TYPE_CHECKING:
+    from .configuration import *
+    from .modeling import *
+else:
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        import_structure,
+        module_spec=__spec__,
+    )
diff --git a/paddleformers/transformers/minimax/configuration.py b/paddleformers/transformers/minimax/configuration.py
@@ -0,0 +1,224 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" MiniMax (Text-01) model configuration"""
+
+from ..configuration_utils import PretrainedConfig
+from ..modeling_rope_utils import rope_config_validation, standardize_rope_params
+
+
+class MiniMaxConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MiniMaxModel`]. It is used to instantiate a
+    MiniMax (Text-01) model according to the specified arguments, defining the model architecture.
+
+    MiniMax (Text-01) uses a hybrid attention mechanism:
+    - Some layers are full attention (standard causal self-attention with RoPE)
+    - Some layers are linear attention ("lightning attention") with intra-/inter-block attention
+
+    The layer type is controlled by `layer_types`. By default, the pattern is alternating
+    `full_attention` and `linear_attention` (full on layers 0, 2, 4, ...; linear on layers 1, 3, 5, ...).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 200064):
+            Vocabulary size of the MiniMax model.
+        hidden_size (`int`, *optional*, defaults to 6144):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 9216):
+            Dimension of the routed expert MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 80):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 64):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            Number of key_value heads for implementing Grouped Query Attention.
+        head_dim (`int`, *optional*):
+            Dimension of each attention head. Stored as given; `None` is not resolved to a default by this config.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 10240000):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 10000000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        num_experts_per_tok (`int`, *optional*, defaults to 2):
+            Number of selected experts per token.
+        num_local_experts (`int`, *optional*, defaults to 32):
+            Number of routed experts.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether the router logits should be returned by the model.
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            Coefficient for the auxiliary load-balancing loss.
+        router_jitter_noise (`float`, *optional*, defaults to 0.0):
+            Jitter noise for the router.
+        layer_types (`list[str]`, *optional*):
+            A list that maps each layer index to its attention type. Can be `"full_attention"` or `"linear_attention"`.
+        block_size (`int`, *optional*, defaults to 256):
+            The length of each attention block for the lightning attention.
+        full_attn_alpha_factor (`float`, *optional*, defaults to 1):
+            Weight for residual value in residual connection after full attention.
+        full_attn_beta_factor (`float`, *optional*, defaults to 1):
+            Weight for hidden state value in residual connection after full attention.
+        linear_attn_alpha_factor (`float`, *optional*, defaults to 1):
+            Weight for residual value in residual connection after lightning attention.
+        linear_attn_beta_factor (`float`, *optional*, defaults to 1):
+            Weight for hidden state value in residual connection after lightning attention.
+        mlp_alpha_factor (`float`, *optional*, defaults to 1):
+            Weight for residual value in residual connection after MLP.
+        mlp_beta_factor (`float`, *optional*, defaults to 1):
+            Weight for hidden state value in residual connection after MLP.
+
+    ```python
+    >>> from paddleformers.transformers import MiniMaxModel, MiniMaxConfig
+
+    >>> # Initializing a MiniMax (Text-01) style configuration
+    >>> configuration = MiniMaxConfig()
+
+    >>> # Initializing a model from the MiniMax (Text-01) style configuration
+    >>> model = MiniMaxModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "minimax"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=200064,
+        hidden_size=6144,
+        intermediate_size=9216,
+        num_hidden_layers=80,
+        num_attention_heads=64,
+        num_key_value_heads=8,
+        head_dim=None,
+        hidden_act="silu",
+        max_position_embeddings=10240000,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=None,
+        eos_token_id=None,
+        tie_word_embeddings=False,
+        sliding_window=None,
+        attention_dropout=0.0,
+        num_experts_per_tok=2,
+        num_local_experts=32,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        router_jitter_noise=0.0,
+        attn_type_list=None,
+        rope_theta=10000000.0,
+        rope_scaling=None,
+        layer_types=None,
+        block_size=256,
+        full_attn_alpha_factor=1.0,
+        full_attn_beta_factor=1.0,
+        linear_attn_alpha_factor=1.0,
+        linear_attn_beta_factor=1.0,
+        mlp_alpha_factor=1.0,
+        mlp_beta_factor=1.0,
+        **kwargs,
+    ):
+        full_attn_alpha_factor = kwargs.pop("layernorm_full_attention_alpha", full_attn_alpha_factor)
+        full_attn_beta_factor = kwargs.pop("layernorm_full_attention_beta", full_attn_beta_factor)
+        linear_attn_alpha_factor = kwargs.pop("layernorm_linear_attention_alpha", linear_attn_alpha_factor)
+        linear_attn_beta_factor = kwargs.pop("layernorm_linear_attention_beta", linear_attn_beta_factor)
+        mlp_alpha_factor = kwargs.pop("layernorm_mlp_alpha", mlp_alpha_factor)
+        mlp_beta_factor = kwargs.pop("layernorm_mlp_beta", mlp_beta_factor)
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.attention_dropout = attention_dropout
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        self.rope_parameters = self.rope_scaling
+        standardize_rope_params(self, rope_theta=rope_theta)
+        rope_config_validation(self)
+
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_local_experts = num_local_experts
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.router_jitter_noise = router_jitter_noise
+
+        if layer_types is None:
+            if attn_type_list is not None:
+                if len(attn_type_list) != num_hidden_layers:
+                    raise ValueError(
+                        f"attn_type_list length ({len(attn_type_list)}) must equal "
+                        f"num_hidden_layers ({num_hidden_layers})."
+                    )
+                self.layer_types = [
+                    "linear_attention" if int(attn_type) == 0 else "full_attention" for attn_type in attn_type_list
+                ]
+            else:
+                self.layer_types = [
+                    "full_attention" if bool((i + 1) % 2) else "linear_attention"
+                    for i in range(self.num_hidden_layers)
+                ]
+        else:
+            if len(layer_types) != num_hidden_layers:
+                raise ValueError(
+                    f"layer_types length ({len(layer_types)}) must equal num_hidden_layers ({num_hidden_layers})."
+                )
+            self.layer_types = list(layer_types)
+        self.attn_type_list = [0 if layer_type == "linear_attention" else 1 for layer_type in self.layer_types]
+        self.block_size = block_size
+        self.full_attn_alpha_factor = full_attn_alpha_factor
+        self.full_attn_beta_factor = full_attn_beta_factor
+        self.linear_attn_alpha_factor = linear_attn_alpha_factor
+        self.linear_attn_beta_factor = linear_attn_beta_factor
+        self.mlp_alpha_factor = mlp_alpha_factor
+        self.mlp_beta_factor = mlp_beta_factor
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            sliding_window=sliding_window,
+            **kwargs,
+        )
+
+
+__all__ = ["MiniMaxConfig"]