Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ PaddleFormers 是基于百度深度学习框架 PaddlePaddle 搭建的 Transform
结合业界主流优化方法与飞桨在业务实践中积累的高效特性,PaddleFormers 致力于打造**高性能、低资源占用**的训练体验,帮助用户高效便捷地完成大模型训练,而无需关注底层复杂的优化细节。

## 🆕最新更新
* 2026.03.31 - PaddleFormers v1.1 正式发布!在这个版本中我们支持了 GLM-4.5 系列模型的单步与多步 MTP 训练能力。依托 MTP 架构优势,开发者可显著提升推理效率;同时针对 MTP 模块训练场景,我们新增主干网络冻结开关,灵活满足各类模型精细化调优需求。此外,我们对视觉理解类模型进行了深度优化,Qwen3-VL 30B-A3B 模型性能相比上个版本提升48%,领先Megatron-LM 6%。
* 2026.03.31 - PaddleFormers v1.1 正式发布!在这个版本中我们支持了 GLM-4.5 系列模型的单步与多步 MTP 训练能力。依托 MTP 架构优势,开发者可显著提升推理效率;同时针对 MTP 模块训练场景,我们新增主干网络冻结开关,灵活满足各类模型精细化调优需求。此外,我们对视觉理解类模型进行了深度优化,Qwen3-VL 30B-A3B 模型性能相比上个版本提升48%,领先 Megatron-LM 6%。
* 2026.01.21 - PaddleFormers v1.0版本发布啦!我们提供了针对 LLM 和 VLM 等模型的训练能力,针对 DeepSeek-V3模型和 GLM-4.5-Air 等重点模型,我们实现了极致性能优化(训练性能明显超越 Megatron-LM )。针对 PaddleOCR-VL,我们在昆仑芯 P800、天数天垓150等国产计算芯片上进行了适配,更好地满足国内用户需求。

## ✨特性
Expand All @@ -50,7 +50,7 @@ PaddleFormers 是基于百度深度学习框架 PaddlePaddle 搭建的 Transform
<tbody>
<!-- LLM 分类 - 跨行合并开始 -->
<tr>
<td rowspan="10" style="vertical-align: top;">LLM</td>
<td rowspan="11" style="vertical-align: top;">LLM</td>
<td>DeepSeekv3</td>
<td>deepseek-ai/DeepSeek-V3-Base、deepseek-ai/DeepSeek-V3、deepseek-ai/DeepSeek-V3-0324</td>
<td>deepseek3</td>
Expand Down Expand Up @@ -80,6 +80,11 @@ PaddleFormers 是基于百度深度学习框架 PaddlePaddle 搭建的 Transform
<td>meta-llama/Meta-Llama-3-8B、meta-llama/Meta-Llama-3-8B-Instruct、meta-llama/Meta-Llama-3-70B、meta-llama/Meta-Llama-3-70B-Instruct、meta-llama/Llama-3.1-8B、meta-llama/Llama-3.1-8B-Instruct、meta-llama/Llama-3.1-70B、meta-llama/Llama-3.1-70B-Instruct、meta-llama/Llama-3.1-405B、meta-llama/Llama-3.1-405B-Instruct、meta-llama/Llama-3.2-1B、meta-llama/Llama-3.2-1B-Instruct、meta-llama/Llama-3.2-3B、meta-llama/Llama-3.2-3B-Instruct、meta-llama/Llama-3.3-70B-Instruct</td>
<td>llama3</td>
</tr>
<tr>
<td>MiniMax-Text-01</td>
<td>MiniMaxAI/MiniMax-Text-01</td>
<td>minimax</td>
</tr>
<tr>
<td>phi-4</td>
<td>microsoft/phi-4</td>
Expand Down
2 changes: 2 additions & 0 deletions docs/zh/model_capability.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
|GLM-4.5|✓|✓|✓|✓|✓|
|GPT-OSS|✓|✓|✓|x|x|
|LLaMA3|✓|✓|✓|✓|✓|
|MiniMax-Text-01|✓|✓|✓|x|x|
|Phi4|✓|✓|✓|✓|✓|
|Qwen2|✓|✓|✓|✓|✓|
|Qwen3|✓|✓|✓|✓|✓|
Expand All @@ -25,6 +26,7 @@
|GLM-4.5|✓|✓|✓|✓|✓|✓|
|GPT-OSS|✓|✓|x|x|✓|✓|
|LLaMA3|✓|✓|-|x|✓|✓|
|MiniMax-Text-01|x|x|x|x|✓|✓|
|Phi4|✓|✓|-|x|✓|✓|
|Qwen2|✓|✓|x|x|✓|✓|
|Qwen3|✓|✓|✓|✓|✓|✓|
Expand Down
16 changes: 16 additions & 0 deletions paddleformers/datasets/template/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -638,6 +638,22 @@ def get_template_and_fix_tokenizer(dataset_config) -> "Template":
thought_words=("<think>\n", "\n</think>\n\n"),
)

# Chat template for MiniMax-Text-01, registered under the name "minimax".
# Every turn is framed as
#   "<beginning_of_sentence>{role header}\n{content}<end_of_sentence>".
register_template(
name="minimax",
format_user=StringFormatter(
slots=[
# The two adjacent literals below are implicitly concatenated into ONE slot:
# the user turn is immediately followed by the assistant header, so the
# assistant text produced by format_assistant continues right after it.
"<beginning_of_sentence>user name=user\n{{content}}<end_of_sentence>\n"
"<beginning_of_sentence>ai name=assistant\n"
]
),
# The assistant turn only contributes its content plus the closing token; its
# header was already emitted at the end of the user slot above.
format_assistant=StringFormatter(slots=["{{content}}<end_of_sentence>"]),
format_system=StringFormatter(
slots=["<beginning_of_sentence>system ai_setting=assistant\n{{content}}<end_of_sentence>\n"]
),
# NOTE(review): format_assistant already ends with "<end_of_sentence>" and both
# chat_sep and suffix start with the same token -- confirm against
# register_template/Template that multi-turn rendering does not emit it twice.
chat_sep="<end_of_sentence>\n",
suffix=["<end_of_sentence>"],
)

register_template(
name="paddleocr_vl",
format_user=StringFormatter(slots=["User: {{content}}\nAssistant: "]),
Expand Down
3 changes: 3 additions & 0 deletions paddleformers/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,8 @@
"minimax_m2": ["MiniMaxM2ForCausalLMPipe", "MiniMaxM2ForCausalLM"],
"deepseek_v4.configuration": ["DeepseekV4Config"],
"deepseek_v4": ["DeepseekV4ForCausalLMPipe", "DeepseekV4ForCausalLM"],
"minimax.configuration": ["MiniMaxConfig"],
"minimax": ["MiniMaxModel", "MiniMaxForCausalLM", "MiniMaxForCausalLMPipe"],
"glm4v_moe.image_processor": ["Glm4vImageProcessor"],
"glm4v_moe.image_processor_fast": ["Glm4vImageProcessorFast"],
"auto": ["AutoModelForCausalLM"],
Expand Down Expand Up @@ -414,6 +416,7 @@
from .glm_moe_dsa import *
from .minimax_m2 import *
from .deepseek_v4 import *
from .minimax import *
from .gpt_oss import *
from .phi3 import *
from .gemma3_text import *
Expand Down
2 changes: 2 additions & 0 deletions paddleformers/transformers/auto/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
("qwen3_vl_moe_text", "Qwen3VLMoeTextConfig"),
("glm4_moe", "Glm4MoeConfig"),
("glm_moe_dsa", "GlmMoeDsaConfig"),
("minimax", "MiniMaxConfig"),
("minimax_m2", "MiniMaxM2Config"),
("deepseek_v4", "DeepseekV4Config"),
("gpt_oss", "GptOssConfig"),
Expand Down Expand Up @@ -89,6 +90,7 @@
("qwen3_vl_moe", "Qwen3VLMoe"),
("qwen3_vl_moe_text", "Qwen3VLMoeText"),
("glm_ocr", "GlmOcrForConditionalGeneration"),
("minimax", "MiniMaxForCausalLM"),
("qwen3_5_moe", "Qwen3_5MoEForConditionalGeneration"),
("qwen3_5", "Qwen3_5ForConditionalGeneration"),
]
Expand Down
1 change: 1 addition & 0 deletions paddleformers/transformers/auto/modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
("Qwen3_5", "qwen3_5"),
("Glm4Moe", "glm4_moe"),
("GlmMoeDsa", "glm_moe_dsa"),
("MiniMax", "minimax"),
("MiniMaxM2", "minimax_m2"),
("DeepseekV4", "deepseek_v4"),
("GptOss", "gpt_oss"),
Expand Down
39 changes: 39 additions & 0 deletions paddleformers/transformers/minimax/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Package"""
import sys
from typing import TYPE_CHECKING

from ...utils.lazy_import import _LazyModule

# Maps each submodule of this package to the public names it provides.
# The mapping is handed to _LazyModule below.
import_structure = {
    "configuration": ["MiniMaxConfig"],
    "modeling": [
        "MiniMaxPretrainedModel",
        "MiniMaxModel",
        "MiniMaxForCausalLM",
        "MiniMaxForCausalLMPipe",
    ],
}

if TYPE_CHECKING:
    # Only evaluated by static type checkers: import everything eagerly so the
    # exported names are visible to them.
    from .configuration import *
    from .modeling import *
else:
    # At runtime, replace this package's entry in sys.modules with a _LazyModule
    # built from import_structure.
    # NOTE(review): presumably this defers importing the submodules until one of
    # the listed names is first accessed -- confirm against utils.lazy_import.
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        import_structure,
        module_spec=__spec__,
    )
224 changes: 224 additions & 0 deletions paddleformers/transformers/minimax/configuration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" MiniMax (Text-01) model configuration"""

from ..configuration_utils import PretrainedConfig
from ..modeling_rope_utils import rope_config_validation, standardize_rope_params


class MiniMaxConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MiniMaxModel`]. It is used to instantiate a
    MiniMax (Text-01) model according to the specified arguments, defining the model architecture.

    MiniMax (Text-01) uses a hybrid attention mechanism:
    - Some layers are full attention (standard causal self-attention with RoPE)
    - Some layers are linear attention ("lightning attention") with intra-/inter-block attention

    The layer type is controlled by `layer_types`. When neither `layer_types` nor `attn_type_list` is given, the
    layers alternate starting with `full_attention` at layer index 0 (0-based): even indices use full attention,
    odd indices use linear attention.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 200064):
            Vocabulary size of the MiniMax model.
        hidden_size (`int`, *optional*, defaults to 6144):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the routed expert MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 80):
            Number of hidden layers in the Transformer.
        num_attention_heads (`int`, *optional*, defaults to 64):
            Number of attention heads for each attention layer.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            Number of key_value heads for implementing Grouped Query Attention.
        head_dim (`int`, *optional*):
            Dimension of each attention head. The value is stored as given (possibly `None`); this class does not
            apply a `hidden_size // num_attention_heads` fallback itself, so that is presumably resolved by the
            modeling code.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 10240000):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions.
        pad_token_id (`int`, *optional*):
            Padding token id, forwarded to [`PretrainedConfig`].
        bos_token_id (`int`, *optional*):
            Beginning-of-sequence token id, forwarded to [`PretrainedConfig`].
        eos_token_id (`int`, *optional*):
            End-of-sequence token id, forwarded to [`PretrainedConfig`].
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        sliding_window (`int`, *optional*):
            Forwarded unchanged to [`PretrainedConfig`].
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        num_experts_per_tok (`int`, *optional*, defaults to 2):
            Number of selected experts per token.
        num_local_experts (`int`, *optional*, defaults to 32):
            Number of routed experts.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether the router logits should be returned by the model.
        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
            Coefficient for the auxiliary load-balancing loss.
        router_jitter_noise (`float`, *optional*, defaults to 0.0):
            Jitter noise for the router.
        attn_type_list (`list[int]`, *optional*):
            Legacy per-layer attention selector, only consulted when `layer_types` is `None`. An entry equal to
            `0` selects `"linear_attention"`, any other value selects `"full_attention"`. Its length must equal
            `num_hidden_layers`. The stored `attn_type_list` attribute is always re-derived from the final
            `layer_types`.
        rope_theta (`float`, *optional*, defaults to 10000000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. A legacy `"type"` key is
            mirrored to `"rope_type"`. The dictionary is copied, so the caller's object is not modified.
        layer_types (`list[str]`, *optional*):
            A list that maps each layer index to its attention type. Can be `"full_attention"` or
            `"linear_attention"`. Takes precedence over `attn_type_list`; its length must equal
            `num_hidden_layers`.
        block_size (`int`, *optional*, defaults to 256):
            The length of each attention block for the lightning attention.
        full_attn_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after full attention. The legacy keyword
            `layernorm_full_attention_alpha` overrides it when present.
        full_attn_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after full attention. The legacy keyword
            `layernorm_full_attention_beta` overrides it when present.
        linear_attn_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after lightning attention. The legacy keyword
            `layernorm_linear_attention_alpha` overrides it when present.
        linear_attn_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after lightning attention. The legacy keyword
            `layernorm_linear_attention_beta` overrides it when present.
        mlp_alpha_factor (`float`, *optional*, defaults to 1):
            Weight for residual value in residual connection after MLP. The legacy keyword `layernorm_mlp_alpha`
            overrides it when present.
        mlp_beta_factor (`float`, *optional*, defaults to 1):
            Weight for hidden state value in residual connection after MLP. The legacy keyword
            `layernorm_mlp_beta` overrides it when present.

    Raises:
        ValueError: If `layer_types` (or, when it is `None`, `attn_type_list`) is given with a length different
            from `num_hidden_layers`.

    ```python
    >>> from paddleformers.transformers import MiniMaxModel, MiniMaxConfig

    >>> # Initializing a MiniMax (Text-01) style configuration
    >>> configuration = MiniMaxConfig()

    >>> # Initializing a model from the MiniMax (Text-01) style configuration
    >>> model = MiniMaxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "minimax"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=200064,
        hidden_size=6144,
        intermediate_size=9216,
        num_hidden_layers=80,
        num_attention_heads=64,
        num_key_value_heads=8,
        head_dim=None,
        hidden_act="silu",
        max_position_embeddings=10240000,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=None,
        eos_token_id=None,
        tie_word_embeddings=False,
        sliding_window=None,
        attention_dropout=0.0,
        num_experts_per_tok=2,
        num_local_experts=32,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        router_jitter_noise=0.0,
        attn_type_list=None,
        rope_theta=10000000.0,
        rope_scaling=None,
        layer_types=None,
        block_size=256,
        full_attn_alpha_factor=1.0,
        full_attn_beta_factor=1.0,
        linear_attn_alpha_factor=1.0,
        linear_attn_beta_factor=1.0,
        mlp_alpha_factor=1.0,
        mlp_beta_factor=1.0,
        **kwargs,
    ):
        # Legacy `layernorm_*` keyword names are accepted as aliases. They are
        # popped so they do not reach PretrainedConfig, and they win over the
        # new-style arguments when both are supplied.
        full_attn_alpha_factor = kwargs.pop("layernorm_full_attention_alpha", full_attn_alpha_factor)
        full_attn_beta_factor = kwargs.pop("layernorm_full_attention_beta", full_attn_beta_factor)
        linear_attn_alpha_factor = kwargs.pop("layernorm_linear_attention_alpha", linear_attn_alpha_factor)
        linear_attn_beta_factor = kwargs.pop("layernorm_linear_attention_beta", linear_attn_beta_factor)
        mlp_alpha_factor = kwargs.pop("layernorm_mlp_alpha", mlp_alpha_factor)
        mlp_beta_factor = kwargs.pop("layernorm_mlp_beta", mlp_beta_factor)

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.attention_dropout = attention_dropout
        self.rope_theta = rope_theta

        # Work on a shallow copy so that mirroring the legacy "type" key (and
        # anything the rope helpers write afterwards) never mutates the
        # dictionary object owned by the caller.
        if rope_scaling is not None:
            rope_scaling = dict(rope_scaling)
            if "type" in rope_scaling:
                rope_scaling["rope_type"] = rope_scaling["type"]
        self.rope_scaling = rope_scaling
        # rope_parameters intentionally aliases rope_scaling, as before.
        self.rope_parameters = self.rope_scaling
        standardize_rope_params(self, rope_theta=rope_theta)
        rope_config_validation(self)

        self.num_experts_per_tok = num_experts_per_tok
        self.num_local_experts = num_local_experts
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        self.router_jitter_noise = router_jitter_noise

        if layer_types is None:
            if attn_type_list is not None:
                if len(attn_type_list) != num_hidden_layers:
                    raise ValueError(
                        f"attn_type_list length ({len(attn_type_list)}) must equal "
                        f"num_hidden_layers ({num_hidden_layers})."
                    )
                # Legacy encoding: 0 -> linear attention, anything else -> full attention.
                self.layer_types = [
                    "linear_attention" if int(attn_type) == 0 else "full_attention" for attn_type in attn_type_list
                ]
            else:
                # Default pattern: 0-based even indices are full attention,
                # odd indices are linear attention.
                self.layer_types = [
                    "full_attention" if bool((i + 1) % 2) else "linear_attention"
                    for i in range(self.num_hidden_layers)
                ]
        else:
            if len(layer_types) != num_hidden_layers:
                raise ValueError(
                    f"layer_types length ({len(layer_types)}) must equal num_hidden_layers ({num_hidden_layers})."
                )
            self.layer_types = list(layer_types)
        # Keep the legacy integer view in sync with the resolved layer_types.
        self.attn_type_list = [0 if layer_type == "linear_attention" else 1 for layer_type in self.layer_types]
        self.block_size = block_size
        self.full_attn_alpha_factor = full_attn_alpha_factor
        self.full_attn_beta_factor = full_attn_beta_factor
        self.linear_attn_alpha_factor = linear_attn_alpha_factor
        self.linear_attn_beta_factor = linear_attn_beta_factor
        self.mlp_alpha_factor = mlp_alpha_factor
        self.mlp_beta_factor = mlp_beta_factor

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            sliding_window=sliding_window,
            **kwargs,
        )


__all__ = ["MiniMaxConfig"]
Loading
Loading