| | """Configuration for the MiniMax M2 architecture.""" |
| |
|
| | from __future__ import annotations |
| |
|
| | from typing import List, Optional, Union |
| |
|
| | from transformers.configuration_utils import PretrainedConfig |
| |
|
| |
|
class MiniMaxM2Config(PretrainedConfig):
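    """Configuration for a MiniMax M2 model.

    Holds the architecture hyperparameters read by the modeling code: the
    per-layer attention layout (``attn_type_list``, sliding-window options),
    mixture-of-experts routing, partial rotary embeddings, and MTP module
    settings, plus the usual ``PretrainedConfig`` token ids. Defaults follow
    the ``__init__`` signature below.
    """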
    model_type = "minimax"

    def __init__(
        self,
        vocab_size: int = 200_064,
        hidden_size: int = 3_072,
        intermediate_size: int = 1_536,
        mlp_intermediate_size: int = 8_192,
        num_hidden_layers: int = 62,
        num_attention_heads: int = 48,
        num_key_value_heads: int = 8,
        head_dim: Optional[int] = 128,
        num_local_experts: int = 256,
        num_experts_per_tok: int = 8,
        attn_type_list: Optional[List[int]] = None,
        attention_dropout: float = 0.0,
        hidden_act: str = "silu",
        rms_norm_eps: float = 1e-6,
        max_position_embeddings: int = 196_608,
        rope_theta: float = 5_000_000.0,
        rotary_dim: int = 64,
        rope_scaling: Optional[dict] = None,
        use_qk_norm: bool = True,
        qk_norm_type: str = "per_layer",
        use_routing_bias: bool = True,
        scoring_func: str = "sigmoid",
        router_aux_loss_coef: float = 0.001,
        router_jitter_noise: float = 0.0,
        output_router_logits: bool = False,
        use_grouped_topk: bool = True,
        num_expert_group: Optional[int] = None,
        topk_group: Optional[int] = None,
        routed_scaling_factor: float = 1.0,
        layernorm_full_attention_beta: float = 1.0,
        layernorm_linear_attention_beta: float = 1.0,
        layernorm_mlp_beta: float = 1.0,
        shared_intermediate_size: int = 0,
        shared_moe_mode: str = "sigmoid",
        use_mtp: bool = True,
        num_mtp_modules: int = 3,
        mtp_transformer_layers: int = 1,
        attn_window_size: Optional[Union[int, List[int]]] = None,
        swa_rope_theta: float = -1.0,
        sliding_window: Optional[int] = None,
        initializer_range: float = 0.02,
        tie_word_embeddings: bool = False,
        max_model_len: Optional[int] = None,
        bos_token_id: Optional[int] = None,
        eos_token_id: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        use_cache: bool = True,
        **kwargs,
    ) -> None:
        # Popped from kwargs so they are not forwarded through ``super().__init__``;
        # they are re-attached as plain attributes at the end of ``__init__``
        # (``quantization_config`` only when provided).
        quantization_config = kwargs.pop("quantization_config", None)
        transformers_version = kwargs.pop("transformers_version", None)

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            pad_token_id=pad_token_id,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.mlp_intermediate_size = mlp_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim or hidden_size // num_attention_heads
        self.num_local_experts = num_local_experts
        self.num_experts_per_tok = num_experts_per_tok
        self.attn_type_list = attn_type_list or [1] * num_hidden_layers
        self.attention_dropout = attention_dropout
        self.hidden_act = hidden_act
        self.rms_norm_eps = rms_norm_eps
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.rotary_dim = rotary_dim
        self.rope_scaling = rope_scaling
        self.use_qk_norm = use_qk_norm
        self.qk_norm_type = qk_norm_type
        self.use_routing_bias = use_routing_bias
        self.scoring_func = scoring_func
        self.router_aux_loss_coef = router_aux_loss_coef
        self.router_jitter_noise = router_jitter_noise
        self.output_router_logits = output_router_logits
        self.use_grouped_topk = use_grouped_topk
        self.num_expert_group = num_expert_group
        self.topk_group = topk_group
        self.routed_scaling_factor = routed_scaling_factor
        self.layernorm_full_attention_beta = layernorm_full_attention_beta
        self.layernorm_linear_attention_beta = layernorm_linear_attention_beta
        self.layernorm_mlp_beta = layernorm_mlp_beta
        self.shared_intermediate_size = shared_intermediate_size
        self.shared_moe_mode = shared_moe_mode
        self.use_mtp = use_mtp
        self.num_mtp_modules = num_mtp_modules
        self.mtp_transformer_layers = mtp_transformer_layers
        self.attn_window_size = attn_window_size
        self.swa_rope_theta = swa_rope_theta
        self.sliding_window = sliding_window
        self.initializer_range = initializer_range
        self.max_model_len = max_model_len
        self.use_cache = use_cache

        # Fraction of each attention head's channels that receive rotary
        # embeddings, e.g. 64 / 128 = 0.5 with the defaults above.
        self.partial_rotary_factor = float(self.rotary_dim) / float(self.head_dim)
        if quantization_config is not None:
            self.quantization_config = quantization_config
        self.transformers_version = transformers_version

__all__ = ["MiniMaxM2Config"]
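

# Minimal usage sketch (illustrative only; the overrides below are arbitrary
# values for demonstration, not taken from any released checkpoint).
if __name__ == "__main__":
    config = MiniMaxM2Config(num_hidden_layers=4, num_local_experts=8)
    # attn_type_list defaults to one entry per hidden layer.
    assert config.attn_type_list == [1] * 4
    # partial_rotary_factor is rotary_dim / head_dim, i.e. 64 / 128 = 0.5 here.
    assert config.partial_rotary_factor == 0.5
    print(config)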