Update modeling_minicpm.py
modeling_minicpm.py  CHANGED  (+204 -8)
@@ -48,7 +48,10 @@ from transformers.utils import (
     replace_return_docstrings,
 )
 from transformers.utils.import_utils import is_torch_fx_available
-
+try:
+    from .configuration_minicpm import MiniCPMConfig
+except ImportError:
+    from configuration_minicpm import MiniCPMConfig
 import re
 
 try:
@@ -283,9 +286,183 @@ class MiniCPMMLP(nn.Module):
         return down_proj
 
 
+# ============================================================================
+# SparseMixer v2 Routing Implementation
+# Based on https://github.com/fairinternal/SparseMixer
+# ============================================================================
+
+class SparseMixerCore(torch.autograd.Function):
+    """
+    Custom autograd function for SparseMixer v2 core operation.
+    Implements Heun's third-order method for gradient computation.
+    """
+    @staticmethod
+    def forward(
+        ctx,
+        scores: torch.Tensor,
+        multiplier: torch.Tensor,
+        selected_experts: torch.Tensor,
+        masked_gates: torch.Tensor,
+        mask_for_one: torch.Tensor,
+    ):
+        ctx.save_for_backward(multiplier, selected_experts, masked_gates)
+        return multiplier * mask_for_one
+
+    @staticmethod
+    def backward(
+        ctx,
+        grad_at_output: torch.Tensor,
+    ):
+        multiplier, selected_experts, masked_gates = ctx.saved_tensors
+
+        grad_at_output = grad_at_output * multiplier
+
+        grad_at_scores_expaned = masked_gates * grad_at_output.mul(-1)
+        grad_at_scores_expaned.scatter_add_(
+            dim=-1,
+            index=selected_experts,
+            src=grad_at_output,
+        )
+
+        return (
+            grad_at_scores_expaned,
+            None,
+            None,
+            None,
+            None,
+        )
+
+
+def select_single_expert(
+    scores: torch.Tensor,
+    masked_scores: torch.Tensor,
+    jitter_eps: float,
+    training: bool,
+):
+    """
+    Select a single expert using SparseMixer v2 logic.
+
+    Args:
+        scores: Original scores (for threshold computation)
+        masked_scores: Scores with already-selected experts masked out
+        jitter_eps: Jitter epsilon for sparsity mask
+        training: Whether in training mode
+
+    Returns:
+        multiplier: Weight for the selected expert
+        selected_expert: Index of selected expert
+    """
+    with torch.no_grad():
+        # Compute mask for sparsity
+        mask_logits_threshold, max_ind = masked_scores.max(dim=-1, keepdim=True)
+        factor = scores.abs().clamp(min=mask_logits_threshold)
+        mask_logits_threshold = (
+            (mask_logits_threshold - scores) / factor
+        ) > (2 * jitter_eps)
+
+    # Apply mask
+    masked_gates = masked_scores.masked_fill(mask_logits_threshold, float('-inf'))
+
+    if training:
+        # Gumbel sampling for robustness
+        selected_expert = (
+            masked_gates - torch.empty_like(
+                masked_gates,
+                memory_format=torch.legacy_contiguous_format
+            ).exponential_().log()
+        ).max(dim=-1)[1].unsqueeze(-1)
+    else:
+        selected_expert = max_ind
+
+    # Compute scores for gradients
+    masked_gates = torch.softmax(masked_gates, dim=-1)
+
+    # Compute midpoint mask using Heun's third-order method
+    max_scores, max_ind = masked_gates.max(dim=-1, keepdim=True)
+    mask_for_one = torch.logical_or(
+        selected_expert == max_ind,
+        torch.rand_like(max_scores) > 0.75  # f(x) - f(0) = .25 f'(x) + .75 f'(x/3.)
+    )
+    # Map: 1 -> 1.0 & 0 -> 1./3
+    mask_for_one = torch.add(0.3333, mask_for_one, alpha=0.6667).type_as(masked_gates)
+
+    # Get multiplier
+    multiplier_o = masked_gates.gather(dim=-1, index=selected_expert)
+    multiplier = SparseMixerCore.apply(
+        scores,
+        multiplier_o,
+        selected_expert,
+        masked_gates,
+        mask_for_one,
+    )
+
+    return multiplier, selected_expert
+
+
+def sparsemixer_topk_routing(
+    scores: torch.Tensor,
+    top_k: int,
+    jitter_eps: float,
+    training: bool
+):
+    """
+    SparseMixer v2 routing extended to arbitrary top-k.
+
+    Args:
+        scores: Router logits of shape (batch_size, num_experts)
+        top_k: Number of experts to select
+        jitter_eps: Jitter epsilon for sparsity control
+        training: Whether in training mode
+
+    Returns:
+        multiplier: Weights for selected experts, shape (batch_size, top_k)
+        original_gates: Original softmax gates, shape (batch_size, num_experts)
+        selected_experts: Indices of selected experts, shape (batch_size, top_k)
+    """
+    original_gates = torch.softmax(scores, dim=-1)
+
+    all_multipliers = []
+    all_selected_experts = []
+
+    # Start with unmasked scores
+    masked_scores = scores.clone()
+
+    for k in range(top_k):
+        # Select k-th expert
+        multiplier, selected_expert = select_single_expert(
+            scores=scores,
+            masked_scores=masked_scores,
+            jitter_eps=jitter_eps,
+            training=training,
+        )
+
+        all_multipliers.append(multiplier)
+        all_selected_experts.append(selected_expert)
+
+        # Mask out the selected expert for next iteration
+        if k < top_k - 1:  # Don't need to mask on last iteration
+            masked_scores = torch.scatter(
+                masked_scores,
+                -1,
+                selected_expert,
+                float('-inf'),
+            )
+
+    # Concatenate all results
+    multiplier = torch.cat(all_multipliers, dim=-1)  # (batch_size, top_k)
+    selected_experts = torch.cat(all_selected_experts, dim=-1)  # (batch_size, top_k)
+
+    return multiplier, original_gates, selected_experts
+
+
+# ============================================================================
+# End of SparseMixer v2 Implementation
+# ============================================================================
+
+
 class AddAuxiliaryLoss(torch.autograd.Function):
     """
-    The trick function of adding auxiliary (aux) loss,
+    The trick function of adding auxiliary (aux) loss,
     which includes the gradient of the aux loss during backpropagation.
     """
     @staticmethod
@@ -304,7 +481,7 @@ class AddAuxiliaryLoss(torch.autograd.Function):
 
 
 class MiniCPMMoE(nn.Module):
-    def __init__(self, config):
+    def __init__(self, config, jitter_eps=0.1):
         super().__init__()
         self.config = config
         self.num_experts = config.num_experts
@@ -314,16 +491,34 @@ class MiniCPMMoE(nn.Module):
         )
         self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
         self.intermediate_size = config.intermediate_size
-
+
+        # SparseMixer specific parameter
+        self.jitter_eps = jitter_eps
+
     def forward(self, hidden_states):
         orig_shape = hidden_states.shape
         orig_dtype = hidden_states.dtype
         hidden_states = hidden_states.view(-1, orig_shape[-1])
         token_num = hidden_states.shape[0]
+
+        # Compute router logits
         scores = self.gate(hidden_states)
-
-
-
+
+        # ===== SparseMixer v2 Routing =====
+        # Use SparseMixer v2 routing for expert selection
+        expert_weights, scores_prob, expert_indices = sparsemixer_topk_routing(
+            scores=scores,
+            top_k=self.num_experts_per_tok,
+            jitter_eps=self.jitter_eps,
+            training=self.training
+        )
+        # expert_weights: (token_num, top_k) - SparseMixer weights
+        # scores_prob: (token_num, num_experts) - Original softmax for loss computation
+        # expert_indices: (token_num, top_k) - Selected expert indices
+
+        # Normalize weights if needed (SparseMixer already provides normalized weights)
+        # expert_weights = expert_weights / expert_weights.sum(dim=-1, keepdim=True)
+
         topk_idx_flat = expert_indices.view(-1)
         expert_weights = expert_weights.to(orig_dtype)
 
@@ -333,8 +528,9 @@ class MiniCPMMoE(nn.Module):
         for i in range(self.num_experts):
             y[topk_idx_flat == i] = self.experts[i](hidden_states[topk_idx_flat == i])
         y = (y.view(*expert_weights.shape, -1) * expert_weights.unsqueeze(-1)).sum(dim=1)
-        y =
+        y = y.view(*orig_shape)
 
+        # Load balancing loss (using original softmax probabilities)
         load = expert_indices.view(-1).bincount(minlength=self.num_experts)
         load_mean = load / (token_num * self.num_experts_per_tok)
         importance_mean = scores_prob.mean(dim=0)