@@ -93,3 +93,212 @@ def forward(self, *args):
        x = x + x_in.repeat([*[1 for _ in x.shape[:-1]], x.shape[-1] // x_in.shape[-1]])

        return x

class _DenseBlock(nn.Module):
    """A tiny FFN that mirrors the structure of the current MLP stack."""
    def __init__(self, dim_in, dim_hidden, dim_out, num_layers=2,
                 nonlin=nn.GELU, dropout_rate=0.0):
        super().__init__()
        layers = [nn.Linear(dim_in, dim_hidden), nonlin(), nn.Dropout(dropout_rate)]
        for _ in range(num_layers - 2):
            layers += [nn.Linear(dim_hidden, dim_hidden), nonlin(), nn.Dropout(dropout_rate)]
        layers += [nn.Linear(dim_hidden, dim_out)]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

class MoEMLP(nn.Module):
    """
    Drop-in MoE MLP (memory-friendly):
    - Same call pattern as the current MLP: forward(*args), where args = (x, ...) and the
      optional aux tensor comes last
    - Supports the residual add exactly like MLP
    - Optional AdaLayerNorm when dim_aux is provided
    - Simple top-k router; mixes experts with streaming accumulation (no big [E, ..., D] stack)
    See the usage sketch after the class definition.
    """
    def __init__(
        self,
        dim_in,
        dim_out,
        num_layers=2,
        hidden_factor=2,
        pre_layer_norm=True,
        dropout_rate=0.0,
        nonlin=nn.GELU,
        with_residual=False,
        norm_type="LayerNorm",
        dim_aux=None,
        norm_eps=1e-5,
        name: str | None = None,
        # MoE bits
        num_experts: int = 8,
        top_k: int = 4,
        router_noisy_std: float = 0.0,  # set >0 to add noise to router logits
        # Memory bits
        use_checkpoint: bool = False,  # checkpoint expert forward to save memory
    ):
        super().__init__()
        if name is not None:
            self.name = name

        assert num_layers >= 2
        assert 1 <= top_k <= num_experts

        self.with_residual = with_residual
        self.with_aux = dim_aux is not None
        self.pre_layer_norm = pre_layer_norm
        self.top_k = top_k
        self.num_experts = num_experts
        self.use_checkpoint = use_checkpoint

        dim_hidden = int(dim_in * hidden_factor)

        # Norm (match MLP behavior)
        Norm = nn.LayerNorm if norm_type == "LayerNorm" else RMSNorm
        if pre_layer_norm:
            self.norm = (
                Norm(dim_in, eps=norm_eps)
                if dim_aux is None
                else AdaLayerNorm(dim_in, dim_aux, norm_eps=norm_eps)
            )
        else:
            self.norm = None  # no pre-norm

        # Router
        self.router = nn.Linear(dim_in, num_experts)
        self.router_noisy_std = router_noisy_std

        # Experts (identical shape)
        self.experts = nn.ModuleList(
            [
                _DenseBlock(
                    dim_in=dim_in,
                    dim_hidden=dim_hidden,
                    dim_out=dim_out,
                    num_layers=num_layers,
                    nonlin=nonlin,
                    dropout_rate=dropout_rate,
                )
                for _ in range(num_experts)
            ]
        )

        # For optional aux loss (load-balancing); not used unless you read it
        self.register_buffer("last_aux_loss", torch.zeros((), dtype=torch.float32))
    def _gate(self, x_norm):
        # x_norm: [*, D]. The router works on the last dim.
        logits = self.router(x_norm)
        if self.router_noisy_std > 0:
            logits = logits + torch.randn_like(logits) * self.router_noisy_std

        if self.top_k == self.num_experts:
            # softmax over all experts
            weights = torch.softmax(logits, dim=-1)  # [..., E]
            top_idx = None  # not needed
        else:
            # top-k softmax
            top_vals, top_idx = torch.topk(logits, k=self.top_k, dim=-1)  # [*, k]
            weights = torch.softmax(top_vals, dim=-1)  # [*, k]
        return weights, top_idx

    @torch.no_grad()
    def _compute_load_balance_aux(self, weights, top_idx, num_experts):
        """
        Simple load-balancing penalty from the Switch/MoE papers:
        encourage uniform expert probability and uniform usage.
        Works with both full softmax (top_idx is None) and top-k.
        """
        if top_idx is None:
            # Average routing probability per expert over all leading dims
            probs = weights.mean(dim=tuple(range(weights.dim() - 1)))  # [E]
        else:
            # Build usage over experts from the top-k selection
            if weights.shape != top_idx.shape:
                raise ValueError(
                    "Top-k weights and indices must share the same shape"
                )

            K = weights.shape[-1]
            flat_w = weights.reshape(-1, K)  # [N, K]
            flat_i = top_idx.reshape(-1, K)  # [N, K]
            usage = torch.zeros(num_experts, device=weights.device, dtype=weights.dtype)
            usage.scatter_add_(0, flat_i.reshape(-1), flat_w.reshape(-1))
            usage = usage / usage.sum().clamp_min(1e-6)  # normalize
            probs = usage  # proxy for routing probability
        # Target is the uniform distribution 1/E
        target = torch.full_like(probs, 1.0 / num_experts)
        aux = (probs * (probs.add(1e-6).log() - target.add(1e-6).log())).sum()
        return aux
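
    # Note: up to the 1e-6 smoothing, the aux value computed above is the KL
    # divergence between the expert distribution p (routing probabilities, or
    # top-k weight mass as a proxy) and the uniform target u_e = 1 / E:
    #     aux = sum_e p_e * (log p_e - log u_e) = KL(p || u)
    # It is zero exactly when every expert receives an equal share.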

    def forward(self, *args):
        # Match the MLP(*args) calling convention
        x = args[0]
        x_in = x
        aux = args[-1] if self.with_aux else None

        # Optional pre-norm (possibly adaptive)
        if self.norm is not None:
            if self.with_aux:
                x = self.norm(x, aux)
            else:
                x = self.norm(x)

        # Router
        weights, top_idx = self._gate(x)  # weights: [..., E] or [..., K]

        # Build a full weight tensor [..., E] if we are in top-k mode,
        # so we can stream over experts without stacking their outputs.
        if top_idx is None:
            w_full = weights  # [..., E]
        else:
            # Scatter the top-k weights into a zero tensor of size E
            E = self.num_experts
            w_full = torch.zeros(*weights.shape[:-1], E, device=weights.device, dtype=weights.dtype)  # [..., E]
            w_full.scatter_(-1, top_idx, weights)

        # Output accumulator (no expert stacking)
        out_dim = self.experts[0].net[-1].out_features  # last Linear of _DenseBlock
        y = x.new_zeros(*x.shape[:-1], out_dim)

        # Optional gradient checkpointing
        if self.use_checkpoint:
            from torch.utils.checkpoint import checkpoint

        # Stream over experts: y += expert(x) * w_full[..., e]
        for e, expert in enumerate(self.experts):
            # Skip the compute if the weight mass is (nearly) zero for this expert
            w_e = w_full[..., e]  # [...]
            if torch.allclose(w_e, torch.zeros((), device=w_e.device, dtype=w_e.dtype)):
                continue

            if self.use_checkpoint and self.training:
                y_e = checkpoint(expert, x)
            else:
                y_e = expert(x)
            y = y + y_e * w_e.unsqueeze(-1)

        # Residual (same logic as the MLP)
        if self.with_residual:
            if y.shape[-1] == x_in.shape[-1]:
                y = x_in + y
            else:
                assert y.shape[-1] % x_in.shape[-1] == 0
                y = y + x_in.repeat([*[1 for _ in y.shape[:-1]], y.shape[-1] // x_in.shape[-1]])

        # Optional: update the aux loss (not returned; read it if you want)
        with torch.no_grad():
            self.last_aux_loss = self._compute_load_balance_aux(
                weights,
                top_idx,
                self.num_experts,
            )

        return y
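
A minimal usage sketch for the MoEMLP added above, assuming it is imported from this module together with torch; the dims, shapes, and flags below are illustrative values only:

    import torch

    moe = MoEMLP(dim_in=256, dim_out=256, with_residual=True, num_experts=8, top_k=2)
    x = torch.randn(4, 128, 256)      # [batch, tokens, dim_in], illustrative shape
    y = moe(x)                        # same forward(*args) call pattern as the MLP it mirrors
    assert y.shape == (4, 128, 256)
    balance = moe.last_aux_loss       # load-balancing diagnostic refreshed on each forward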