@@ -39,6 +39,8 @@ def adabelief(
     eps_root: float = 1e-16,
     *,
     nesterov: bool = False,
+    weight_decay: float = 1e-4,
+    mask: Optional[Union[Any, Callable[[base.Params], Any]]] = None,
 ) -> base.GradientTransformationExtraArgs:
   r"""The AdaBelief optimizer.

@@ -94,6 +96,16 @@ def adabelief(
       improve numerical stability. If backpropagating gradients through the
       gradient transformation (e.g. for meta-learning), this must be non-zero.
     nesterov: Whether to use Nesterov momentum.
+    weight_decay: Strength of the weight decay regularization. Note that this
+      weight decay is multiplied with the learning rate. This is consistent
+      with other frameworks such as PyTorch, but different from
+      (Loshchilov et al., 2019), where the weight decay is only multiplied with
+      the "schedule multiplier" but not the base learning rate.
+    mask: A tree with the same structure as (or a prefix of) the params PyTree,
+      or a Callable that returns such a pytree given the params/updates. The
+      leaves should be booleans, `True` for leaves/subtrees you want to apply
+      the weight decay to, and `False` for those you want to skip. Note that
+      the AdaBelief gradient transformations are applied to all parameters.

   Returns:
     The corresponding :class:`optax.GradientTransformationExtraArgs`.
@@ -134,6 +146,7 @@ def adabelief(
           eps_root=eps_root,
           nesterov=nesterov,
       ),
+      transform.add_decayed_weights(weight_decay, mask),
       transform.scale_by_learning_rate(learning_rate),
   )

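For context, a minimal usage sketch of the new parameters, assuming this change is applied. The toy parameters, the decay_mask helper, and the hyperparameter values below are illustrative and not part of the diff.

import jax
import jax.numpy as jnp
import optax

# Toy parameters: a weight matrix we want to decay and a bias we do not.
params = {'kernel': jnp.ones((3, 3)), 'bias': jnp.zeros((3,))}

# Mask callable: return True for leaves that should receive weight decay.
# Here we decay only rank-2 leaves, i.e. skip the bias.
def decay_mask(params):
  return jax.tree_util.tree_map(lambda p: p.ndim > 1, params)

# With this change, add_decayed_weights is chained before
# scale_by_learning_rate, so the decay term is multiplied by the learning
# rate (the PyTorch-style behaviour described in the docstring).
opt = optax.adabelief(learning_rate=1e-3, weight_decay=1e-4, mask=decay_mask)

state = opt.init(params)
grads = jax.tree_util.tree_map(jnp.ones_like, params)  # dummy gradients
# params must be passed to update() so the decay term can be computed.
updates, state = opt.update(grads, state, params)
params = optax.apply_updates(params, updates)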