
Commit 1c7691e

Benjamin-eecs authored and XuehaiPan committed
feat(torchopt): adagrad optimizer support
1 parent fad99d8 commit 1c7691e

File tree

3 files changed: +227 −32 lines changed

  torchopt/alias.py
  torchopt/schedule.py
  torchopt/transform.py


torchopt/alias.py

Lines changed: 73 additions & 0 deletions
@@ -137,6 +137,79 @@ def schedule_wrapper(count):
     return transform._scale(-lr, already_flattened=True)  # pylint: disable=protected-access


+# pylint: disable-next=too-many-arguments
+def adagrad(
+    lr: ScalarOrSchedule = 1e-2,
+    lr_decay: float = 0.0,
+    weight_decay: float = 0.0,
+    initial_accumulator_value: float = 0.0,
+    eps: float = 1e-10,
+    *,
+    eps_root: float = 0.0,  # pylint: disable=unused-argument
+    moment_requires_grad: bool = False,  # pylint: disable=unused-argument
+    maximize: bool = False,
+) -> base.GradientTransformation:
+    """The functional Adagrad optimizer.
+
+    Adagrad is an algorithm for gradient-based optimization that anneals the
+    learning rate for each parameter during the course of training.
+
+    WARNING: Adagrad's main limit is the monotonic accumulation of squared
+    gradients in the denominator: since all terms are > 0, the sum keeps growing
+    during training, and the learning rate eventually becomes vanishingly small.
+
+    References:
+        Duchi et al., 2011: https://jmlr.org/papers/v12/duchi11a.html
+
+    Args:
+        lr: (default: :const:`1e-2`)
+            This is a fixed global scaling factor.
+        lr_decay: (default: :const:`0.0`)
+            Learning rate decay.
+        weight_decay: (default: :const:`0.0`)
+            Weight decay, add L2 penalty to parameters.
+        initial_accumulator_value: (default: :const:`0.0`)
+            Initial value for the accumulator.
+        eps: (default: :const:`1e-10`)
+            A small constant applied to the denominator outside of the square root (as in the
+            Adam paper) to avoid dividing by zero when rescaling.
+        eps_root: (default: :data:`0.0`)
+            A small constant applied to the denominator inside the square root (as in RMSProp),
+            to avoid dividing by zero when rescaling. This is needed for example when computing
+            (meta-)gradients through Adam.
+        moment_requires_grad: (default: :data:`False`)
+            If :data:`True`, the momentums will be created with flag ``requires_grad=True``; this
+            flag is often used in Meta-Learning algorithms.
+        maximize: (default: :data:`False`)
+            Maximize the params based on the objective, instead of minimizing.
+
+    Returns:
+        The corresponding :class:`GradientTransformation` instance.
+
+    See Also:
+        The functional optimizer wrapper :class:`torchopt.FuncOptimizer`.
+    """
+    # pylint: disable=unneeded-not
+    if not (callable(lr) or 0.0 <= lr):
+        raise ValueError(f'Invalid learning rate: {lr}')
+    if not 0.0 <= eps:
+        raise ValueError(f'Invalid epsilon value: {eps}')
+    if not 0.0 <= lr_decay:
+        raise ValueError(f'Invalid lr_decay value: {lr_decay}')
+    if not 0.0 <= weight_decay:
+        raise ValueError(f'Invalid weight_decay value: {weight_decay}')
+    # pylint: enable=unneeded-not
+    return transform.with_flattened_tree(
+        combine.chain(
+            _flip_sign_and_weight_decay(weight_decay=weight_decay, maximize=maximize),
+            transform.scale_by_rss(initial_accumulator_value=initial_accumulator_value, eps=eps),
+            _scale_by_neg_lr(lr),
+        )
+    )
+
+
 # pylint: disable-next=too-many-arguments
 def adam(
     lr: ScalarOrSchedule = 1e-3,
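For orientation, the adagrad alias above chains three transformations: _flip_sign_and_weight_decay, transform.scale_by_rss, and _scale_by_neg_lr. Below is a minimal usage sketch of the resulting functional optimizer; it assumes the API matches torchopt's other aliases (an init/update pair applied with torchopt.apply_updates), and the parameter tree and loss are purely illustrative:

    import torch
    import torchopt

    # Hypothetical toy parameter pytree and quadratic loss, for illustration only.
    params = (torch.tensor([1.0, 2.0], requires_grad=True),)
    optimizer = torchopt.adagrad(lr=1e-2)   # GradientTransformation: (init_fn, update_fn)
    opt_state = optimizer.init(params)      # per-parameter sum-of-squares accumulators

    loss = (params[0] ** 2).sum()
    grads = torch.autograd.grad(loss, params)

    updates, opt_state = optimizer.update(grads, opt_state, params=params, inplace=False)
    params = torchopt.apply_updates(params, updates, inplace=False)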

torchopt/schedule.py

Lines changed: 53 additions & 0 deletions
@@ -32,6 +32,7 @@
 """Learning rate schedules."""

 import logging
+from typing import Optional

 import numpy as np
 import torch
@@ -42,6 +43,58 @@
 __all__ = ['polynomial_schedule', 'linear_schedule']


+def linear_decay(
+    init_value: Scalar,
+    decay_rate: Scalar,
+    transition_begin: int = 0,
+    transition_steps: Optional[int] = None,
+    end_value: Optional[float] = None,
+) -> base.Schedule:
+    """Constructs a schedule that decays the value as
+    ``init_value / (1 + (count - transition_begin - 1) * decay_rate)``, clipped at ``end_value``.
+
+    Args:
+        init_value: Initial value of the schedule.
+        decay_rate: The decay rate; ``0`` yields a constant schedule.
+        transition_begin: Number of steps to wait before the decay starts.
+        transition_steps: If given and non-positive, the schedule is constant at ``init_value``.
+        end_value: Optional bound at which the decayed value is clipped.
+
+    Returns:
+        schedule: A function that maps step counts to values.
+    """
+    if transition_steps is not None and transition_steps <= 0:
+        logging.info(
+            'A linear schedule was set with a non-positive `transition_steps`'
+            ' value; this will result in a constant schedule with value '
+            '`init_value`.'
+        )
+        return lambda count: init_value
+
+    if decay_rate == 0:
+        logging.info(
+            'A linear schedule was set with a zero `decay_rate` value; '
+            'this will result in a constant schedule with value `init_value`.'
+        )
+        return lambda count: init_value
+
+    if transition_begin < 0:
+        logging.info(
+            'A linear schedule was set with a negative `transition_begin` '
+            'value; this will result in `transition_begin` falling back to `0`.'
+        )
+        transition_begin = 0
+
+    if end_value is not None:
+        clip_fn = max if decay_rate < 1.0 else min
+
+    def schedule(count: Numeric) -> Numeric:
+        decreased_count = count - transition_begin
+        decayed_value = (
+            init_value / (1 + (decreased_count - 1) * decay_rate)
+            if decreased_count > 0
+            else init_value
+        )
+        if end_value is not None:
+            decayed_value = clip_fn(decayed_value, end_value)
+        return decayed_value
+
+    return schedule
+
+
 def polynomial_schedule(
     init_value: Scalar,
     end_value: Scalar,
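To make the decay rule above concrete, here is a small sketch of the values the returned schedule produces; linear_decay is not added to __all__ in this diff, so it is shown here as a module-level call, and the numbers are purely illustrative:

    # Assumes the rule init_value / (1 + (count - transition_begin - 1) * decay_rate),
    # clipped at end_value once the decayed value reaches it.
    schedule = linear_decay(init_value=1.0, decay_rate=0.5, transition_begin=0, end_value=0.2)

    schedule(0)   # 1.0  -- count <= transition_begin keeps init_value
    schedule(1)   # 1.0  -- 1.0 / (1 + 0 * 0.5)
    schedule(3)   # 0.5  -- 1.0 / (1 + 2 * 0.5)
    schedule(9)   # 0.2  -- 1.0 / (1 + 8 * 0.5), held at the end_value floor from here on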

torchopt/transform.py

Lines changed: 101 additions & 32 deletions
@@ -78,7 +78,7 @@ def inc_count(updates: Updates, count: TensorTree) -> TensorTree:
     """Increments int counter by one.

     Returns:
-        A counter incremeted by one, or max_int if the maximum precision is reached.
+        A counter incremented by one, or max_int if the maximum precision is reached.
     """
     return _inc_count(updates=updates, count=count, already_flattened=False)

@@ -265,7 +265,7 @@ def scale_by_adam(
             Term added to the denominator inside the square-root to improve
             numerical stability when back-propagating gradients through the rescaling.
         moment_requires_grad: (default: :data:`False`)
-            if :data:`True`, states will be created with flag `requires_grad = True`.
+            If :data:`True`, states will be created with flag `requires_grad = True`.

     Returns:
         An (init_fn, update_fn) tuple.
@@ -367,7 +367,7 @@ def scale_by_accelerated_adam(
             Term added to the denominator inside the square-root to improve
             numerical stability when back-propagating gradients through the rescaling.
         moment_requires_grad: (default: :data:`False`)
-            if :data:`True`, states will be created with flag `requires_grad = True`.
+            If :data:`True`, states will be created with flag `requires_grad = True`.

     Returns:
         An (init_fn, update_fn) tuple.
@@ -474,7 +474,7 @@ def trace(
         nesterov: (default: :data:`False`)
             Whether to use Nesterov momentum.
         moment_requires_grad: (default: :data:`False`)
-            if :data:`True`, states will be created with flag `requires_grad = True`.
+            If :data:`True`, states will be created with flag `requires_grad = True`.

     Returns:
         An (init_fn, update_fn) tuple.
@@ -597,7 +597,7 @@ def scale_by_rms(
         eps: (default: :const:`1e-8`)
             Term added to the denominator to improve numerical stability.
         initial_scale: (default: :const:`0.0`)
-            Initial value for second moment
+            Initial value for second moment.

     Returns:
         An (init_fn, update_fn) tuple.
@@ -675,7 +675,7 @@ def scale_by_stddev(
         eps: (default: :const:`1e-8`)
             Term added to the denominator to improve numerical stability.
         initial_scale: (default: :const:`0.0`)
-            Initial value for second moment
+            Initial value for second moment.

     Returns:
         An (init_fn, update_fn) tuple.
@@ -745,9 +745,8 @@ class MaskedState(NamedTuple):
 class MaskedNode(NamedTuple):
     """A node used to mask out unspecified parts of a tree.

-    This node is ignored when mapping functions across the tree e.g. using
-    :func:`pytree.tree_map` since it is a container without children. It can
-    therefore be used to mask out parts of a tree.
+    This node is ignored when mapping functions across the tree e.g. using :func:`pytree.tree_map`
+    since it is a container without children. It can therefore be used to mask out parts of a tree.
     """


@@ -757,28 +756,27 @@ def masked(
 ) -> GradientTransformation:
     """Mask updates so only some are transformed, the rest are passed through.

-    For example, it is common to skip weight decay for BatchNorm scale and all
-    bias parameters. In many networks, these are the only parameters with only
-    one dimension. So, you may create a mask function to mask these out as
-    follows::
-        mask_fn = lambda p: pytree.tree_map(lambda x: x.ndim != 1, p)
-        weight_decay = torchopt.masked(torchopt.add_decayed_weights(0.001), mask_fn)
+    For example, it is common to skip weight decay for BatchNorm scale and all bias parameters. In
+    many networks, these are the only parameters with only one dimension. So, you may create a mask
+    function to mask these out as follows::
+        mask_fn = lambda p: pytree.tree_map(lambda x: x.ndim != 1, p)
+        weight_decay = torchopt.masked(torchopt.add_decayed_weights(0.001), mask_fn)
     You may alternatively create the mask pytree upfront::
-        mask = pytree.tree_map(lambda x: x.ndim != 1, params)
-        weight_decay = torchopt.masked(torchopt.add_decayed_weights(0.001), mask)
+        mask = pytree.tree_map(lambda x: x.ndim != 1, params)
+        weight_decay = torchopt.masked(torchopt.add_decayed_weights(0.001), mask)
     For the ``inner`` transform, state will only be stored for the parameters that
-    have a mask value of ``True``.
+    have a mask value of :data:`True`.

     Args:
-        inner: Inner transformation to mask.
-        mask: a PyTree with same structure as (or a prefix of) the params PyTree, or
-            a Callable that returns such a pytree given the params/updates. The leaves
-            should be booleans, ``True`` for leaves/subtrees you want to apply the
-            transformation to, and ``False`` for those you want to skip. The mask must
-            be static for the gradient transformation to be jit-compilable.
+        inner: Inner transformation to mask.
+        mask: A PyTree with same structure as (or a prefix of) the params pytree, or a Callable that
+            returns such a pytree given the params/updates. The leaves should be booleans,
+            :data:`True` for leaves/subtrees you want to apply the transformation to, and
+            :data:`False` for those you want to skip. The mask must be static for the gradient
+            transformation to be jit-compilable.

     Returns:
-        New GradientTransformation wrapping ``inner``.
+        A new :class:`GradientTransformation` wrapping ``inner``.
     """
     return _masked(
         inner=inner,
@@ -831,17 +829,17 @@ def add_decayed_weights(
     weight_decay: float = 0.0,
     mask: Optional[Union[Any, Callable[[Params], Any]]] = None,
 ) -> GradientTransformation:
-    """Add parameter scaled by `weight_decay`.
+    """Add parameter scaled by ``weight_decay``.

     Args:
-        weight_decay: a scalar weight decay rate.
-        mask: a tree with same structure as (or a prefix of) the params PyTree,
-            or a Callable that returns such a pytree given the params/updates.
-            The leaves should be booleans, `True` for leaves/subtrees you want to
-            apply the transformation to, and `False` for those you want to skip.
+        weight_decay: A scalar weight decay rate.
+        mask: A tree with same structure as (or a prefix of) the params pytree, or a Callable that
+            returns such a pytree given the params/updates. The leaves should be booleans,
+            :data:`True` for leaves/subtrees you want to apply the transformation to, and
+            :data:`False` for those you want to skip.

     Returns:
-        An (init_fn, update_fn) tuple.
+        An (init_fn, update_fn) tuple.
     """
     return _add_decayed_weights(
         weight_decay=weight_decay,
@@ -902,3 +900,74 @@ def f(g, p):
         already_flattened=already_flattened,
     )
     return GradientTransformation(init_fn, update_fn)
+
+
+class ScaleByRssState(NamedTuple):
+    """State holding the sum of gradient squares to date."""
+
+    sum_of_squares: Updates
+
+
+def scale_by_rss(
+    initial_accumulator_value: float = 0.1,
+    eps: float = 1e-7,
+) -> GradientTransformation:
+    """Rescale updates by the root of the sum of all squared gradients to date.
+
+    References:
+        [Duchi et al, 2011](https://jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+        [McMahan et al., 2010](https://arxiv.org/abs/1002.4908)
+
+    Args:
+        initial_accumulator_value: Starting value for accumulators, must be >= 0.
+        eps: A small floating point value to avoid zero denominator.
+
+    Returns:
+        An (init_fn, update_fn) tuple.
+    """
+    return _scale_by_rss(
+        initial_accumulator_value=initial_accumulator_value,
+        eps=eps,
+        already_flattened=False,
+    )
+
+
+def _scale_by_rss(
+    initial_accumulator_value: float = 0.1,
+    eps: float = 1e-7,
+    *,
+    already_flattened: bool = False,
+) -> GradientTransformation:
+
+    if already_flattened:
+        tree_map = map_flattened
+    else:
+        tree_map = pytree.tree_map
+
+    def init_fn(params):
+        sum_of_squares = tree_map(lambda t: torch.full_like(t, initial_accumulator_value), params)
+        return ScaleByRssState(sum_of_squares=sum_of_squares)
+
+    def update_fn(updates, state, params=None, inplace=True):  # pylint: disable=unused-argument
+        del params
+        sum_of_squares = tree_map(
+            lambda g, t: (g.conj() * g).real + t, updates, state.sum_of_squares
+        )
+        # inv_sqrt_g_square = tree_map(
+        #     lambda t: jnp.where(t > 0, jax.lax.rsqrt(t + eps), 0.0), sum_of_squares
+        # )
+        if inplace:
+
+            def f(t):
+                return t.add_(eps).rsqrt_() if t > 0.0 else 0.0
+
+        else:
+
+            def f(t):
+                return t.add(eps).rsqrt() if t > 0.0 else 0.0
+
+        inv_sqrt_g_square = tree_map(f, sum_of_squares)
+        updates = tree_map(lambda scale, g: scale * g, inv_sqrt_g_square, updates)
+        return updates, ScaleByRssState(sum_of_squares=sum_of_squares)
+
+    return GradientTransformation(init_fn, update_fn)
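As a sanity check on the rescaling rule in update_fn above (accumulate squared gradients, then multiply each gradient by the reciprocal square root of its accumulator plus eps), here is a standalone numeric sketch of the same arithmetic on a single tensor, written element-wise with torch.where instead of the tree machinery; the values are illustrative only:

    import torch

    initial_accumulator_value, eps = 0.1, 1e-7
    grad = torch.tensor([0.5, -2.0])

    # init_fn: every accumulator element starts at initial_accumulator_value.
    sum_of_squares = torch.full_like(grad, initial_accumulator_value)

    # update_fn: accumulate g^2, then scale g by 1 / sqrt(accumulator + eps).
    sum_of_squares = sum_of_squares + (grad.conj() * grad).real
    scale = torch.where(sum_of_squares > 0, (sum_of_squares + eps).rsqrt(), torch.zeros_like(grad))
    update = scale * grad   # approximately tensor([ 0.8452, -0.9877])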

0 commit comments
