
Commit 2892929

feat(torchopt): adagrad optimizer support
Co-authored-by: Benjamin-eecs <benjaminliu.eecs@gmail.com>
1 parent 89e7912 commit 2892929

6 files changed (+327 additions, −2 deletions)

torchopt/alias/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -31,10 +31,11 @@
 # ==============================================================================
 r"""The aliases of preset :class:`GradientTransformation`\s for optimizers."""

+from torchopt.alias.adagrad import adagrad
 from torchopt.alias.adam import adam
 from torchopt.alias.adamw import adamw
 from torchopt.alias.rmsprop import rmsprop
 from torchopt.alias.sgd import sgd


-__all__ = ['adam', 'adamw', 'rmsprop', 'sgd']
+__all__ = ['adagrad', 'adam', 'adamw', 'rmsprop', 'sgd']

torchopt/alias/adagrad.py

Lines changed: 102 additions & 0 deletions (new file)

@@ -0,0 +1,102 @@
# Copyright 2022 MetaOPT Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# This file is modified from:
# https://github.com/deepmind/optax/blob/master/optax/_src/alias.py
# ==============================================================================
# Copyright 2019 DeepMind Technologies Limited. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Preset :class:`GradientTransformation` for the AdaGrad optimizer."""

from torchopt.alias.utils import flip_sign_and_add_weight_decay, scale_by_neg_lr
from torchopt.combine import chain_flat
from torchopt.transform import scale_by_rss
from torchopt.typing import GradientTransformation, ScalarOrSchedule


__all__ = ['adagrad']


# pylint: disable-next=too-many-arguments
def adagrad(
    lr: ScalarOrSchedule = 1e-2,
    lr_decay: float = 0.0,
    weight_decay: float = 0.0,
    initial_accumulator_value: float = 0.0,
    eps: float = 1e-10,
    *,
    maximize: bool = False,
) -> GradientTransformation:
    """The functional AdaGrad optimizer.

    AdaGrad is a gradient-based optimization algorithm that anneals the learning rate of each
    parameter over the course of training.

    WARNING: AdaGrad's main limitation is the monotonic accumulation of squared gradients in the
    denominator: since every term is non-negative, the sum keeps growing during training, and the
    effective learning rate eventually becomes vanishingly small.

    References:
        Duchi et al., 2011: https://jmlr.org/papers/v12/duchi11a.html

    Args:
        lr: (default: :const:`1e-2`)
            This is a fixed global scaling factor.
        lr_decay: (default: :const:`0.0`)
            Learning rate decay.
        weight_decay: (default: :const:`0.0`)
            Weight decay, add L2 penalty to parameters.
        initial_accumulator_value: (default: :const:`0.0`)
            Initial value for the accumulator.
        eps: (default: :const:`1e-10`)
            A small constant added to the accumulated sum of squared gradients to avoid dividing
            by zero when rescaling.
        maximize: (default: :data:`False`)
            Maximize the params based on the objective, instead of minimizing.

    Returns:
        The corresponding :class:`GradientTransformation` instance.

    See Also:
        The functional optimizer wrapper :class:`torchopt.FuncOptimizer`.
    """
    # pylint: disable=unneeded-not
    if not (callable(lr) or 0.0 <= lr):
        raise ValueError(f'Invalid learning rate: {lr}')
    if not 0.0 <= eps:
        raise ValueError(f'Invalid epsilon value: {eps}')
    if not 0.0 <= lr_decay:
        raise ValueError(f'Invalid lr_decay value: {lr_decay}')
    if not 0.0 <= weight_decay:
        raise ValueError(f'Invalid weight_decay value: {weight_decay}')
    # pylint: enable=unneeded-not

    return chain_flat(
        flip_sign_and_add_weight_decay(weight_decay=weight_decay, maximize=maximize),
        scale_by_rss.flat(initial_accumulator_value=initial_accumulator_value, eps=eps),  # type: ignore[attr-defined]
        scale_by_neg_lr(lr),
    )
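
For context on how this alias is consumed, here is a minimal, hypothetical sketch that drives the functional adagrad transformation through its init/update pair on two toy scalar parameters. The parameter values, loss, and learning rate are placeholders, and the calling convention assumed here is the same GradientTransformation protocol used by the existing torchopt aliases (updates, state = optimizer.update(grads, state, params=params, inplace=False)).

import torch

from torchopt.alias import adagrad

# Two scalar parameters and a toy quadratic loss, purely for illustration.
params = (torch.tensor(1.5, requires_grad=True), torch.tensor(-2.0, requires_grad=True))

optimizer = adagrad(lr=0.1)          # a GradientTransformation, i.e. an (init, update) pair
opt_state = optimizer.init(params)   # holds the AdaGrad accumulator (sum of squared gradients)

for _ in range(10):
    loss = sum(p**2 for p in params)
    grads = torch.autograd.grad(loss, params)

    # Turn raw gradients into AdaGrad updates; they already carry the -lr factor,
    # so applying them is a plain addition.
    updates, opt_state = optimizer.update(grads, opt_state, params=params, inplace=False)
    params = tuple((p + u).detach().requires_grad_() for p, u in zip(params, updates))

print([p.item() for p in params])    # both parameters move towards 0

The same alias can also be wrapped by torchopt's object-oriented and functional optimizer interfaces; this sketch only shows the raw transformation contract that the new file implements.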

torchopt/schedule/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -31,7 +31,8 @@
 # ==============================================================================
 """Learning rate schedules."""

+from torchopt.schedule.exponential_decay import exponential_decay
 from torchopt.schedule.polynomial import linear_schedule, polynomial_schedule


-__all__ = ['polynomial_schedule', 'linear_schedule']
+__all__ = ['exponential_decay', 'polynomial_schedule', 'linear_schedule']

torchopt/schedule/exponential_decay.py

Lines changed: 92 additions & 0 deletions (new file)

@@ -0,0 +1,92 @@
# Copyright 2022 MetaOPT Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# This file is modified from:
# https://github.com/deepmind/optax/blob/master/optax/_src/schedule.py
# ==============================================================================
# Copyright 2019 DeepMind Technologies Limited. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Exponential learning rate decay."""

import logging
from typing import Optional

from torchopt.typing import Numeric, Scalar, Schedule


__all__ = ['exponential_decay']


def exponential_decay(
    init_value: Scalar,
    decay_rate: Scalar,
    transition_begin: int = 0,
    transition_steps: Optional[int] = None,
    end_value: Optional[float] = None,
) -> Schedule:
    """Construct a schedule with either continuous or discrete exponential decay.

    Args:
        init_value: Initial value of the scalar to be scheduled.
        decay_rate: The decay rate applied once decay begins.
        transition_begin: Number of steps to wait before starting to decay.
        transition_steps: If set to a non-positive value, the schedule is constant at
            ``init_value``.
        end_value: If set, the decayed value is clipped so that it never passes ``end_value``.

    Returns:
        schedule: A function that maps step counts to values.
    """
    if transition_steps is not None and transition_steps <= 0:
        logging.info(
            'An exponential decay schedule was set with a non-positive `transition_steps`'
            ' value; this will result in a constant schedule with value '
            '`init_value`.'
        )
        return lambda count: init_value

    if decay_rate == 0:
        logging.info(
            'An exponential decay schedule was set with a zero `decay_rate` value; '
            'this will result in a constant schedule with value `init_value`.'
        )
        return lambda count: init_value

    if transition_begin < 0:
        logging.info(
            'An exponential decay schedule was set with a negative `transition_begin` '
            'value; this will result in `transition_begin` falling back to `0`.'
        )
        transition_begin = 0

    if end_value is not None:
        clip_fn = max if decay_rate < 1.0 else min

    def schedule(count: Numeric) -> Numeric:
        decreased_count = count - transition_begin
        decayed_value = (
            init_value / (1 + (decreased_count - 1) * decay_rate)
            if decreased_count > 0
            else init_value
        )
        if end_value is not None:
            decayed_value = clip_fn(decayed_value, end_value)
        return decayed_value

    return schedule
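
To illustrate the new schedule as a plain step-to-value function, here is a small, hypothetical sketch; the constants are arbitrary, and the expected values follow directly from the formula in the code above.

from torchopt.schedule import exponential_decay

# Once decay begins (count > transition_begin), the value follows
# init_value / (1 + (count - transition_begin - 1) * decay_rate),
# optionally clipped so it never passes end_value.
schedule = exponential_decay(
    init_value=1e-2,
    decay_rate=0.1,
    transition_begin=2,
    end_value=1e-4,
)

for step in range(6):
    print(step, schedule(step))
# Steps 0-3 print 0.01; from step 4 the value shrinks towards (but never below) end_value.

Since adagrad accepts a ScalarOrSchedule for lr, a schedule like this could presumably be passed directly as the learning rate as well.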

torchopt/transform/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -35,6 +35,7 @@
 from torchopt.transform.scale import scale
 from torchopt.transform.scale_by_adam import scale_by_accelerated_adam, scale_by_adam
 from torchopt.transform.scale_by_rms import scale_by_rms
+from torchopt.transform.scale_by_rss import scale_by_rss
 from torchopt.transform.scale_by_schedule import scale_by_schedule
 from torchopt.transform.scale_by_stddev import scale_by_stddev
 from torchopt.transform.trace import trace
@@ -47,6 +48,7 @@
     'add_decayed_weights',
     'scale_by_adam',
     'scale_by_accelerated_adam',
+    'scale_by_rss',
     'scale_by_rms',
     'scale_by_stddev',
 ]

torchopt/transform/scale_by_rss.py

Lines changed: 127 additions & 0 deletions (new file)

@@ -0,0 +1,127 @@
# Copyright 2022 MetaOPT Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# This file is modified from:
# https://github.com/deepmind/optax/blob/master/optax/_src/transform.py
# ==============================================================================
# Copyright 2019 DeepMind Technologies Limited. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Preset transformations for scaling updates by the root of the sum of all squared gradients."""

from typing import NamedTuple

import torch

from torchopt import pytree
from torchopt.base import GradientTransformation
from torchopt.transform.utils import tree_map_flat
from torchopt.typing import Updates


__all__ = ['scale_by_rss']


class ScaleByRssState(NamedTuple):
    """State holding the sum of gradient squares to date."""

    sum_of_squares: Updates


def scale_by_rss(
    initial_accumulator_value: float = 0.1,
    eps: float = 1e-7,
) -> GradientTransformation:
    """Rescale updates by the root of the sum of all squared gradients to date.

    References:
        [Duchi et al, 2011](https://jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
        [McMahan et al., 2010](https://arxiv.org/abs/1002.4908)

    Args:
        initial_accumulator_value: Starting value for accumulators, must be >= 0.
        eps: A small floating point value to avoid zero denominator.

    Returns:
        An (init_fn, update_fn) tuple.
    """
    return _scale_by_rss(
        initial_accumulator_value=initial_accumulator_value,
        eps=eps,
        already_flattened=False,
    )


def _scale_by_rss_flat(
    initial_accumulator_value: float = 0.1,
    eps: float = 1e-7,
) -> GradientTransformation:
    return _scale_by_rss(
        initial_accumulator_value=initial_accumulator_value,
        eps=eps,
        already_flattened=True,
    )


def _scale_by_rss(
    initial_accumulator_value: float = 0.1,
    eps: float = 1e-7,
    *,
    already_flattened: bool = False,
) -> GradientTransformation:
    if already_flattened:
        tree_map = tree_map_flat
    else:
        tree_map = pytree.tree_map

    def init_fn(params):
        sum_of_squares = tree_map(lambda t: torch.full_like(t, initial_accumulator_value), params)
        return ScaleByRssState(sum_of_squares=sum_of_squares)

    def update_fn(updates, state, params=None, inplace=True):  # pylint: disable=unused-argument
        sum_of_squares = tree_map(
            lambda g, t: (g.conj() * g).real + t, updates, state.sum_of_squares
        )

        if inplace:

            def f(t):
                return t.add_(eps).rsqrt_() if t > 0.0 else 0.0

        else:

            def f(t):
                return t.add(eps).rsqrt() if t > 0.0 else 0.0

        inv_sqrt_g_square = tree_map(f, sum_of_squares)
        updates = tree_map(lambda scale, g: scale * g, inv_sqrt_g_square, updates)
        return updates, ScaleByRssState(sum_of_squares=sum_of_squares)

    return GradientTransformation(init_fn, update_fn)


scale_by_rss.flat = _scale_by_rss_flat  # type: ignore[attr-defined]
scale_by_rss.impl = _scale_by_rss  # type: ignore[attr-defined]
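
To make the transform's bookkeeping concrete, here is a minimal, hypothetical sketch of a single scale_by_rss step on one scalar (0-dimensional) parameter; the gradient value and accumulator settings are arbitrary and chosen so the arithmetic is easy to check by hand.

import torch

from torchopt.transform import scale_by_rss

transform = scale_by_rss(initial_accumulator_value=0.0, eps=1e-10)

# A single scalar parameter keeps the example minimal.
params = (torch.tensor(0.5),)
state = transform.init(params)     # ScaleByRssState(sum_of_squares=(tensor(0.),))

grads = (torch.tensor(3.0),)
updates, state = transform.update(grads, state, params=params, inplace=False)

# After one step the accumulator equals g**2 = 9, so the rescaled update is
# g / sqrt(9 + eps), i.e. approximately 1.0.
print(updates[0].item())               # ~1.0
print(state.sum_of_squares[0].item())  # 9.0

In the adagrad alias above, this transform is chained between the weight-decay/sign step and scale_by_neg_lr, so these rescaled values are what ultimately get multiplied by the (negative) learning rate.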
