
feat: add AdamW optimizer #44


Merged
33 commits merged on Sep 5, 2022
70b8a47
feat(torchopt): init adamw optimizer
Benjamin-eecs Jul 27, 2022
3300142
Merge remote-tracking branch 'upstream/main' into feature/adamw
Benjamin-eecs Aug 4, 2022
253cc2a
fix(torchopt): pass adamw tests
Benjamin-eecs Aug 4, 2022
17d5784
fix: force add adamw.py
Benjamin-eecs Aug 4, 2022
cdc3836
feat: add MetaAdamW test and pass lint
Benjamin-eecs Aug 5, 2022
cc3a3c7
feat: add MetaAdamW test and pass lint
Benjamin-eecs Aug 5, 2022
a071550
fix: pass lint and pass MetaAdamW tests
Benjamin-eecs Aug 5, 2022
89fac53
fix: rewrite MetaOptimizer test, pass MetaAdamW tests with error tol
Benjamin-eecs Aug 5, 2022
b50abe0
merge: resolve conflicts
Benjamin-eecs Aug 24, 2022
47ff9f3
merge: resolve conflicts
Benjamin-eecs Aug 24, 2022
476332e
fix: update adamw low level test
Benjamin-eecs Aug 26, 2022
8175181
merge: resolve conflicts
Benjamin-eecs Sep 1, 2022
bb82209
fix(tests): use new test
Benjamin-eecs Sep 4, 2022
4b01c7e
Merge remote-tracking branch 'upstream/main' into feature/adamw
Benjamin-eecs Sep 4, 2022
d935014
fix: pass lint
Benjamin-eecs Sep 4, 2022
47cfa45
fix: pass test
Benjamin-eecs Sep 4, 2022
9b32e7b
Merge remote-tracking branch 'upstream/main' into feature/adamw
Benjamin-eecs Sep 4, 2022
42ed8a5
fix: pass test
Benjamin-eecs Sep 4, 2022
1e64877
fix: pass test
Benjamin-eecs Sep 4, 2022
872b8d4
fix: update docstring
Benjamin-eecs Sep 4, 2022
824d1c5
fix: update docstring
Benjamin-eecs Sep 4, 2022
e920c74
fix: update docstring
Benjamin-eecs Sep 4, 2022
8ee3c41
fix: correct already_flattened
Benjamin-eecs Sep 4, 2022
0f129c0
fix: correct weight_decay range check
Benjamin-eecs Sep 4, 2022
e75671e
fix: already_flattened of mask
Benjamin-eecs Sep 4, 2022
c791bba
style: format code
XuehaiPan Sep 5, 2022
24690a0
feat: add shortcut
XuehaiPan Sep 5, 2022
fec6f99
chore: reorganize code structure
XuehaiPan Sep 5, 2022
d3ad838
feat: inplace support for AdamW
XuehaiPan Sep 5, 2022
c685954
docs: update docstrings
XuehaiPan Sep 5, 2022
8114286
docs(CHANGELOG): update CHANGELOG.md
XuehaiPan Sep 5, 2022
c075533
docs: update docstrings
XuehaiPan Sep 5, 2022
0f5c90a
docs: update docstrings
XuehaiPan Sep 5, 2022
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

- Implement AdamW optimizer with masking by [@Benjamin-eecs](https://github.com/Benjamin-eecs) and [@XuehaiPan](https://github.com/XuehaiPan) in [#44](https://github.com/metaopt/torchopt/pull/44).
- Add half float support for accelerated OPs by [@XuehaiPan](https://github.com/XuehaiPan) in [#67](https://github.com/metaopt/torchopt/pull/67).
- Add MAML example with TorchRL integration by [@vmoens](https://github.com/vmoens) and [@Benjamin-eecs](https://github.com/Benjamin-eecs) in [#12](https://github.com/metaopt/TorchOpt/pull/12).
- Add optional argument `params` to update function in gradient transformations by [@XuehaiPan](https://github.com/XuehaiPan) in [#65](https://github.com/metaopt/torchopt/pull/65).
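The CHANGELOG entry above mentions masking. Below is a minimal sketch of how the mask option might be used to exclude selected parameters (for example, biases) from weight decay. It assumes the `mask` keyword accepts a pytree of booleans mirroring the parameter structure, as in Optax's `adamw`; the exact contract is in the `adamw` docstring, not this diff.

```python
# Hedged sketch of the masking feature named in the CHANGELOG entry above.
# Assumption: `mask` accepts a pytree of booleans with the same structure as
# the parameters, where True means "apply weight decay to this leaf"
# (Optax-style); the exact contract lives in the adamw docstring.
import torch
import torchopt

weight = torch.randn(4, 3, requires_grad=True)
bias = torch.zeros(3, requires_grad=True)
params = (weight, bias)

# Decay the weight matrix, leave the bias untouched.
optim = torchopt.adamw(lr=1e-3, weight_decay=1e-2, mask=(True, False))
state = optim.init(params)
```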
18 changes: 18 additions & 0 deletions docs/source/api/api.rst
@@ -32,12 +32,18 @@ Functional Optimizers
adam
sgd
rmsprop
adamw

Functional Adam Optimizer
~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: adam

Functional AdamW Optimizer
~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: adamw

Functional SGD Optimizer
~~~~~~~~~~~~~~~~~~~~~~~~

@@ -60,12 +66,18 @@ Classic Optimizers
Adam
SGD
RMSProp
AdamW

Classic Adam Optimizer
~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: Adam

Classic AdamW Optimizer
~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: AdamW

Classic SGD Optimizer
~~~~~~~~~~~~~~~~~~~~~

@@ -88,12 +100,18 @@ Differentiable Meta-Optimizers
MetaAdam
MetaSGD
MetaRMSProp
MetaAdamW

Differentiable Meta-Adam Optimizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: MetaAdam

Differentiable Meta-AdamW Optimizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: MetaAdamW

Differentiable Meta-SGD Optimizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

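For orientation, the three AdamW entry points added to the API reference correspond roughly as follows. This is a sketch with abbreviated argument lists; `net` is a placeholder module, and the assumption that `MetaAdamW` takes the module as its first argument mirrors the other Meta-optimizers rather than anything shown in this diff.

```python
# Rough correspondence of the three documented AdamW interfaces (sketch only;
# argument lists abbreviated, `net` is a placeholder module).
import torch
import torchopt

net = torch.nn.Linear(4, 2)

func = torchopt.adamw(lr=1e-3)                       # functional: init()/update() transformation
classic = torchopt.AdamW(net.parameters(), lr=1e-3)  # torch.optim-style wrapper
meta = torchopt.MetaAdamW(net, lr=1e-3)              # differentiable meta-optimizer
```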
5 changes: 5 additions & 0 deletions docs/source/spelling_wordlist.txt
@@ -73,3 +73,8 @@ CPython
nn
Vincent
Moens
AdamW
Loshchilov
pytree
booleans
subtrees
58 changes: 58 additions & 0 deletions tests/test_alias.py
@@ -154,6 +154,64 @@ def test_adam(
weight_decay=[0.0, 1e-2],
maximize=[False, True],
)
def test_adamw(
dtype: torch.dtype,
lr: float,
betas: Tuple[float, float],
eps: float,
inplace: bool,
weight_decay: float,
maximize: bool,
) -> None:
model, model_ref, model_base, loader = helpers.get_models(device='cpu', dtype=dtype)

fmodel, params, buffers = functorch.make_functional_with_buffers(model)
optim = torchopt.adamw(
lr,
betas=betas,
eps=eps,
eps_root=0.0,
weight_decay=weight_decay,
maximize=maximize,
)
optim_state = optim.init(params)
optim_ref = torch.optim.AdamW(
model_ref.parameters(),
lr,
betas=betas,
eps=eps,
amsgrad=False,
weight_decay=weight_decay,
maximize=maximize,
)

for xs, ys in loader:
xs = xs.to(dtype=dtype)
pred = fmodel(params, buffers, xs)
pred_ref = model_ref(xs)
loss = F.cross_entropy(pred, ys)
loss_ref = F.cross_entropy(pred_ref, ys)

grads = torch.autograd.grad(loss, params)
updates, optim_state = optim.update(grads, optim_state, params=params, inplace=inplace)
params = torchopt.apply_updates(params, updates, inplace=inplace)

optim_ref.zero_grad()
loss_ref.backward()
optim_ref.step()

helpers.assert_model_all_close((params, buffers), model_ref, model_base, dtype=dtype)


@helpers.parametrize(
dtype=[torch.float64],
lr=[1e-2, 1e-3, 1e-4],
betas=[(0.9, 0.999), (0.95, 0.9995)],
eps=[1e-8],
inplace=[True, False],
weight_decay=[1e-2, 1e-1],
maximize=[False, True],
)
def test_adam_accelerated_cpu(
dtype: torch.dtype,
lr: float,
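The functional test above relies on the repository's test helpers. For readers skimming the diff, here is a stripped-down version of the same update cycle with stand-in model and data; only the torchopt calls mirror the test, everything else is illustrative.

```python
# Stripped-down version of the functional update cycle exercised by test_adamw.
# The model and data below are stand-ins; only the torchopt calls mirror the test.
import functorch
import torch
import torch.nn.functional as F
import torchopt

model = torch.nn.Linear(8, 2)
fmodel, params = functorch.make_functional(model)

optim = torchopt.adamw(lr=1e-3, weight_decay=1e-2)
state = optim.init(params)

xs, ys = torch.randn(16, 8), torch.randint(0, 2, (16,))
loss = F.cross_entropy(fmodel(params, xs), ys)

grads = torch.autograd.grad(loss, params)
updates, state = optim.update(grads, state, params=params)
params = torchopt.apply_updates(params, updates)
```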
55 changes: 55 additions & 0 deletions tests/test_optimizer.py
@@ -138,6 +138,61 @@ def test_Adam(
helpers.assert_model_all_close(model, model_ref, model_base, dtype=dtype)


@helpers.parametrize(
dtype=[torch.float64],
lr=[1e-2, 1e-3, 1e-4],
betas=[(0.9, 0.999), (0.95, 0.9995)],
eps=[1e-8],
weight_decay=[1e-2, 1e-1],
maximize=[False, True],
)
def test_AdamW(
dtype: torch.dtype,
lr: float,
betas: Tuple[float, float],
eps: float,
weight_decay: float,
maximize: bool,
) -> None:
model, model_ref, model_base, loader = helpers.get_models(device='cpu', dtype=dtype)

optim = torchopt.AdamW(
model.parameters(),
lr,
betas=betas,
eps=eps,
eps_root=0.0,
weight_decay=weight_decay,
maximize=maximize,
)
optim_ref = torch.optim.AdamW(
model_ref.parameters(),
lr,
betas=betas,
eps=eps,
amsgrad=False,
weight_decay=weight_decay,
maximize=maximize,
)

for xs, ys in loader:
xs = xs.to(dtype=dtype)
pred = model(xs)
pred_ref = model_ref(xs)
loss = F.cross_entropy(pred, ys)
loss_ref = F.cross_entropy(pred_ref, ys)

optim.zero_grad()
loss.backward()
optim.step()

optim_ref.zero_grad()
loss_ref.backward()
optim_ref.step()

helpers.assert_model_all_close(model, model_ref, model_base, dtype=dtype)


@helpers.parametrize(
dtype=[torch.float64],
lr=[1e-2, 1e-3, 1e-4],
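What distinguishes AdamW from Adam-plus-L2, and what the comparison against `torch.optim.AdamW` implicitly checks, is the decoupled weight-decay rule of Loshchilov and Hutter: the decay is applied to the parameters directly rather than folded into the gradient before the moment estimates. A tiny single-step illustration with made-up numbers (not test code) follows; it uses only `torch.optim.AdamW`, which both optimizers in the test are expected to match.

```python
# One-step check of the decoupled rule
#   p <- p * (1 - lr * wd) - lr * m_hat / (sqrt(v_hat) + eps).
# Uses torch.optim.AdamW only; the values are illustrative.
import math
import torch

p = torch.tensor([1.0], requires_grad=True)
lr, wd, b1, b2, eps = 1e-2, 1e-1, 0.9, 0.999, 1e-8

opt = torch.optim.AdamW([p], lr=lr, betas=(b1, b2), eps=eps, weight_decay=wd)
p.grad = torch.tensor([0.5])
opt.step()

g = 0.5
m_hat, v_hat = g, g * g  # bias-corrected moments after the first step
manual = 1.0 * (1 - lr * wd) - lr * m_hat / (math.sqrt(v_hat) + eps)
print(p.item(), manual)  # both should be ~0.989, agreeing to floating-point error
```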
16 changes: 13 additions & 3 deletions torchopt/__init__.py
@@ -15,11 +15,18 @@
"""TorchOpt: a high-performance optimizer library built upon PyTorch."""

from torchopt._src import accelerated_op_available, clip, combine, hook, schedule, visual
from torchopt._src.alias import adam, rmsprop, sgd
from torchopt._src.alias import adam, adamw, rmsprop, sgd
from torchopt._src.clip import clip_grad_norm
from torchopt._src.combine import chain
from torchopt._src.optimizer import SGD, Adam, Optimizer, RMSProp, RMSprop, meta
from torchopt._src.optimizer.meta import MetaAdam, MetaOptimizer, MetaRMSProp, MetaRMSprop, MetaSGD
from torchopt._src.optimizer import SGD, Adam, AdamW, Optimizer, RMSProp, RMSprop, meta
from torchopt._src.optimizer.meta import (
MetaAdam,
MetaAdamW,
MetaOptimizer,
MetaRMSProp,
MetaRMSprop,
MetaSGD,
)
from torchopt._src.update import apply_updates
from torchopt._src.utils import extract_state_dict, recover_state_dict, stop_gradient
from torchopt.version import __version__
@@ -33,18 +40,21 @@
'schedule',
'visual',
'adam',
'adamw',
'rmsprop',
'sgd',
'clip_grad_norm',
'chain',
'Optimizer',
'SGD',
'Adam',
'AdamW',
'RMSProp',
'RMSprop',
'MetaOptimizer',
'MetaSGD',
'MetaAdam',
'MetaAdamW',
'MetaRMSProp',
'MetaRMSprop',
'apply_updates',
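Among the new exports, `MetaAdamW` is the differentiable variant. The sketch below shows an inner-loop step, assuming it follows the same module-first constructor and loss-based `step()` as the existing Meta-optimizers; that signature is an assumption, since it is not part of this diff.

```python
# Sketch of a differentiable inner step with the newly exported MetaAdamW.
# Assumes the MetaOptimizer interface: the constructor takes an nn.Module, and
# step(loss) updates the module's parameters while staying on the autograd graph.
import torch
import torchopt

net = torch.nn.Linear(4, 1)
meta_optim = torchopt.MetaAdamW(net, lr=1e-2)

xs = torch.randn(8, 4)
inner_loss = net(xs).pow(2).mean()
meta_optim.step(inner_loss)  # differentiable parameter update

outer_loss = net(xs).mean()  # can be backpropagated through the inner update
outer_loss.backward()
```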