Commit 6ba8eae

refactor: refactor example with new partitioner
1 parent 1643a30 commit 6ba8eae

File tree: 3 files changed (+213 -53 lines changed)

examples/distributed/few-shot/maml_omniglot.py

Lines changed: 8 additions & 25 deletions
@@ -186,15 +186,18 @@ def partitioner(net_rref, x_spt, y_spt, x_qry, y_qry, n_inner_iter, task_num):
     return partitions


-def reducer(results):
+def transpose_mean_reducer(results):
     qry_losses, qry_accs = tuple(zip(*results))
     qry_loss = torch.mean(torch.stack(qry_losses))
     qry_acc = np.mean(qry_accs)
     return qry_loss, qry_acc


-@todist.parallelize(partitioner=partitioner, reducer=reducer)
-def inner_loop(net_rref, x_spt, y_spt, x_qry, y_qry, n_inner_iter, task_num):
+@todist.parallelize(
+    partitioner=todist.dim_partitioner(dim=0, exclusive=True, keepdim=False),
+    reducer=transpose_mean_reducer,
+)
+def inner_loop(net_rref, x_spt, y_spt, x_qry, y_qry, n_inner_iter):
     if torch.cuda.is_available():
         device = torch.device(f'cuda:{todist.get_local_rank() % torch.cuda.device_count()}')
         torch.cuda.set_device(device)
@@ -235,8 +238,6 @@ def train(db: OmniglotNShot, net: nn.Module, meta_opt: optim.Adam, epoch: int, l
         # Sample a batch of support and query images and labels.
         x_spt, y_spt, x_qry, y_qry = db.next()

-        task_num = x_spt.size(0)
-
         # TODO: Maybe pull this out into a separate module so it
         # doesn't have to be duplicated between `train` and `test`?

@@ -246,15 +247,7 @@ def train(db: OmniglotNShot, net: nn.Module, meta_opt: optim.Adam, epoch: int, l

         meta_opt.zero_grad()
         with todist.autograd.context() as context_id:
-            qry_loss, qry_acc = inner_loop(
-                net_rref,
-                x_spt,
-                y_spt,
-                x_qry,
-                y_qry,
-                n_inner_iter,
-                task_num,
-            )
+            qry_loss, qry_acc = inner_loop(net_rref, x_spt, y_spt, x_qry, y_qry, n_inner_iter)
             todist.autograd.backward(context_id, qry_loss)
         meta_opt.step()

@@ -295,21 +288,11 @@ def test(db, net, epoch, log):
     for _ in range(n_test_iter):
         x_spt, y_spt, x_qry, y_qry = db.next('test')

-        task_num = x_spt.size(0)
-
         # TODO: Maybe pull this out into a separate module so it
         # doesn't have to be duplicated between `train` and `test`?
         n_inner_iter = 5

-        qry_loss, qry_acc = inner_loop(
-            net_rref,
-            x_spt,
-            y_spt,
-            x_qry,
-            y_qry,
-            n_inner_iter,
-            task_num,
-        )
+        qry_loss, qry_acc = inner_loop(net_rref, x_spt, y_spt, x_qry, y_qry, n_inner_iter)
         qry_losses.append(qry_loss.item())
         qry_accs.append(qry_acc)
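
The refactor above replaces the example's hand-written partitioner and the explicit task_num plumbing with the library's dim_partitioner. For orientation, here is a minimal, self-contained sketch (not part of the commit; the shapes are made-up placeholders) of what dim_partitioner(dim=0, exclusive=True, keepdim=False) does to the task batch and how transpose_mean_reducer recombines the per-task results:

import numpy as np
import torch

# Hypothetical Omniglot-like shapes, for illustration only.
task_num, n_way, k_shot = 4, 5, 1
x_spt = torch.randn(task_num, n_way * k_shot, 1, 28, 28)
y_spt = torch.randint(n_way, (task_num, n_way * k_shot))

# exclusive=True, keepdim=False: one partition per task, selected by index,
# so each remote call sees tensors without the leading task dimension.
partitions = [(x_spt[i], y_spt[i]) for i in range(task_num)]
assert partitions[0][0].shape == (n_way * k_shot, 1, 28, 28)

# Suppose each worker returns (qry_loss, qry_acc) for its task ...
results = [(torch.tensor(0.5 + 0.1 * i), 0.80 + 0.01 * i) for i in range(task_num)]

# ... then transpose_mean_reducer transposes the list of pairs and averages
# each field, exactly as in the example code above.
qry_losses, qry_accs = tuple(zip(*results))
qry_loss = torch.mean(torch.stack(qry_losses))
qry_acc = np.mean(qry_accs)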

torchopt/distributed/api.py

Lines changed: 195 additions & 27 deletions
@@ -33,12 +33,17 @@
 import torch
 import torch.distributed.rpc as rpc

-from torchopt.distributed.world import get_world_size
-from torchopt.typing import Future
+import torchopt.pytree as pytree
+from torchopt.distributed.world import get_worker_id, get_world_rank, get_world_size
+from torchopt.typing import Future, PyTree


 __all__ = [
-    'default_partitioner',
+    'DimPartitioner',
+    'dim_partitioner',
+    'batch_partitioner',
+    'mean_reducer',
+    'sum_reducer',
     'remote_async_call',
     'remote_sync_call',
     'parallelize',
@@ -47,32 +52,197 @@
 ]


+UNSET_RPC_TIMEOUT = rpc.api.UNSET_RPC_TIMEOUT
+
+
 T = TypeVar('T')
 U = TypeVar('U')
 Args = Tuple[Any, ...]
 KwArgs = Dict[str, Any]
-Partitioner = Union[int, str, Callable[..., Sequence[Tuple[int, Optional[Args], Optional[KwArgs]]]]]
+PartitionFunction = Callable[..., Sequence[Tuple[int, Optional[Args], Optional[KwArgs]]]]
+Partitioner = Union[int, str, PartitionFunction]


-def default_partitioner(
-    *args: Any,
-    **kwargs: Any,
-) -> List[Tuple[int, Optional[Args], Optional[KwArgs]]]:
-    """Default partitioner.
+class DimPartitioner:
+    """Partitioner class that partitions a batch of inputs along a given dimension.

-    Replicates the arguments to all workers.
+    Args:
+        dim: The dimension to partition.
+        exclusive: Whether to partition the batch exclusively.
+            If ``exclusive=True``, the batch will be partitioned into ``batch_size`` partitions,
+            where ``batch_size`` is the size of the batch along the given dimension.
+            If ``exclusive=False``, the batch will be partitioned into
+            ``min(batch_size, num_workers)`` partitions, where ``num_workers`` is the number of
+            workers in the world.
+        keepdim: Whether to keep the partitioned dimension. Defaults to :data:`True`, i.e., keep the
+            batch dimension. If :data:`False`, use select instead of slicing. This functionality
+            should be used with ``exclusive=True``.
+        workers: The workers to partition the batch to. If :data:`None`, the batch will be
+            partitioned to all workers in the world.
     """
-    return [(rank, args, kwargs.copy()) for rank in range(get_world_size())]
+
+    def __init__(
+        self,
+        dim: int,
+        *,
+        exclusive: bool = False,
+        keepdim: bool = False,
+        workers: Optional[Sequence[Union[int, str]]] = None,
+    ) -> None:
+        if not keepdim and not exclusive:
+            raise ValueError('keepdim=False should be used with exclusive=True.')
+
+        self.dim = dim
+        self.exclusive = exclusive
+        self.keepdim = keepdim
+        self.workers = workers
+
+    # pylint: disable-next=too-many-branches,too-many-locals
+    def __call__(
+        self,
+        *args: Any,
+        **kwargs: Any,
+    ) -> List[Tuple[int, Optional[Args], Optional[KwArgs]]]:
+        if self.workers is None:
+            workers = list(range(get_world_size()))
+        else:
+            workers = self.workers
+        workers: List[int] = list(map(get_worker_id, workers))
+        num_workers = len(workers)
+
+        args_tree: PyTree[Any] = (args, kwargs)
+        flattened_args, treedef = pytree.tree_flatten(args_tree)
+
+        batch_size = None
+        for arg in flattened_args:
+            if isinstance(arg, torch.Tensor):
+                if batch_size is None:
+                    batch_size = arg.shape[self.dim]
+                elif batch_size != arg.shape[self.dim]:
+                    raise ValueError(
+                        f'Batch size mismatch on dim={self.dim}. '
+                        f'Expected {batch_size}, got {arg.shape[self.dim]} (shape: {arg.shape}).'
+                    )
+
+        if batch_size is None:
+            return [(get_world_rank(), args, kwargs.copy())]
+
+        if self.exclusive:
+            num_replicas = batch_size
+            if self.keepdim:
+                batch_slices = [slice(i, i + 1) for i in range(num_replicas)]
+            else:
+                batch_slices = list(range(num_replicas))
+        else:
+            if batch_size <= num_workers:
+                num_replicas = batch_size
+                batch_slices = [slice(i, i + 1) for i in range(batch_size)]  # keepdim=True
+            else:
+                num_replicas = num_workers
+                local_size = batch_size // num_workers
+                local_batch_indices = [i * local_size for i in range(num_workers)] + [batch_size]
+                batch_slices = [
+                    slice(local_batch_indices[i], local_batch_indices[i + 1])
+                    for i in range(num_workers)
+                ]
+
+        if self.dim >= 0:
+            batch_slices = [(slice(),) * self.dim + (batch_slice,) for batch_slice in batch_slices]
+        elif self.dim < 0:
+            batch_slices = [
+                (
+                    ...,
+                    batch_slice,
+                )
+                + (slice(),) * (-self.dim - 1)
+                for batch_slice in batch_slices
+            ]
+
+        flattened_args_replicas = [[] for _ in range(num_replicas)]
+        for arg in flattened_args:
+            if isinstance(arg, torch.Tensor):
+                for i, batch_slice in enumerate(batch_slices):
+                    flattened_args_replicas[i].append(arg[batch_slice])
+            else:
+                for i in range(num_replicas):
+                    flattened_args_replicas[i].append(arg)
+
+        args_replicas = [
+            pytree.tree_unflatten(treedef, args_replica) for args_replica in flattened_args_replicas
+        ]
+
+        return [
+            (workers[i % num_workers], worker_args, worker_kwargs)
+            for i, (worker_args, worker_kwargs) in enumerate(args_replicas)
+        ]
+
+    def __reduce__(
+        self,
+    ) -> Tuple[
+        Callable[..., 'DimPartitioner'],
+        Tuple[int],
+        Dict[str, Union[bool, Optional[Sequence[Union[int, str]]]]],
+    ]:
+        return (
+            DimPartitioner,
+            (self.dim,),
+            dict(exclusive=self.exclusive, keepdim=self.keepdim, workers=self.workers),
+        )
+
+
+def dim_partitioner(
+    dim: int = 0,
+    *,
+    exclusive: bool = False,
+    keepdim: bool = True,
+    workers: Optional[Sequence[Union[int, str]]] = None,
+) -> PartitionFunction:
+    """Partition a batch of inputs along a given dimension.
+
+    Args:
+        dim: The dimension to partition.
+        exclusive: Whether to partition the batch exclusively.
+            If ``exclusive=True``, the batch will be partitioned into ``batch_size`` partitions,
+            where ``batch_size`` is the size of the batch along the given dimension.
+            If ``exclusive=False``, the batch will be partitioned into
+            ``min(batch_size, num_workers)`` partitions, where ``num_workers`` is the number of
+            workers in the world.
+        keepdim: Whether to keep the partitioned dimension. Defaults to :data:`True`, i.e., keep the
+            batch dimension. If :data:`False`, use select instead of slicing. This functionality
+            should be used with ``exclusive=True``.
+        workers: The workers to partition the batch to. If :data:`None`, the batch will be
+            partitioned to all workers in the world.
+
+    Returns:
+        A partition function.
+    """
+    return DimPartitioner(dim, exclusive=exclusive, keepdim=keepdim, workers=workers)
+
+
+# pylint: disable=line-too-long
+batch_partitioner: PartitionFunction = dim_partitioner(dim=0, keepdim=True, exclusive=False)
+"""Partitioner for batch dimension. Divide and replicates the arguments to all workers along the first dimension."""
+# pylint: enable=line-too-long
+
+
+def mean_reducer(results: Iterable[torch.Tensor]) -> torch.Tensor:
+    """Reduce the results by averaging them."""
+    return torch.mean(torch.stack(tuple(results), dim=0), dim=0)
+
+
+def sum_reducer(results: Iterable[torch.Tensor]) -> torch.Tensor:
+    """Reduce the results by summing them."""
+    return torch.sum(torch.stack(tuple(results), dim=0), dim=0)


 def remote_async_call(
     func: Callable[..., T],
     *,
     args: Optional[Args] = None,
     kwargs: Optional[KwArgs] = None,
-    partitioner: Partitioner = default_partitioner,
+    partitioner: Partitioner = batch_partitioner,
     reducer: Optional[Callable[[Iterable[T]], U]] = None,
-    timeout: Optional[float] = rpc.api.UNSET_RPC_TIMEOUT,
+    timeout: Optional[float] = UNSET_RPC_TIMEOUT,
 ) -> Union[Future[List[T]], Future[U]]:
     # pylint: disable=line-too-long
     """Asynchronously do an RPC on remote workers and return the a :class:`torch.Future` instance at the current worker.
@@ -84,7 +254,7 @@ def remote_async_call(
         kwargs (Optional[KwArgs], optional): The keyword arguments to pass to the function. Defaults
             to :data:`None`.
         partitioner (Partitioner, optional): A partitioner that partitions the arguments to multiple
-            workers. Defaults to :func:`default_partitioner`.
+            workers. Defaults to :func:`batch_partitioner`.
         reducer (Callable[[Iterable[T]], U], optional): A reducer that reduces the results from
             multiple workers. Defaults to :data:`None`.
         timeout (float, optional): The timeout for the RPC call. Defaults to
@@ -97,10 +267,8 @@
         args = ()
     if kwargs is None:
         kwargs = {}
-    if isinstance(partitioner, int):
-        partitions = [(partitioner, args, kwargs)]
-    elif isinstance(partitioner, str):
-        partitions = [(rpc.get_worker_info(worker_name=partitioner).id, args, kwargs)]
+    if isinstance(partitioner, (int, str)):
+        partitions = [(get_worker_id(id=partitioner), args, kwargs)]
     elif callable(partitioner):
         partitions = partitioner(*args, **kwargs)  # type: ignore[assignment]
     else:
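
With the unified branch above, passing an int rank or a str worker name as the partitioner routes the whole, unpartitioned call to that single worker via get_worker_id. A hedged usage sketch (the worker name and the summing function are made-up, remote_sync_call is assumed to be re-exported under torchopt.distributed like the other api members, and an RPC worker group is assumed to be initialized already):

import torchopt.distributed as todist

def remote_sum(values):
    return sum(values)

# Either form targets exactly one worker; without a reducer the gathered
# results come back as a list.
results_by_rank = todist.remote_sync_call(remote_sum, args=([1, 2, 3],), partitioner=1)
results_by_name = todist.remote_sync_call(remote_sum, args=([1, 2, 3],), partitioner='worker1')
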
@@ -128,9 +296,9 @@ def remote_sync_call(
     *,
     args: Optional[Args] = None,
     kwargs: Optional[KwArgs] = None,
-    partitioner: Partitioner = default_partitioner,
+    partitioner: Partitioner = batch_partitioner,
     reducer: Optional[Callable[[Iterable[T]], U]] = None,
-    timeout: Optional[float] = rpc.api.UNSET_RPC_TIMEOUT,
+    timeout: Optional[float] = UNSET_RPC_TIMEOUT,
 ) -> Union[List[T], U]:
     """Synchronously do an RPC on remote workers and return the result to the current worker.

@@ -141,7 +309,7 @@ def remote_sync_call(
         kwargs (Optional[KwArgs], optional): The keyword arguments to pass to the function. Defaults
             to :data:`None`.
         partitioner (Partitioner, optional): A partitioner that partitions the arguments to multiple
-            workers. Defaults to :func:`default_partitioner`.
+            workers. Defaults to :func:`batch_partitioner`.
         reducer (Callable[[Iterable[T]], U], optional): A reducer that reduces the results from
             multiple workers. Defaults to :data:`None`.
         timeout (float, optional): The timeout for the RPC call. Defaults to
@@ -161,9 +329,9 @@

 
 def parallelize_async(
-    partitioner: Partitioner = default_partitioner,
+    partitioner: Partitioner = batch_partitioner,
     reducer: Optional[Callable[[Iterable[T]], U]] = None,
-    timeout: Optional[float] = rpc.api.UNSET_RPC_TIMEOUT,
+    timeout: Optional[float] = UNSET_RPC_TIMEOUT,
 ) -> Callable[[Callable[..., T]], Callable[..., Union[Future[List[T]], Future[U]]]]:
     """Decorator for parallelizing a function.

@@ -173,7 +341,7 @@ def parallelize_async(

     Args:
         partitioner (Partitioner, optional): A partitioner that partitions the arguments to multiple
-            workers. Defaults to :func:`default_partitioner`.
+            workers. Defaults to :func:`batch_partitioner`.
         reducer (Callable[[Iterable[T]], U], optional): A reducer that reduces the results from
             multiple workers. Defaults to :data:`None`.
         timeout (float, optional): The timeout for the RPC call. Defaults to
@@ -214,17 +382,17 @@ def wrapped(*args: Any, **kwargs: Any) -> Union[Future[List[T]], Future[U]]:


 def parallelize(
-    partitioner: Partitioner = default_partitioner,
+    partitioner: Partitioner = batch_partitioner,
     reducer: Optional[Callable[[Iterable[T]], U]] = None,
-    timeout: Optional[float] = rpc.api.UNSET_RPC_TIMEOUT,
+    timeout: Optional[float] = UNSET_RPC_TIMEOUT,
 ) -> Callable[[Callable[..., T]], Callable[..., Union[List[T], U]]]:
     """Decorator for parallelizing a function.

     This decorator can be used to parallelize a function call across multiple workers.

     Args:
         partitioner (Partitioner, optional): A partitioner that partitions the arguments to multiple
-            workers. Defaults to :func:`default_partitioner`.
+            workers. Defaults to :func:`batch_partitioner`.
         reducer (Callable[[Iterable[T]], U], optional): A reducer that reduces the results from
             multiple workers. Defaults to :data:`None`.
         timeout (float, optional): The timeout for the RPC call. Defaults to
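
Putting the new pieces together, a hedged end-to-end sketch of the decorator with the new defaults (not taken from the repository; it assumes batch_partitioner and mean_reducer are re-exported under torchopt.distributed like dim_partitioner is in the example above, that the loss function and tensors are placeholders, and that an RPC worker group has been initialized):

import torch
import torch.nn.functional as F
import torchopt.distributed as todist


@todist.parallelize(
    partitioner=todist.batch_partitioner,  # split tensor args along dim 0 across workers
    reducer=todist.mean_reducer,           # average the per-worker scalar losses
)
def mse_loss(inputs, targets):
    return F.mse_loss(inputs, targets)


# Each worker computes the loss on its shard of the batch; mean_reducer then
# averages the returned tensors on the caller:
# loss = mse_loss(torch.randn(64, 10), torch.randn(64, 10))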

0 commit comments
