From 6e2f9c3dcf619f133f9fe52662e7c745c77e1fd0 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Thu, 3 Jul 2025 09:47:44 -0700 Subject: [PATCH 01/29] New whatsnew section (#10496) --- doc/whats-new.rst | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 667bd3e3d98..ad83cfac531 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -5,6 +5,35 @@ What's New ========== +.. _whats-new.2025.07.1: + +v2025.07.1 (unreleased) +----------------------- + +New Features +~~~~~~~~~~~~ + + +Breaking changes +~~~~~~~~~~~~~~~~ + + +Deprecations +~~~~~~~~~~~~ + + +Bug fixes +~~~~~~~~~ + + +Documentation +~~~~~~~~~~~~~ + + +Internal Changes +~~~~~~~~~~~~~~~~ + + .. _whats-new.2025.07.0: v2025.07.0 (Jul 3, 2025) From f33ee6c34c2ff88333284ec7df91c87d850f5b7f Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Thu, 3 Jul 2025 11:45:48 -0700 Subject: [PATCH 02/29] Update HOW_TO_RELEASE.md to reflect inability to push to main (#10497) Addresses the comment in https://github.com/pydata/xarray/pull/10496 --- HOW_TO_RELEASE.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/HOW_TO_RELEASE.md b/HOW_TO_RELEASE.md index e775d63871d..15be8c5d0f4 100644 --- a/HOW_TO_RELEASE.md +++ b/HOW_TO_RELEASE.md @@ -52,6 +52,7 @@ upstream https://github.com/pydata/xarray (push) 6. After merging, again ensure your main branch is synced to upstream: ```sh + git switch main git pull upstream main ``` 7. If you have any doubts, run the full test suite one final time! @@ -98,14 +99,15 @@ upstream https://github.com/pydata/xarray (push) ``` -12. Commit your changes and push to main again: +12. Make a PR with these changes and merge it: ```sh - git commit -am 'New whatsnew section' - git push upstream main + git checkout -b empty-whatsnew-YYYY.MM.X+1 + git commit -am "empty whatsnew" + git push ``` - You're done pushing to main! + (Note that repo branch restrictions prevent pushing to `main`, so you have to just-self-merge this.) 13. Update the version available on pyodide: From a8de733b0ab95b3ec99515b1ee99c66fec7d2a6a Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Fri, 4 Jul 2025 10:34:27 +0200 Subject: [PATCH 03/29] Enforce ruff/flake8-simplify rules (SIM) (#10480) * Apply ruff/flake8-simplify rule SIM102 SIM102 Use a single `if` statement instead of nested `if` statements * Apply ruff/flake8-simplify rule SIM113 SIM113 Use `enumerate()` for index variable in `for` loop * Apply ruff/flake8-simplify rule SIM114 SIM114 Combine `if` branches using logical `or` operator * Apply ruff/flake8-simplify rule SIM201 SIM201 Use `... != ...` instead of `not ... 
== ..."` * Apply ruff/flake8-simplify rule SIM905 SIM905 Consider using a list literal instead of `str.split` * Apply ruff/flake8-simplify rule SIM910 SIM910 Use `.get(...)` instead of `.get(..., None)` * Enforce ruff/flake8-simplify rules (SIM) * Move comment near the related code Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> * Ignore SIM102 as well SIM102 Use a single `if` statement instead of nested `if` statements * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- pyproject.toml | 6 ++++++ xarray/core/formatting.py | 14 ++++++++------ xarray/core/indexes.py | 2 +- xarray/core/resample_cftime.py | 26 ++++++++++---------------- xarray/groupers.py | 14 ++++++-------- xarray/plot/facetgrid.py | 2 +- xarray/plot/utils.py | 7 ++++--- xarray/structure/merge.py | 2 +- xarray/tests/test_coding.py | 4 ++-- xarray/tests/test_dask.py | 2 +- xarray/tests/test_groupby.py | 2 +- xarray/tests/test_units.py | 4 +--- xarray/util/print_versions.py | 2 +- 13 files changed, 43 insertions(+), 44 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8cfbb6851b3..5e5fd00328b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -262,6 +262,7 @@ extend-select = [ "PIE", # flake8-pie "TID", # flake8-tidy-imports (absolute imports) "PYI", # flake8-pyi + "SIM", # flake8-simplify "FLY", # flynt "I", # isort "PERF", # Perflint @@ -283,6 +284,11 @@ ignore = [ "PIE790", # unnecessary pass statement "PYI019", # use `Self` instead of custom TypeVar "PYI041", # use `float` instead of `int | float` + "SIM102", # use a single `if` statement instead of nested `if` statements + "SIM108", # use ternary operator instead of `if`-`else`-block + "SIM117", # use a single `with` statement instead of nested `with` statements + "SIM118", # use `key in dict` instead of `key in dict.keys()` + "SIM300", # yoda condition detected "PERF203", # try-except within a loop incurs performance overhead "E402", # module level import not at top of file "E731", # do not assign a lambda expression, use a def diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 14e70b0550c..94ff1e65ac0 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -989,9 +989,10 @@ def diff_array_repr(a, b, compat): ): summary.append(coords_diff) - if compat == "identical": - if attrs_diff := diff_attrs_repr(a.attrs, b.attrs, compat): - summary.append(attrs_diff) + if compat == "identical" and ( + attrs_diff := diff_attrs_repr(a.attrs, b.attrs, compat) + ): + summary.append(attrs_diff) return "\n".join(summary) @@ -1029,9 +1030,10 @@ def diff_dataset_repr(a, b, compat): ): summary.append(data_diff) - if compat == "identical": - if attrs_diff := diff_attrs_repr(a.attrs, b.attrs, compat): - summary.append(attrs_diff) + if compat == "identical" and ( + attrs_diff := diff_attrs_repr(a.attrs, b.attrs, compat) + ): + summary.append(attrs_diff) return "\n".join(summary) diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 1756fb54c1b..847c3a4edb9 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -1247,7 +1247,7 @@ def create_variables( level = name dtype = self.level_coords_dtype[name] # type: ignore[index] # TODO: are Hashables ok? 
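
             # reuse the attrs and encoding of the input variable for this level, if any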
- var = variables.get(name, None) + var = variables.get(name) if var is not None: attrs = var.attrs encoding = var.encoding diff --git a/xarray/core/resample_cftime.py b/xarray/core/resample_cftime.py index 210cea2c76a..9b636f6fc81 100644 --- a/xarray/core/resample_cftime.py +++ b/xarray/core/resample_cftime.py @@ -84,22 +84,16 @@ def __init__( self.freq = to_offset(freq) self.origin = origin - if isinstance(self.freq, MonthEnd | QuarterEnd | YearEnd): - if closed is None: - self.closed = "right" - else: - self.closed = closed - if label is None: - self.label = "right" - else: - self.label = label - # The backward resample sets ``closed`` to ``'right'`` by default - # since the last value should be considered as the edge point for - # the last bin. When origin in "end" or "end_day", the value for a - # specific ``cftime.datetime`` index stands for the resample result - # from the current ``cftime.datetime`` minus ``freq`` to the current - # ``cftime.datetime`` with a right close. - elif self.origin in ["end", "end_day"]: + if isinstance(self.freq, MonthEnd | QuarterEnd | YearEnd) or self.origin in [ + "end", + "end_day", + ]: + # The backward resample sets ``closed`` to ``'right'`` by default + # since the last value should be considered as the edge point for + # the last bin. When origin in "end" or "end_day", the value for a + # specific ``cftime.datetime`` index stands for the resample result + # from the current ``cftime.datetime`` minus ``freq`` to the current + # ``cftime.datetime`` with a right close. if closed is None: self.closed = "right" else: diff --git a/xarray/groupers.py b/xarray/groupers.py index 3a27d725116..883a7210a3c 100644 --- a/xarray/groupers.py +++ b/xarray/groupers.py @@ -701,25 +701,23 @@ def find_independent_seasons(seasons: Sequence[str]) -> Sequence[SeasonsGroup]: grouped = defaultdict(list) codes = defaultdict(list) seen: set[tuple[int, ...]] = set() - idx = 0 # This is quadratic, but the number of seasons is at most 12 for i, current in enumerate(season_inds): # Start with a group if current not in seen: - grouped[idx].append(current) - codes[idx].append(i) + grouped[i].append(current) + codes[i].append(i) seen.add(current) # Loop through remaining groups, and look for overlaps for j, second in enumerate(season_inds[i:]): - if not (set(chain(*grouped[idx])) & set(second)) and second not in seen: - grouped[idx].append(second) - codes[idx].append(j + i) + if not (set(chain(*grouped[i])) & set(second)) and second not in seen: + grouped[i].append(second) + codes[i].append(j + i) seen.add(second) if len(seen) == len(seasons): break - # found all non-overlapping groups for this row, increment and start over - idx += 1 + # found all non-overlapping groups for this row start over grouped_ints = tuple(tuple(idx) for idx in grouped.values() if idx) return [ diff --git a/xarray/plot/facetgrid.py b/xarray/plot/facetgrid.py index e64348c7281..c5303e96274 100644 --- a/xarray/plot/facetgrid.py +++ b/xarray/plot/facetgrid.py @@ -549,7 +549,7 @@ def map_plot1d( ) if add_legend: - use_legend_elements = not func.__name__ == "hist" + use_legend_elements = func.__name__ != "hist" if use_legend_elements: self.add_legend( use_legend_elements=use_legend_elements, diff --git a/xarray/plot/utils.py b/xarray/plot/utils.py index fc4ca1532e5..70d9954dc0e 100644 --- a/xarray/plot/utils.py +++ b/xarray/plot/utils.py @@ -419,9 +419,10 @@ def _infer_xy_labels( _assert_valid_xy(darray, x, "x") _assert_valid_xy(darray, y, "y") - if darray._indexes.get(x, 1) is darray._indexes.get(y, 2): - if 
isinstance(darray._indexes[x], PandasMultiIndex): - raise ValueError("x and y cannot be levels of the same MultiIndex") + if darray._indexes.get(x, 1) is darray._indexes.get(y, 2) and isinstance( + darray._indexes[x], PandasMultiIndex + ): + raise ValueError("x and y cannot be levels of the same MultiIndex") return x, y diff --git a/xarray/structure/merge.py b/xarray/structure/merge.py index 403186272b9..5c998075151 100644 --- a/xarray/structure/merge.py +++ b/xarray/structure/merge.py @@ -300,7 +300,7 @@ def merge_collected( variables = [variable for variable, _ in elements_list] try: merged_vars[name] = unique_variable( - name, variables, compat, equals.get(name, None) + name, variables, compat, equals.get(name) ) except MergeError: if compat != "minimal": diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index acb32504948..24ef2c8397e 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -94,8 +94,8 @@ def test_coder_roundtrip() -> None: assert_identical(original, roundtripped) -@pytest.mark.parametrize("dtype", "u1 u2 i1 i2 f2 f4".split()) -@pytest.mark.parametrize("dtype2", "f4 f8".split()) +@pytest.mark.parametrize("dtype", ["u1", "u2", "i1", "i2", "f2", "f4"]) +@pytest.mark.parametrize("dtype2", ["f4", "f8"]) def test_scaling_converts_to_float(dtype: str, dtype2: str) -> None: dt = np.dtype(dtype2) original = xr.Variable( diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index eefa3c2b4f8..ccb832ee522 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1636,7 +1636,7 @@ def test_normalize_token_with_backend(map_ds): with create_tmp_file(allow_cleanup_failure=ON_WINDOWS) as tmp_file: map_ds.to_netcdf(tmp_file) read = xr.open_dataset(tmp_file) - assert not dask.base.tokenize(map_ds) == dask.base.tokenize(read) + assert dask.base.tokenize(map_ds) != dask.base.tokenize(read) read.close() diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 54cc21b5d2c..731574baac9 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -3235,7 +3235,7 @@ def test_shuffle_simple() -> None: da = xr.DataArray( dims="x", data=dask.array.from_array([1, 2, 3, 4, 5, 6], chunks=2), - coords={"label": ("x", "a b c a b c".split(" "))}, + coords={"label": ("x", ["a", "b", "c", "a", "b", "c"])}, ) actual = da.groupby(label=UniqueGrouper()).shuffle_to_chunks() expected = da.isel(x=[0, 3, 1, 4, 2, 5]) diff --git a/xarray/tests/test_units.py b/xarray/tests/test_units.py index d98d72d9876..2dcc5a67f99 100644 --- a/xarray/tests/test_units.py +++ b/xarray/tests/test_units.py @@ -234,9 +234,7 @@ def convert_units(obj, to): elif isinstance(obj, xr.DataArray): name = obj.name - new_units = ( - to.get(name, None) or to.get("data", None) or to.get(None, None) or None - ) + new_units = to.get(name) or to.get("data") or to.get(None) or None data = convert_units(obj.variable, {None: new_units}) coords = { diff --git a/xarray/util/print_versions.py b/xarray/util/print_versions.py index 8f3be73ee68..0be9713c3dc 100644 --- a/xarray/util/print_versions.py +++ b/xarray/util/print_versions.py @@ -20,7 +20,7 @@ def get_sys_info(): if os.path.isdir(".git") and os.path.isdir("xarray"): try: pipe = subprocess.Popen( - 'git log --format="%H" -n 1'.split(" "), + ("git", "log", '--format="%H"', "-n", "1"), stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) From e2842aa6c56dc829940ca444a6439467d975fd55 Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos 
<3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Sat, 5 Jul 2025 15:24:40 +0200 Subject: [PATCH 04/29] Unnecessary generator (#10506) --- xarray/core/indexes.py | 8 +++++--- xarray/core/treenode.py | 2 +- xarray/groupers.py | 6 +++--- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 847c3a4edb9..9684f371e00 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -768,10 +768,12 @@ def concat( if not indexes: coord_dtype = None - elif len(set(idx.coord_dtype for idx in indexes)) == 1: - coord_dtype = indexes[0].coord_dtype else: - coord_dtype = np.result_type(*[idx.coord_dtype for idx in indexes]) + indexes_coord_dtypes = {idx.coord_dtype for idx in indexes} + if len(indexes_coord_dtypes) == 1: + coord_dtype = next(iter(indexes_coord_dtypes)) + else: + coord_dtype = np.result_type(*indexes_coord_dtypes) return cls(new_pd_index, dim=dim, coord_dtype=coord_dtype) diff --git a/xarray/core/treenode.py b/xarray/core/treenode.py index df58f7aed6f..a38ec6b2d2b 100644 --- a/xarray/core/treenode.py +++ b/xarray/core/treenode.py @@ -752,7 +752,7 @@ def relative_to(self: NamedNode, other: NamedNode) -> str: ) this_path = NodePath(self.path) - if other.path in list(parent.path for parent in (self, *self.parents)): + if any(other.path == parent.path for parent in (self, *self.parents)): return str(this_path.relative_to(other.path)) else: common_ancestor = self.find_common_ancestor(other) diff --git a/xarray/groupers.py b/xarray/groupers.py index 883a7210a3c..4424c65a94b 100644 --- a/xarray/groupers.py +++ b/xarray/groupers.py @@ -617,10 +617,10 @@ def season_to_month_tuple(seasons: Sequence[str]) -> tuple[tuple[int, ...], ...] ((12, 1, 2, 3), (9, 10, 11, 12)) """ initials = "JFMAMJJASOND" - starts = dict( - ("".join(s), i + 1) + starts = { + "".join(s): i + 1 for s, i in zip(sliding_window(2, initials + "J"), range(12), strict=True) - ) + } result: list[tuple[int, ...]] = [] for i, season in enumerate(seasons): if len(season) == 1: From 29b9f5a6e9c11b9fcdca71fc91ddf74f774855f3 Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Sat, 5 Jul 2025 15:25:22 +0200 Subject: [PATCH 05/29] Use is when comparing type of two objects (#10504) --- xarray/plot/utils.py | 2 +- xarray/tests/test_units.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/plot/utils.py b/xarray/plot/utils.py index 70d9954dc0e..a71613562a5 100644 --- a/xarray/plot/utils.py +++ b/xarray/plot/utils.py @@ -1143,7 +1143,7 @@ def _get_color_and_size(value): # Labels are not numerical so modifying label_values is not # possible, instead filter the array with nicely distributed # indexes: - if type(num) == int: # noqa: E721 + if type(num) is int: loc = mpl.ticker.LinearLocator(num) else: raise ValueError("`num` only supports integers for non-numeric labels.") diff --git a/xarray/tests/test_units.py b/xarray/tests/test_units.py index 2dcc5a67f99..7635dde3a69 100644 --- a/xarray/tests/test_units.py +++ b/xarray/tests/test_units.py @@ -5635,7 +5635,7 @@ def test_duck_array_ops(self): assert_units_equal(expected, actual) # Don't use isinstance b/c we don't want to allow subclasses through - assert type(expected.data) == type(actual.data) # noqa: E721 + assert type(expected.data) is type(actual.data) @requires_matplotlib From 4de728171e727d9f2ef41ce4621663d113258893 Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos 
<3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Sat, 5 Jul 2025 23:32:23 +0200 Subject: [PATCH 06/29] More f-strings, less format() (#10505) --- xarray/backends/common.py | 7 +++---- xarray/computation/apply_ufunc.py | 9 +++++++-- xarray/computation/rolling.py | 2 +- xarray/core/formatting.py | 8 ++++---- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index f478c2b882c..8b56c8a2bf9 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -560,11 +560,10 @@ def _infer_dtype(array, name=None): native_dtypes = set(np.vectorize(type, otypes=[object])(array.ravel())) if len(native_dtypes) > 1 and native_dtypes != {bytes, str}: + native_dtype_names = ", ".join(x.__name__ for x in native_dtypes) raise ValueError( - "unable to infer dtype on variable {!r}; object array " - "contains mixed native types: {}".format( - name, ", ".join(x.__name__ for x in native_dtypes) - ) + f"unable to infer dtype on variable {name!r}; object array " + f"contains mixed native types: {native_dtype_names}" ) element = array[(0,) * array.ndim] diff --git a/xarray/computation/apply_ufunc.py b/xarray/computation/apply_ufunc.py index 678c702f3f3..00a06e12d63 100644 --- a/xarray/computation/apply_ufunc.py +++ b/xarray/computation/apply_ufunc.py @@ -141,8 +141,13 @@ def __repr__(self): return f"{type(self).__name__}({list(self.input_core_dims)!r}, {list(self.output_core_dims)!r})" def __str__(self): - lhs = ",".join("({})".format(",".join(dims)) for dims in self.input_core_dims) - rhs = ",".join("({})".format(",".join(dims)) for dims in self.output_core_dims) + comma_separated = ",".join + lhs = comma_separated( + f"({comma_separated(dims)})" for dims in self.input_core_dims + ) + rhs = comma_separated( + f"({comma_separated(dims)})" for dims in self.output_core_dims + ) return f"{lhs}->{rhs}" def to_gufunc_string(self, exclude_dims=frozenset()): diff --git a/xarray/computation/rolling.py b/xarray/computation/rolling.py index e7718560559..adb8a5e6380 100644 --- a/xarray/computation/rolling.py +++ b/xarray/computation/rolling.py @@ -132,7 +132,7 @@ def __repr__(self) -> str: """provide a nice str repr of our rolling object""" attrs = ",".join( - "{k}->{v}{c}".format(k=k, v=w, c="(center)" if c else "") + f"{k}->{w}{'(center)' if c else ''}" for k, w, c in zip(self.dim, self.window, self.center, strict=True) ) return f"{self.__class__.__name__} [{attrs}]" diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 94ff1e65ac0..7eb0841dc27 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -340,17 +340,17 @@ def summarize_variable( first_col = pretty_print(first_col, col_width) if variable.dims: - dims_str = "({}) ".format(", ".join(map(str, variable.dims))) + dims_str = ", ".join(map(str, variable.dims)) + dims_str = f"({dims_str}) " else: dims_str = "" - nbytes_str = f" {render_human_readable_nbytes(variable.nbytes)}" - front_str = f"{first_col}{dims_str}{variable.dtype}{nbytes_str} " + front_str = f"{first_col}{dims_str}{variable.dtype} {render_human_readable_nbytes(variable.nbytes)} " values_width = max_width - len(front_str) values_str = inline_variable_array_repr(variable, values_width) - return front_str + values_str + return f"{front_str}{values_str}" def summarize_attr(key, value, col_width=None): From 90ee30943aedba66a37856b2332a41264e288c20 Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Sun, 6 Jul 2025 
18:24:45 +0200 Subject: [PATCH 07/29] Unnecessary lambda expression (#10502) --- xarray/core/accessor_str.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/xarray/core/accessor_str.py b/xarray/core/accessor_str.py index 0bab92963a5..f16dbe02f32 100644 --- a/xarray/core/accessor_str.py +++ b/xarray/core/accessor_str.py @@ -662,10 +662,11 @@ def format( """ args = tuple(self._stringify(x) for x in args) kwargs = {key: self._stringify(val) for key, val in kwargs.items()} - func = lambda x, *args, **kwargs: self._obj.dtype.type.format( - x, *args, **kwargs + return self._apply( + func=self._obj.dtype.type.format, + func_args=args, + func_kwargs={"kwargs": kwargs}, ) - return self._apply(func=func, func_args=args, func_kwargs={"kwargs": kwargs}) def capitalize(self) -> T_DataArray: """ From edf47aa3e3ed75c0cb4ee5013570784da19a7994 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Mon, 7 Jul 2025 19:10:35 +0200 Subject: [PATCH 08/29] Clean-up indexing adapter classes (#10355) --- .github/workflows/benchmarks.yml | 4 +- asv_bench/asv.conf.json | 2 +- asv_bench/benchmarks/repr.py | 28 +++ ci/requirements/environment-benchmark.yml | 23 +++ doc/whats-new.rst | 7 + xarray/core/formatting.py | 11 +- xarray/core/indexing.py | 217 +++++++--------------- xarray/core/variable.py | 10 +- xarray/tests/test_coordinate_transform.py | 11 ++ xarray/tests/test_formatting.py | 43 +++++ 10 files changed, 205 insertions(+), 151 deletions(-) create mode 100644 ci/requirements/environment-benchmark.yml diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index e8d411ec927..b1c1a0828aa 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest env: ASV_DIR: "./asv_bench" - CONDA_ENV_FILE: ci/requirements/environment.yml + CONDA_ENV_FILE: ci/requirements/environment-benchmark.yml steps: # We need the full repo to avoid this issue @@ -29,7 +29,7 @@ jobs: with: micromamba-version: "1.5.10-0" environment-file: ${{env.CONDA_ENV_FILE}} - environment-name: xarray-tests + environment-name: xarray-benchmark cache-environment: true cache-environment-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}-benchmark" # add "build" because of https://github.com/airspeed-velocity/asv/issues/1385 diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 20c873540de..b377542e402 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -60,7 +60,7 @@ // }, "matrix": { "setuptools_scm": [""], // GH6609 - "numpy": [""], + "numpy": ["2.2"], "pandas": [""], "netcdf4": [""], "scipy": [""], diff --git a/asv_bench/benchmarks/repr.py b/asv_bench/benchmarks/repr.py index aa4b6cb7df1..68a082fcc4f 100644 --- a/asv_bench/benchmarks/repr.py +++ b/asv_bench/benchmarks/repr.py @@ -57,3 +57,31 @@ def time_repr(self): def time_repr_html(self): self.da._repr_html_() + + +class ReprPandasRangeIndex: + # display a memory-saving pandas.RangeIndex shouldn't trigger memory + # expensive conversion into a numpy array + def setup(self): + index = xr.indexes.PandasIndex(pd.RangeIndex(1_000_000), "x") + self.ds = xr.Dataset(coords=xr.Coordinates.from_xindex(index)) + + def time_repr(self): + repr(self.ds.x) + + def time_repr_html(self): + self.ds.x._repr_html_() + + +class ReprXarrayRangeIndex: + # display an Xarray RangeIndex shouldn't trigger memory expensive conversion + # of its lazy coordinate into a numpy array + def setup(self): + index = 
xr.indexes.RangeIndex.arange(1_000_000, dim="x") + self.ds = xr.Dataset(coords=xr.Coordinates.from_xindex(index)) + + def time_repr(self): + repr(self.ds.x) + + def time_repr_html(self): + self.ds.x._repr_html_() diff --git a/ci/requirements/environment-benchmark.yml b/ci/requirements/environment-benchmark.yml new file mode 100644 index 00000000000..0e5c7f4b489 --- /dev/null +++ b/ci/requirements/environment-benchmark.yml @@ -0,0 +1,23 @@ +name: xarray-benchmark +channels: + - conda-forge + - nodefaults +dependencies: + - bottleneck + - cftime + - dask-core + - distributed + - flox + - netcdf4 + - numba + - numbagg + - numexpr + - numpy>=2.2,<2.3 # https://github.com/numba/numba/issues/10105 + - opt_einsum + - packaging + - pandas + - pyarrow # pandas raises a deprecation warning without this, breaking doctests + - sparse + - scipy + - toolz + - zarr diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ad83cfac531..99ddb88e691 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -33,6 +33,13 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ +- Refactored the ``PandasIndexingAdapter`` and + ``CoordinateTransformIndexingAdapter`` internal indexing classes. Coordinate + variables that wrap a :py:class:`pandas.RangeIndex`, a + :py:class:`pandas.MultiIndex` or a + :py:class:`xarray.indexes.CoordinateTransform` are now displayed as lazy variables + in the Xarray data reprs (:pull:`10355`). + By `Benoit Bovy `_. .. _whats-new.2025.07.0: diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 7eb0841dc27..3a06cf18542 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -20,7 +20,11 @@ from xarray.core.datatree_render import RenderDataTree from xarray.core.duck_array_ops import array_all, array_any, array_equiv, astype, ravel from xarray.core.extension_array import PandasExtensionArray -from xarray.core.indexing import MemoryCachedArray +from xarray.core.indexing import ( + BasicIndexer, + ExplicitlyIndexed, + MemoryCachedArray, +) from xarray.core.options import OPTIONS, _get_boolean_with_default from xarray.core.treenode import group_subtrees from xarray.core.utils import is_duck_array @@ -87,6 +91,8 @@ def first_n_items(array, n_desired): if n_desired < array.size: indexer = _get_indexer_at_least_n_items(array.shape, n_desired, from_end=False) + if isinstance(array, ExplicitlyIndexed): + indexer = BasicIndexer(indexer) array = array[indexer] # We pass variable objects in to handle indexing @@ -111,6 +117,8 @@ def last_n_items(array, n_desired): if n_desired < array.size: indexer = _get_indexer_at_least_n_items(array.shape, n_desired, from_end=True) + if isinstance(array, ExplicitlyIndexed): + indexer = BasicIndexer(indexer) array = array[indexer] # We pass variable objects in to handle indexing @@ -659,6 +667,7 @@ def short_array_repr(array): def short_data_repr(array): """Format "data" for DataArray and Variable.""" internal_data = getattr(array, "variable", array)._data + if isinstance(array, np.ndarray): return short_array_repr(array) elif is_duck_array(internal_data): diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 35278efdeaf..8e4458fb88f 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -9,7 +9,6 @@ from contextlib import suppress from dataclasses import dataclass, field from datetime import timedelta -from html import escape from typing import TYPE_CHECKING, Any, cast, overload import numpy as np @@ -20,7 +19,6 @@ from xarray.core import duck_array_ops from xarray.core.coordinate_transform import 
CoordinateTransform from xarray.core.nputils import NumpyVIndexAdapter -from xarray.core.options import OPTIONS from xarray.core.types import T_Xarray from xarray.core.utils import ( NDArrayMixin, @@ -1775,10 +1773,25 @@ def __init__( else: self._dtype = np.dtype(cast(DTypeLike, dtype)) + @property + def _in_memory(self) -> bool: + # prevent costly conversion of a memory-saving pd.RangeIndex into a + # large numpy array. + return not isinstance(self.array, pd.RangeIndex) + @property def dtype(self) -> np.dtype | pd.api.extensions.ExtensionDtype: # type: ignore[override] return self._dtype + def _get_numpy_dtype(self, dtype: np.typing.DTypeLike | None = None) -> np.dtype: + if dtype is None: + if is_valid_numpy_dtype(self.dtype): + return cast(np.dtype, self.dtype) + else: + return get_valid_numpy_dtype(self.array) + else: + return np.dtype(dtype) + def __array__( self, dtype: np.typing.DTypeLike | None = None, @@ -1786,11 +1799,9 @@ def __array__( *, copy: bool | None = None, ) -> np.ndarray: - if dtype is None and is_valid_numpy_dtype(self.dtype): - dtype = cast(np.dtype, self.dtype) - else: - dtype = get_valid_numpy_dtype(self.array) + dtype = self._get_numpy_dtype(dtype) array = self.array + if isinstance(array, pd.PeriodIndex): with suppress(AttributeError): # this might not be public API @@ -1834,98 +1845,61 @@ def _convert_scalar(self, item) -> np.ndarray: # numpy fails to convert pd.Timestamp to np.datetime64[ns] item = np.asarray(item.to_datetime64()) elif self.dtype != object: - dtype = self.dtype - if pd.api.types.is_extension_array_dtype(dtype): - dtype = get_valid_numpy_dtype(self.array) - item = np.asarray(item, dtype=cast(np.dtype, dtype)) + dtype = self._get_numpy_dtype() + item = np.asarray(item, dtype=dtype) # as for numpy.ndarray indexing, we always want the result to be # a NumPy array. 
return to_0d_array(item) - def _prepare_key(self, key: Any | tuple[Any, ...]) -> tuple[Any, ...]: - if isinstance(key, tuple) and len(key) == 1: + def _index_get( + self, indexer: ExplicitIndexer, func_name: str + ) -> PandasIndexingAdapter | np.ndarray: + key = indexer.tuple + + if len(key) == 1: # unpack key so it can index a pandas.Index object (pandas.Index # objects don't like tuples) (key,) = key - return key + # if multidimensional key, convert the index to numpy array and index the latter + if getattr(key, "ndim", 0) > 1: + indexable = NumpyIndexingAdapter(np.asarray(self)) + return getattr(indexable, func_name)(indexer) + + # otherwise index the pandas index then re-wrap or convert the result + result = self.array[key] - def _handle_result( - self, result: Any - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): if isinstance(result, pd.Index): return type(self)(result, dtype=self.dtype) else: return self._convert_scalar(result) - def _oindex_get( - self, indexer: OuterIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): - key = self._prepare_key(indexer.tuple) - - if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional - indexable = NumpyIndexingAdapter(np.asarray(self)) - return indexable.oindex[indexer] - - result = self.array[key] - - return self._handle_result(result) + def _oindex_get(self, indexer: OuterIndexer) -> PandasIndexingAdapter | np.ndarray: + return self._index_get(indexer, "_oindex_get") def _vindex_get( self, indexer: VectorizedIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): + ) -> PandasIndexingAdapter | np.ndarray: _assert_not_chunked_indexer(indexer.tuple) - key = self._prepare_key(indexer.tuple) - - if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional - indexable = NumpyIndexingAdapter(np.asarray(self)) - return indexable.vindex[indexer] - - result = self.array[key] - - return self._handle_result(result) + return self._index_get(indexer, "_vindex_get") def __getitem__( self, indexer: ExplicitIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): - key = self._prepare_key(indexer.tuple) - - if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional - indexable = NumpyIndexingAdapter(np.asarray(self)) - return indexable[indexer] - - result = self.array[key] - - return self._handle_result(result) + ) -> PandasIndexingAdapter | np.ndarray: + return self._index_get(indexer, "__getitem__") def transpose(self, order) -> pd.Index: return self.array # self.array should be always one-dimensional + def _repr_inline_(self, max_width: int) -> str: + # we want to display values in the inline repr for lazy coordinates too + # (pd.RangeIndex and pd.MultiIndex). `format_array_flat` prevents loading + # the whole array in memory. 
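
+        # only the handful of displayed (head/tail) values is materialized here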
+ from xarray.core.formatting import format_array_flat + + return format_array_flat(self, max_width) + def __repr__(self) -> str: return f"{type(self).__name__}(array={self.array!r}, dtype={self.dtype!r})" @@ -1944,7 +1918,9 @@ def copy(self, deep: bool = True) -> Self: def nbytes(self) -> int: if pd.api.types.is_extension_array_dtype(self.dtype): return self.array.nbytes - return cast(np.dtype, self.dtype).itemsize * len(self.array) + + dtype = self._get_numpy_dtype() + return dtype.itemsize * len(self.array) class PandasMultiIndexingAdapter(PandasIndexingAdapter): @@ -1977,8 +1953,8 @@ def __array__( *, copy: bool | None = None, ) -> np.ndarray: - if dtype is None: - dtype = cast(np.dtype, self.dtype) + dtype = self._get_numpy_dtype(dtype) + if self.level is not None: return np.asarray( self.array.get_level_values(self.level).values, dtype=dtype @@ -1986,47 +1962,28 @@ def __array__( else: return super().__array__(dtype, copy=copy) - def _convert_scalar(self, item): + @property + def _in_memory(self) -> bool: + # The pd.MultiIndex's data is fully in memory, but it has a different + # layout than the level and dimension coordinate arrays. Marking this + # adapter class as a "lazy" array will prevent costly conversion when, + # e.g., formatting the Xarray reprs. + return False + + def _convert_scalar(self, item: Any): if isinstance(item, tuple) and self.level is not None: idx = tuple(self.array.names).index(self.level) item = item[idx] return super()._convert_scalar(item) - def _oindex_get( - self, indexer: OuterIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): - result = super()._oindex_get(indexer) - if isinstance(result, type(self)): - result.level = self.level - return result - - def _vindex_get( - self, indexer: VectorizedIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): - result = super()._vindex_get(indexer) + def _index_get( + self, indexer: ExplicitIndexer, func_name: str + ) -> PandasIndexingAdapter | np.ndarray: + result = super()._index_get(indexer, func_name) if isinstance(result, type(self)): result.level = self.level return result - def __getitem__(self, indexer: ExplicitIndexer): - result = super().__getitem__(indexer) - if isinstance(result, type(self)): - result.level = self.level - - return result - def __repr__(self) -> str: if self.level is None: return super().__repr__() @@ -2036,31 +1993,11 @@ def __repr__(self) -> str: ) return f"{type(self).__name__}{props}" - def _get_array_subset(self) -> np.ndarray: - # used to speed-up the repr for big multi-indexes - threshold = max(100, OPTIONS["display_values_threshold"] + 2) - if self.size > threshold: - pos = threshold // 2 - indices = np.concatenate([np.arange(0, pos), np.arange(-pos, 0)]) - subset = self[OuterIndexer((indices,))] - else: - subset = self - - return np.asarray(subset) - def _repr_inline_(self, max_width: int) -> str: - from xarray.core.formatting import format_array_flat - if self.level is None: return "MultiIndex" else: - return format_array_flat(self._get_array_subset(), max_width) - - def _repr_html_(self) -> str: - from xarray.core.formatting import short_array_repr - - array_repr = short_array_repr(self._get_array_subset()) - return f"
<pre>{escape(array_repr)}</pre>
" + return super()._repr_inline_(max_width=max_width) def copy(self, deep: bool = True) -> Self: # see PandasIndexingAdapter.copy @@ -2097,6 +2034,10 @@ def dtype(self) -> np.dtype: def shape(self) -> tuple[int, ...]: return tuple(self._transform.dim_size.values()) + @property + def _in_memory(self) -> bool: + return False + def get_duck_array(self) -> np.ndarray: all_coords = self._transform.generate_coords(dims=self._dims) return np.asarray(all_coords[self._coord_name]) @@ -2157,23 +2098,9 @@ def transpose(self, order: Iterable[int]) -> Self: def __repr__(self: Any) -> str: return f"{type(self).__name__}(transform={self._transform!r})" - def _get_array_subset(self) -> np.ndarray: - threshold = max(100, OPTIONS["display_values_threshold"] + 2) - if self.size > threshold: - pos = threshold // 2 - flat_indices = np.concatenate( - [np.arange(0, pos), np.arange(self.size - pos, self.size)] - ) - subset = self.vindex[ - VectorizedIndexer(np.unravel_index(flat_indices, self.shape)) - ] - else: - subset = self - - return np.asarray(subset) - def _repr_inline_(self, max_width: int) -> str: - """Good to see some labels even for a lazy coordinate.""" + # we want to display values in the inline repr for this lazy coordinate + # `format_array_flat` prevents loading the whole array in memory. from xarray.core.formatting import format_array_flat - return format_array_flat(self._get_array_subset(), max_width) + return format_array_flat(self, max_width) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 00d97e868c4..bcc2ca4e460 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -23,6 +23,7 @@ from xarray.core.extension_array import PandasExtensionArray from xarray.core.indexing import ( BasicIndexer, + CoordinateTransformIndexingAdapter, OuterIndexer, PandasIndexingAdapter, VectorizedIndexer, @@ -403,10 +404,15 @@ def _new( return cls_(dims_, data, attrs_) @property - def _in_memory(self): + def _in_memory(self) -> bool: + if isinstance( + self._data, PandasIndexingAdapter | CoordinateTransformIndexingAdapter + ): + return self._data._in_memory + return isinstance( self._data, - np.ndarray | np.number | PandasIndexingAdapter | PandasExtensionArray, + np.ndarray | np.number | PandasExtensionArray, ) or ( isinstance(self._data, indexing.MemoryCachedArray) and isinstance(self._data.array, indexing.NumpyIndexingAdapter) diff --git a/xarray/tests/test_coordinate_transform.py b/xarray/tests/test_coordinate_transform.py index 386ce426998..627063eb8cb 100644 --- a/xarray/tests/test_coordinate_transform.py +++ b/xarray/tests/test_coordinate_transform.py @@ -123,6 +123,17 @@ def test_coordinate_transform_variable_repr_inline() -> None: ) +def test_coordinate_transform_variable_repr() -> None: + var = create_coords(scale=2.0, shape=(2, 2))["x"].variable + + actual = repr(var) + expected = """ + Size: 32B +[4 values with dtype=float64] + """.strip() + assert actual == expected + + def test_coordinate_transform_variable_basic_outer_indexing() -> None: var = create_coords(scale=2.0, shape=(4, 4))["x"].variable diff --git a/xarray/tests/test_formatting.py b/xarray/tests/test_formatting.py index 88c2c819405..c2ab1144e7b 100644 --- a/xarray/tests/test_formatting.py +++ b/xarray/tests/test_formatting.py @@ -1189,3 +1189,46 @@ def test_array_repr_dtypes(): Dimensions without coordinates: x """.strip() assert actual == expected + + +def test_repr_pandas_range_index() -> None: + # lazy data repr but values shown in inline repr + xidx = xr.indexes.PandasIndex(pd.RangeIndex(10), "x") + ds = 
xr.Dataset(coords=xr.Coordinates.from_xindex(xidx))
+    actual = repr(ds.x)
+    expected = """
+<xarray.DataArray 'x' (x: 10)> Size: 80B
+[10 values with dtype=int64]
+Coordinates:
+  * x        (x) int64 80B 0 1 2 3 4 5 6 7 8 9
+    """.strip()
+    assert actual == expected
+
+
+def test_repr_pandas_multi_index() -> None:
+    # lazy data repr but values shown in inline repr
+    midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=["foo", "bar"])
+    coords = xr.Coordinates.from_pandas_multiindex(midx, "x")
+    ds = xr.Dataset(coords=coords)
+
+    actual = repr(ds.x)
+    expected = """
+<xarray.DataArray 'x' (x: 4)> Size: 32B
+[4 values with dtype=object]
+Coordinates:
+  * x        (x) object 32B MultiIndex
+  * foo      (x) object 32B 'a' 'a' 'b' 'b'
+  * bar      (x) int64 32B 1 2 1 2
+    """.strip()
+    assert actual == expected
+
+    actual = repr(ds.foo)
+    expected = """
+<xarray.DataArray 'foo' (x: 4)> Size: 32B
+[4 values with dtype=object]
+Coordinates:
+  * x        (x) object 32B MultiIndex
+  * foo      (x) object 32B 'a' 'a' 'b' 'b'
+  * bar      (x) int64 32B 1 2 1 2
+    """.strip()
+    assert actual == expected

From 1a734b783972ae5d9186a5ac9407740d4874b205 Mon Sep 17 00:00:00 2001
From: Benoit Bovy
Date: Mon, 7 Jul 2025 19:10:57 +0200
Subject: [PATCH 09/29] Add NDPointIndex (KDTree) (#10478)

Co-authored-by: Deepak Cherian
---
 doc/api.rst                         |   1 +
 doc/whats-new.rst                   |   3 +
 xarray/core/coordinate_transform.py |   2 +-
 xarray/indexes/__init__.py          |   2 +
 xarray/indexes/nd_point_index.py    | 398 ++++++++++++++++++++++++++++
 xarray/tests/test_nd_point_index.py | 183 +++++++++++++
 6 files changed, 588 insertions(+), 1 deletion(-)
 create mode 100644 xarray/indexes/nd_point_index.py
 create mode 100644 xarray/tests/test_nd_point_index.py

diff --git a/doc/api.rst b/doc/api.rst
index df6e87c0cf8..0d722a4bec9 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -1577,6 +1577,7 @@ Custom Indexes
    CFTimeIndex
    indexes.RangeIndex
    indexes.CoordinateTransformIndex
+   indexes.NDPointIndex
 
 Creating custom indexes
 -----------------------
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 99ddb88e691..e8b602e9dc9 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -56,6 +56,9 @@ New Features
 - Expose :py:class:`~xarray.indexes.RangeIndex`, and :py:class:`~xarray.indexes.CoordinateTransformIndex` as public api
   under the ``xarray.indexes`` namespace. By `Deepak Cherian <https://github.com/dcherian>`_.
+- New :py:class:`xarray.indexes.NDPointIndex`, which by default uses :py:class:`scipy.spatial.KDTree` under the hood for
+  the selection of irregular, n-dimensional data (:pull:`10478`).
+  By `Benoit Bovy <https://github.com/benbovy>`_.
 - Support zarr-python's new ``.supports_consolidated_metadata`` store property (:pull:`10457``).
   by `Tom Nicholas <https://github.com/TomNicholas>`_.
 - Better error messages when encoding data to be written to disk fails (:pull:`10464`).
diff --git a/xarray/core/coordinate_transform.py b/xarray/core/coordinate_transform.py
index 94b3b109e1e..d1e434c3d64 100644
--- a/xarray/core/coordinate_transform.py
+++ b/xarray/core/coordinate_transform.py
@@ -80,7 +80,7 @@ def equals(self, other: CoordinateTransform, **kwargs) -> bool:
         Parameters
         ----------
         other : CoordinateTransform
-            The other Index object to compare with this object.
+            The other CoordinateTransform object to compare with this object.
         exclude : frozenset of hashable, optional
             Dimensions excluded from checking. It is None by default, (i.e.,
             when this method is not called in the context of alignment). 
For a diff --git a/xarray/indexes/__init__.py b/xarray/indexes/__init__.py index c53a4b8c2ce..2cba69607f3 100644 --- a/xarray/indexes/__init__.py +++ b/xarray/indexes/__init__.py @@ -10,12 +10,14 @@ PandasIndex, PandasMultiIndex, ) +from xarray.indexes.nd_point_index import NDPointIndex from xarray.indexes.range_index import RangeIndex __all__ = [ "CoordinateTransform", "CoordinateTransformIndex", "Index", + "NDPointIndex", "PandasIndex", "PandasMultiIndex", "RangeIndex", diff --git a/xarray/indexes/nd_point_index.py b/xarray/indexes/nd_point_index.py new file mode 100644 index 00000000000..283b8d7d676 --- /dev/null +++ b/xarray/indexes/nd_point_index.py @@ -0,0 +1,398 @@ +from __future__ import annotations + +import abc +from collections.abc import Hashable, Iterable, Mapping +from typing import TYPE_CHECKING, Any, Generic, TypeVar + +import numpy as np + +from xarray.core.dataarray import DataArray +from xarray.core.indexes import Index +from xarray.core.indexing import IndexSelResult +from xarray.core.utils import is_scalar +from xarray.core.variable import Variable +from xarray.structure.alignment import broadcast + +if TYPE_CHECKING: + from scipy.spatial import KDTree + + from xarray.core.types import Self + + +class TreeAdapter(abc.ABC): + """Lightweight adapter abstract class for plugging in 3rd-party structures + like :py:class:`scipy.spatial.KDTree` or :py:class:`sklearn.neighbors.KDTree` + into :py:class:`~xarray.indexes.NDPointIndex`. + + """ + + @abc.abstractmethod + def __init__(self, points: np.ndarray, *, options: Mapping[str, Any]): + """ + Parameters + ---------- + points : ndarray of shape (n_points, n_coordinates) + Two-dimensional array of points/samples (rows) and their + corresponding coordinate labels (columns) to index. + """ + ... + + @abc.abstractmethod + def query(self, points: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + """Query points. + + Parameters + ---------- + points: ndarray of shape (n_points, n_coordinates) + Two-dimensional array of points/samples (rows) and their + corresponding coordinate labels (columns) to query. + + Returns + ------- + distances : ndarray of shape (n_points) + Distances to the nearest neighbors. + indices : ndarray of shape (n_points) + Indices of the nearest neighbors in the array of the indexed + points. + """ + ... + + def equals(self, other: Self) -> bool: + """Check equality with another TreeAdapter of the same kind. + + Parameters + ---------- + other : + The other TreeAdapter object to compare with this object. + + """ + raise NotImplementedError + + +class ScipyKDTreeAdapter(TreeAdapter): + """:py:class:`scipy.spatial.KDTree` adapter for :py:class:`~xarray.indexes.NDPointIndex`.""" + + _kdtree: KDTree + + def __init__(self, points: np.ndarray, options: Mapping[str, Any]): + from scipy.spatial import KDTree + + self._kdtree = KDTree(points, **options) + + def query(self, points: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + return self._kdtree.query(points) + + def equals(self, other: Self) -> bool: + return np.array_equal(self._kdtree.data, other._kdtree.data) + + +def get_points(coords: Iterable[Variable | Any]) -> np.ndarray: + """Re-arrange data from a sequence of xarray coordinate variables or + labels into a 2-d array of shape (n_points, n_coordinates). 
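
+
+    For example, two (2, 2) coordinate variables yield a (4, 2) array of points.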
+
+    """
+    data = [c.values if isinstance(c, Variable | DataArray) else c for c in coords]
+    return np.stack([np.ravel(d) for d in data]).T
+
+
+T_TreeAdapter = TypeVar("T_TreeAdapter", bound=TreeAdapter)
+
+
+class NDPointIndex(Index, Generic[T_TreeAdapter]):
+    """Xarray index for irregular, n-dimensional data.
+
+    This index may be associated with a set of coordinate variables representing
+    the arbitrary location of data points in an n-dimensional space. All
+    coordinates must have the same shape and dimensions. The number of
+    associated coordinate variables must correspond to the number of dimensions
+    of the space.
+
+    This index supports label-based selection (nearest neighbor lookup). It also
+    has limited support for alignment.
+
+    By default, this index relies on :py:class:`scipy.spatial.KDTree` for fast
+    lookup.
+
+    Do not use :py:meth:`~xarray.indexes.NDPointIndex.__init__` directly. Instead
+    use :py:meth:`xarray.Dataset.set_xindex` or
+    :py:meth:`xarray.DataArray.set_xindex` to create and set the index from
+    existing coordinates (see the example below).
+
+    Examples
+    --------
+    An example using a dataset with 2-dimensional coordinates.
+
+    >>> xx = [[1.0, 2.0], [3.0, 0.0]]
+    >>> yy = [[11.0, 21.0], [29.0, 9.0]]
+    >>> ds = xr.Dataset(coords={"xx": (("y", "x"), xx), "yy": (("y", "x"), yy)})
+    >>> ds
+    <xarray.Dataset> Size: 64B
+    Dimensions:  (y: 2, x: 2)
+    Coordinates:
+        xx       (y, x) float64 32B 1.0 2.0 3.0 0.0
+        yy       (y, x) float64 32B 11.0 21.0 29.0 9.0
+    Dimensions without coordinates: y, x
+    Data variables:
+        *empty*
+
+    Creation of a NDPointIndex from the "xx" and "yy" coordinate variables:
+
+    >>> ds = ds.set_xindex(("xx", "yy"), xr.indexes.NDPointIndex)
+    >>> ds
+    <xarray.Dataset> Size: 64B
+    Dimensions:  (y: 2, x: 2)
+    Coordinates:
+      * xx       (y, x) float64 32B 1.0 2.0 3.0 0.0
+      * yy       (y, x) float64 32B 11.0 21.0 29.0 9.0
+    Dimensions without coordinates: y, x
+    Data variables:
+        *empty*
+    Indexes:
+    ┌ xx   NDPointIndex (ScipyKDTreeAdapter)
+    └ yy
+
+    Point-wise (nearest-neighbor) data selection using Xarray's advanced
+    indexing, i.e., using arbitrary dimension(s) for the Variable objects passed
+    as labels:
+
+    >>> ds.sel(
+    ...     xx=xr.Variable("points", [1.9, 0.1]),
+    ...     yy=xr.Variable("points", [13.0, 8.0]),
+    ...     method="nearest",
+    ... )
+    <xarray.Dataset> Size: 32B
+    Dimensions:  (points: 2)
+    Coordinates:
+        xx       (points) float64 16B 1.0 0.0
+        yy       (points) float64 16B 11.0 9.0
+    Dimensions without coordinates: points
+    Data variables:
+        *empty*
+
+    Data selection with scalar labels:
+
+    >>> ds.sel(xx=1.9, yy=13.0, method="nearest")
+    <xarray.Dataset> Size: 16B
+    Dimensions:  ()
+    Coordinates:
+        xx       float64 8B 1.0
+        yy       float64 8B 11.0
+    Data variables:
+        *empty*
+
+    Data selection with broadcasting the input labels:
+
+    >>> ds.sel(xx=1.9, yy=xr.Variable("points", [13.0, 8.0]), method="nearest")
+    <xarray.Dataset> Size: 32B
+    Dimensions:  (points: 2)
+    Coordinates:
+        xx       (points) float64 16B 1.0 0.0
+        yy       (points) float64 16B 11.0 9.0
+    Dimensions without coordinates: points
+    Data variables:
+        *empty*
+
+    >>> da = xr.DataArray(
+    ...     [[45.1, 53.3], [65.4, 78.2]],
+    ...     coords={"u": [1.9, 0.1], "v": [13.0, 8.0]},
+    ...     dims=("u", "v"),
+    ... 
)
+    >>> ds.sel(xx=da.u, yy=da.v, method="nearest")
+    <xarray.Dataset> Size: 64B
+    Dimensions:  (u: 2, v: 2)
+    Coordinates:
+        xx       (u, v) float64 32B 1.0 0.0 1.0 0.0
+        yy       (u, v) float64 32B 11.0 9.0 11.0 9.0
+    Dimensions without coordinates: u, v
+    Data variables:
+        *empty*
+
+    Data selection with array-like labels (implicit dimensions):
+
+    >>> ds.sel(xx=[[1.9], [0.1]], yy=[[13.0], [8.0]], method="nearest")
+    <xarray.Dataset> Size: 32B
+    Dimensions:  (y: 2, x: 1)
+    Coordinates:
+        xx       (y, x) float64 16B 1.0 0.0
+        yy       (y, x) float64 16B 11.0 9.0
+    Dimensions without coordinates: y, x
+    Data variables:
+        *empty*
+
+    """
+
+    _tree_obj: T_TreeAdapter
+    _coord_names: tuple[Hashable, ...]
+    _dims: tuple[Hashable, ...]
+    _shape: tuple[int, ...]
+
+    def __init__(
+        self,
+        tree_obj: T_TreeAdapter,
+        *,
+        coord_names: tuple[Hashable, ...],
+        dims: tuple[Hashable, ...],
+        shape: tuple[int, ...],
+    ):
+        # this constructor is "private"
+        assert isinstance(tree_obj, TreeAdapter)
+        self._tree_obj = tree_obj
+
+        assert len(coord_names) == len(dims) == len(shape)
+        self._coord_names = coord_names
+        self._dims = dims
+        self._shape = shape
+
+    @classmethod
+    def from_variables(
+        cls,
+        variables: Mapping[Any, Variable],
+        *,
+        options: Mapping[str, Any],
+    ) -> Self:
+        if len(set([var.dims for var in variables.values()])) > 1:
+            var_names = ",".join(vn for vn in variables)
+            raise ValueError(
+                f"variables {var_names} must all have the same dimensions and the same shape"
+            )
+
+        var0 = next(iter(variables.values()))
+
+        if len(variables) != len(var0.dims):
+            raise ValueError(
+                f"the number of variables {len(variables)} doesn't match "
+                f"the number of dimensions {len(var0.dims)}"
+            )
+
+        opts = dict(options)
+
+        tree_adapter_cls: type[T_TreeAdapter] = opts.pop("tree_adapter_cls", None)
+        if tree_adapter_cls is None:
+            tree_adapter_cls = ScipyKDTreeAdapter
+
+        points = get_points(variables.values())
+
+        return cls(
+            tree_adapter_cls(points, options=opts),
+            coord_names=tuple(variables),
+            dims=var0.dims,
+            shape=var0.shape,
+        )
+
+    def create_variables(
+        self, variables: Mapping[Any, Variable] | None = None
+    ) -> dict[Any, Variable]:
+        if variables is not None:
+            for var in variables.values():
+                # maybe re-sync variable dimensions with the index object
+                # returned by NDPointIndex.rename()
+                if var.dims != self._dims:
+                    var.dims = self._dims
+            return dict(**variables)
+        else:
+            return {}
+
+    def equals(
+        self, other: Index, *, exclude: frozenset[Hashable] | None = None
+    ) -> bool:
+        if not isinstance(other, NDPointIndex):
+            return False
+        if type(self._tree_obj) is not type(other._tree_obj):
+            return False
+
+        return self._tree_obj.equals(other._tree_obj)
+
+    def _get_dim_indexers(
+        self,
+        indices: np.ndarray,
+        label_dims: tuple[Hashable, ...],
+        label_shape: tuple[int, ...],
+    ) -> dict[Hashable, Variable]:
+        """Returns dimension indexers based on the query results (indices) and
+        the original label dimensions and shape.
+
+        1. Unravel the flat indices returned from the query
+        2. Reshape the unraveled indices according to indexers shapes
+        3. Wrap the indices in xarray.Variable objects. 
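
+        For example, with a flattened (2, 3) grid, the flat index 4 unravels to (1, 1).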
+ + """ + dim_indexers = {} + + u_indices = list(np.unravel_index(indices.ravel(), self._shape)) + + for dim, ind in zip(self._dims, u_indices, strict=False): + dim_indexers[dim] = Variable(label_dims, ind.reshape(label_shape)) + + return dim_indexers + + def sel( + self, labels: dict[Any, Any], method=None, tolerance=None + ) -> IndexSelResult: + if method != "nearest": + raise ValueError( + "NDPointIndex only supports selection with method='nearest'" + ) + + missing_labels = set(self._coord_names) - set(labels) + if missing_labels: + missing_labels_str = ",".join([f"{name}" for name in missing_labels]) + raise ValueError(f"missing labels for coordinate(s): {missing_labels_str}.") + + # maybe convert labels into xarray DataArray objects + xr_labels: dict[Any, DataArray] = {} + + for name, lbl in labels.items(): + if isinstance(lbl, DataArray): + xr_labels[name] = lbl + elif isinstance(lbl, Variable): + xr_labels[name] = DataArray(lbl) + elif is_scalar(lbl): + xr_labels[name] = DataArray(lbl, dims=()) + elif np.asarray(lbl).ndim == len(self._dims): + xr_labels[name] = DataArray(lbl, dims=self._dims) + else: + raise ValueError( + "invalid label value. NDPointIndex only supports advanced (point-wise) indexing " + "with the following label value kinds:\n" + "- xarray.DataArray or xarray.Variable objects\n" + "- scalar values\n" + "- unlabelled array-like objects with the same number of dimensions " + f"than the {self._coord_names} coordinate variables ({len(self._dims)})" + ) + + # broadcast xarray labels against one another and determine labels shape and dimensions + broadcasted = broadcast(*xr_labels.values()) + label_dims = broadcasted[0].dims + label_shape = broadcasted[0].shape + xr_labels = dict(zip(xr_labels, broadcasted, strict=True)) + + # get and return dimension indexers + points = get_points(xr_labels[name] for name in self._coord_names) + _, indices = self._tree_obj.query(points) + + dim_indexers = self._get_dim_indexers(indices, label_dims, label_shape) + + return IndexSelResult(dim_indexers=dim_indexers) + + def rename( + self, + name_dict: Mapping[Any, Hashable], + dims_dict: Mapping[Any, Hashable], + ) -> Self: + if not set(self._coord_names) & set(name_dict) and not set(self._dims) & set( + dims_dict + ): + return self + + new_coord_names = tuple(name_dict.get(n, n) for n in self._coord_names) + new_dims = tuple(dims_dict.get(d, d) for d in self._dims) + + return type(self)( + self._tree_obj, + coord_names=new_coord_names, + dims=new_dims, + shape=self._shape, + ) + + def _repr_inline_(self, max_width: int) -> str: + tree_obj_type = self._tree_obj.__class__.__name__ + return f"{self.__class__.__name__} ({tree_obj_type})" diff --git a/xarray/tests/test_nd_point_index.py b/xarray/tests/test_nd_point_index.py new file mode 100644 index 00000000000..eb497aa263f --- /dev/null +++ b/xarray/tests/test_nd_point_index.py @@ -0,0 +1,183 @@ +import numpy as np +import pytest + +import xarray as xr +from xarray.indexes import NDPointIndex +from xarray.tests import assert_identical + +pytest.importorskip("scipy") + + +def test_tree_index_init() -> None: + from xarray.indexes.nd_point_index import ScipyKDTreeAdapter + + xx, yy = np.meshgrid([1.0, 2.0], [3.0, 4.0]) + ds = xr.Dataset(coords={"xx": (("y", "x"), xx), "yy": (("y", "x"), yy)}) + + ds_indexed1 = ds.set_xindex(("xx", "yy"), NDPointIndex) + assert "xx" in ds_indexed1.xindexes + assert "yy" in ds_indexed1.xindexes + assert isinstance(ds_indexed1.xindexes["xx"], NDPointIndex) + assert ds_indexed1.xindexes["xx"] is 
ds_indexed1.xindexes["yy"] + + ds_indexed2 = ds.set_xindex( + ("xx", "yy"), NDPointIndex, tree_adapter_cls=ScipyKDTreeAdapter + ) + assert ds_indexed1.xindexes["xx"].equals(ds_indexed2.xindexes["yy"]) + + +def test_tree_index_init_errors() -> None: + xx, yy = np.meshgrid([1.0, 2.0], [3.0, 4.0]) + ds = xr.Dataset(coords={"xx": (("y", "x"), xx), "yy": (("y", "x"), yy)}) + + with pytest.raises(ValueError, match="number of variables"): + ds.set_xindex("xx", NDPointIndex) + + ds2 = ds.assign_coords(yy=(("u", "v"), [[3.0, 3.0], [4.0, 4.0]])) + + with pytest.raises(ValueError, match="same dimensions"): + ds2.set_xindex(("xx", "yy"), NDPointIndex) + + +def test_tree_index_sel() -> None: + xx, yy = np.meshgrid([1.0, 2.0], [3.0, 4.0]) + ds = xr.Dataset(coords={"xx": (("y", "x"), xx), "yy": (("y", "x"), yy)}).set_xindex( + ("xx", "yy"), NDPointIndex + ) + + # 1-dimensional labels + actual = ds.sel( + xx=xr.Variable("u", [1.1, 1.1, 1.1]), + yy=xr.Variable("u", [3.1, 3.1, 3.1]), + method="nearest", + ) + expected = xr.Dataset( + coords={"xx": ("u", [1.0, 1.0, 1.0]), "yy": ("u", [3.0, 3.0, 3.0])} + ) + assert_identical(actual, expected) + + # 2-dimensional labels + actual = ds.sel( + xx=xr.Variable(("u", "v"), [[1.1, 1.1, 1.1], [1.9, 1.9, 1.9]]), + yy=xr.Variable(("u", "v"), [[3.1, 3.1, 3.1], [3.9, 3.9, 3.9]]), + method="nearest", + ) + expected = xr.Dataset( + coords={ + "xx": (("u", "v"), [[1.0, 1.0, 1.0], [2.0, 2.0, 2.0]]), + "yy": (("u", "v"), [[3.0, 3.0, 3.0], [4.0, 4.0, 4.0]]), + }, + ) + assert_identical(actual, expected) + + # all scalar labels + actual = ds.sel(xx=1.1, yy=3.1, method="nearest") + expected = xr.Dataset(coords={"xx": 1.0, "yy": 3.0}) + assert_identical(actual, expected) + + # broadcast scalar to label shape and dimensions + actual = ds.sel(xx=1.1, yy=xr.Variable("u", [3.1, 3.1, 3.1]), method="nearest") + expected = ds.sel( + xx=xr.Variable("u", [1.1, 1.1, 1.1]), + yy=xr.Variable("u", [3.1, 3.1, 3.1]), + method="nearest", + ) + assert_identical(actual, expected) + + # broadcast orthogonal 1-dimensional labels + actual = ds.sel( + xx=xr.Variable("u", [1.1, 1.1]), + yy=xr.Variable("v", [3.1, 3.1]), + method="nearest", + ) + expected = xr.Dataset( + coords={ + "xx": (("u", "v"), [[1.0, 1.0], [1.0, 1.0]]), + "yy": (("u", "v"), [[3.0, 3.0], [3.0, 3.0]]), + }, + ) + assert_identical(actual, expected) + + # implicit dimension array-like labels + actual = ds.sel( + xx=[[1.1, 1.1, 1.1], [1.9, 1.9, 1.9]], + yy=[[3.1, 3.1, 3.1], [3.9, 3.9, 3.9]], + method="nearest", + ) + expected = ds.sel( + xx=xr.Variable(ds.xx.dims, [[1.1, 1.1, 1.1], [1.9, 1.9, 1.9]]), + yy=xr.Variable(ds.yy.dims, [[3.1, 3.1, 3.1], [3.9, 3.9, 3.9]]), + method="nearest", + ) + assert_identical(actual, expected) + + +def test_tree_index_sel_errors() -> None: + xx, yy = np.meshgrid([1.0, 2.0], [3.0, 4.0]) + ds = xr.Dataset(coords={"xx": (("y", "x"), xx), "yy": (("y", "x"), yy)}).set_xindex( + ("xx", "yy"), NDPointIndex + ) + + with pytest.raises(ValueError, match="method='nearest'"): + ds.sel(xx=1.1, yy=3.1) + + with pytest.raises(ValueError, match="missing labels"): + ds.sel(xx=1.1, method="nearest") + + with pytest.raises(ValueError, match="invalid label value"): + # invalid array-like dimensions + ds.sel(xx=[1.1, 1.9], yy=[3.1, 3.9], method="nearest") + + # error while trying to broadcast labels + with pytest.raises(xr.AlignmentError, match=".*conflicting dimension sizes"): + ds.sel( + xx=xr.Variable("u", [1.1, 1.1, 1.1]), + yy=xr.Variable("u", [3.1, 3.1]), + method="nearest", + ) + + +def test_tree_index_equals() -> 
None: + xx1, yy1 = np.meshgrid([1.0, 2.0], [3.0, 4.0]) + ds1 = xr.Dataset( + coords={"xx": (("y", "x"), xx1), "yy": (("y", "x"), yy1)} + ).set_xindex(("xx", "yy"), NDPointIndex) + + xx2, yy2 = np.meshgrid([1.0, 2.0], [3.0, 4.0]) + ds2 = xr.Dataset( + coords={"xx": (("y", "x"), xx2), "yy": (("y", "x"), yy2)} + ).set_xindex(("xx", "yy"), NDPointIndex) + + xx3, yy3 = np.meshgrid([10.0, 20.0], [30.0, 40.0]) + ds3 = xr.Dataset( + coords={"xx": (("y", "x"), xx3), "yy": (("y", "x"), yy3)} + ).set_xindex(("xx", "yy"), NDPointIndex) + + assert ds1.xindexes["xx"].equals(ds2.xindexes["xx"]) + assert not ds1.xindexes["xx"].equals(ds3.xindexes["xx"]) + + +def test_tree_index_rename() -> None: + xx, yy = np.meshgrid([1.0, 2.0], [3.0, 4.0]) + ds = xr.Dataset(coords={"xx": (("y", "x"), xx), "yy": (("y", "x"), yy)}).set_xindex( + ("xx", "yy"), NDPointIndex + ) + + ds_renamed = ds.rename_dims(y="u").rename_vars(yy="uu") + assert "uu" in ds_renamed.xindexes + assert isinstance(ds_renamed.xindexes["uu"], NDPointIndex) + assert ds_renamed.xindexes["xx"] is ds_renamed.xindexes["uu"] + + # test via sel() with implicit dimension array-like labels, which relies on + # NDPointIndex._coord_names and NDPointIndex._dims internal attrs + actual = ds_renamed.sel( + xx=[[1.1, 1.1, 1.1], [1.9, 1.9, 1.9]], + uu=[[3.1, 3.1, 3.1], [3.9, 3.9, 3.9]], + method="nearest", + ) + expected = ds_renamed.sel( + xx=xr.Variable(ds_renamed.xx.dims, [[1.1, 1.1, 1.1], [1.9, 1.9, 1.9]]), + uu=xr.Variable(ds_renamed.uu.dims, [[3.1, 3.1, 3.1], [3.9, 3.9, 3.9]]), + method="nearest", + ) + assert_identical(actual, expected) From 35f34a5c0d6b653ef4fb0c0a32069b0e54f47ce4 Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Mon, 7 Jul 2025 21:56:50 +0200 Subject: [PATCH 10/29] Use a set comprehension (#10509) --- xarray/indexes/nd_point_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/indexes/nd_point_index.py b/xarray/indexes/nd_point_index.py index 283b8d7d676..95af1dd0952 100644 --- a/xarray/indexes/nd_point_index.py +++ b/xarray/indexes/nd_point_index.py @@ -250,7 +250,7 @@ def from_variables( *, options: Mapping[str, Any], ) -> Self: - if len(set([var.dims for var in variables.values()])) > 1: + if len({var.dims for var in variables.values()}) > 1: var_names = ",".join(vn for vn in variables) raise ValueError( f"variables {var_names} must all have the same dimensions and the same shape" From ceaca47f25c990fcbb1e33e8a23198805604714d Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Mon, 7 Jul 2025 22:39:41 +0200 Subject: [PATCH 11/29] Update pre-commit ruff legacy alias (#10511) --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e7d5a8567c7..ee6c9bed568 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,9 +26,9 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.12.1 hooks: - - id: ruff-format - - id: ruff + - id: ruff-check args: ["--fix", "--show-fixes"] + - id: ruff-format - repo: https://github.com/keewis/blackdoc rev: v0.3.9 hooks: From 22bd964dca96226b2ed0b57762976743a9fe9e25 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 7 Jul 2025 15:45:16 -0700 Subject: [PATCH 12/29] Update pre-commit hooks (#10510) MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit * Update pre-commit hooks updates: - [github.com/astral-sh/ruff-pre-commit: v0.12.1 → v0.12.2](https://github.com/astral-sh/ruff-pre-commit/compare/v0.12.1...v0.12.2) - [github.com/keewis/blackdoc: v0.3.9 → v0.4.1](https://github.com/keewis/blackdoc/compare/v0.3.9...v0.4.1) - [github.com/rbubley/mirrors-prettier: v3.5.3 → v3.6.2](https://github.com/rbubley/mirrors-prettier/compare/v3.5.3...v3.6.2) - [github.com/pre-commit/mirrors-mypy: v1.16.0 → v1.16.1](https://github.com/pre-commit/mirrors-mypy/compare/v1.16.0...v1.16.1) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove unbalanced paren * remove indents --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Justus Magin --- .pre-commit-config.yaml | 8 +- HOW_TO_RELEASE.md | 2 - asv_bench/benchmarks/README_CI.md | 2 + design_notes/flexible_indexes_notes.md | 20 +-- design_notes/grouper_objects.md | 8 +- design_notes/named_array_design_doc.md | 187 ++++++++++++------------- doc/contribute/contributing.rst | 2 + 7 files changed, 114 insertions(+), 115 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ee6c9bed568..eef1cc97da2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,24 +24,24 @@ repos: - id: rst-inline-touching-normal - id: text-unicode-replacement-char - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.12.1 + rev: v0.12.2 hooks: - id: ruff-check args: ["--fix", "--show-fixes"] - id: ruff-format - repo: https://github.com/keewis/blackdoc - rev: v0.3.9 + rev: v0.4.1 hooks: - id: blackdoc exclude: "generate_aggregations.py" additional_dependencies: ["black==24.8.0"] - repo: https://github.com/rbubley/mirrors-prettier - rev: v3.5.3 + rev: v3.6.2 hooks: - id: prettier args: [--cache-location=.prettier_cache/cache] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.16.0 + rev: v1.16.1 hooks: - id: mypy # Copied from setup.cfg diff --git a/HOW_TO_RELEASE.md b/HOW_TO_RELEASE.md index 15be8c5d0f4..786ef8f2f18 100644 --- a/HOW_TO_RELEASE.md +++ b/HOW_TO_RELEASE.md @@ -110,7 +110,6 @@ upstream https://github.com/pydata/xarray (push) (Note that repo branch restrictions prevent pushing to `main`, so you have to just-self-merge this.) 13. Update the version available on pyodide: - - Open the PyPI page for [Xarray downloads](https://pypi.org/project/xarray/#files) - Edit [`pyodide/packages/xarray/meta.yaml`](https://github.com/pyodide/pyodide/blob/main/packages/xarray/meta.yaml) to update the - version number @@ -121,7 +120,6 @@ upstream https://github.com/pydata/xarray (push) 14. Issue the release announcement to mailing lists & Twitter (X). For bug fix releases, I usually only email xarray@googlegroups.com. For major/feature releases, I will email a broader list (no more than once every 3-6 months): - - pydata@googlegroups.com - xarray@googlegroups.com - numpy-discussion@scipy.org diff --git a/asv_bench/benchmarks/README_CI.md b/asv_bench/benchmarks/README_CI.md index 9c35e8a93b2..8461b5cd548 100644 --- a/asv_bench/benchmarks/README_CI.md +++ b/asv_bench/benchmarks/README_CI.md @@ -115,8 +115,10 @@ To minimize the time required to run the full suite, we trimmed the parameter ma ```python from . 
import _skip_slow # this function is defined in benchmarks.__init__ + def time_something_slow(): pass + time_something.setup = _skip_slow ``` diff --git a/design_notes/flexible_indexes_notes.md b/design_notes/flexible_indexes_notes.md index 382911c18de..2a3a1cccc40 100644 --- a/design_notes/flexible_indexes_notes.md +++ b/design_notes/flexible_indexes_notes.md @@ -97,12 +97,12 @@ The new `indexes` argument of Dataset/DataArray constructors may be used to spec ```python >>> da = xr.DataArray( ... data=[[275.2, 273.5], [270.8, 278.6]], -... dims=('x', 'y'), +... dims=("x", "y"), ... coords={ -... 'lat': (('x', 'y'), [[45.6, 46.5], [50.2, 51.6]]), -... 'lon': (('x', 'y'), [[5.7, 10.5], [6.2, 12.8]]), +... "lat": (("x", "y"), [[45.6, 46.5], [50.2, 51.6]]), +... "lon": (("x", "y"), [[5.7, 10.5], [6.2, 12.8]]), ... }, -... indexes={('lat', 'lon'): SpatialIndex}, +... indexes={("lat", "lon"): SpatialIndex}, ... ) array([[275.2, 273.5], @@ -120,7 +120,7 @@ More formally, `indexes` would accept `Mapping[CoordinateNames, IndexSpec]` wher Currently index objects like `pandas.MultiIndex` can be passed directly to `coords`, which in this specific case results in the implicit creation of virtual coordinates. With the new `indexes` argument this behavior may become even more confusing than it currently is. For the sake of clarity, it would be appropriate to eventually drop support for this specific behavior and treat any given mapping value given in `coords` as an array that can be wrapped into an Xarray variable, i.e., in the case of a multi-index: ```python ->>> xr.DataArray([1.0, 2.0], dims='x', coords={'x': midx}) +>>> xr.DataArray([1.0, 2.0], dims="x", coords={"x": midx}) array([1., 2.]) Coordinates: @@ -169,8 +169,8 @@ Like for the indexes, explicit coordinate creation should be preferred over impl For example, it is currently possible to pass a `pandas.MultiIndex` object as a coordinate to the Dataset/DataArray constructor: ```python ->>> midx = pd.MultiIndex.from_arrays([['a', 'b'], [0, 1]], names=['lvl1', 'lvl2']) ->>> da = xr.DataArray([1.0, 2.0], dims='x', coords={'x': midx}) +>>> midx = pd.MultiIndex.from_arrays([["a", "b"], [0, 1]], names=["lvl1", "lvl2"]) +>>> da = xr.DataArray([1.0, 2.0], dims="x", coords={"x": midx}) >>> da array([1., 2.]) @@ -201,7 +201,9 @@ Besides `pandas.MultiIndex`, there may be other situations where we would like t The example given here is quite confusing, though: this is not an easily predictable behavior. We could entirely avoid the implicit creation of coordinates, e.g., using a helper function that generates coordinate + index dictionaries that we could then pass directly to the DataArray/Dataset constructor: ```python ->>> coords_dict, index_dict = create_coords_from_index(midx, dims='x', include_dim_coord=True) +>>> coords_dict, index_dict = create_coords_from_index( +... midx, dims="x", include_dim_coord=True +... 
) >>> coords_dict {'x': array([('a', 0), ('b', 1)], dtype=object), @@ -211,7 +213,7 @@ The example given here is quite confusing, though: this is not an easily predict array([0, 1])} >>> index_dict {('lvl1', 'lvl2'): midx} ->>> xr.DataArray([1.0, 2.0], dims='x', coords=coords_dict, indexes=index_dict) +>>> xr.DataArray([1.0, 2.0], dims="x", coords=coords_dict, indexes=index_dict) array([1., 2.]) Coordinates: diff --git a/design_notes/grouper_objects.md b/design_notes/grouper_objects.md index ca6f099377f..f702dc17d0b 100644 --- a/design_notes/grouper_objects.md +++ b/design_notes/grouper_objects.md @@ -8,7 +8,7 @@ I propose the addition of Grouper objects to Xarray's public API so that ```python -Dataset.groupby(x=BinGrouper(bins=np.arange(10, 2)))) +Dataset.groupby(x=BinGrouper(bins=np.arange(10, 2))) ``` is identical to today's syntax: @@ -27,7 +27,7 @@ results = [] for element in unique_labels: subset = ds.sel(x=(ds.x == element)) # split # subset = ds.where(ds.x == element, drop=True) # alternative - result = subset.mean() # apply + result = subset.mean() # apply results.append(result) xr.concat(results) # combine @@ -36,7 +36,7 @@ xr.concat(results) # combine to ```python -ds.groupby('x').mean() # splits, applies, and combines +ds.groupby("x").mean() # splits, applies, and combines ``` Efficient vectorized implementations of this pattern are implemented in numpy's [`ufunc.at`](https://numpy.org/doc/stable/reference/generated/numpy.ufunc.at.html), [`ufunc.reduceat`](https://numpy.org/doc/stable/reference/generated/numpy.ufunc.reduceat.html), [`numbagg.grouped`](https://github.com/numbagg/numbagg/blob/main/numbagg/grouped.py), [`numpy_groupies`](https://github.com/ml31415/numpy-groupies), and probably more. @@ -110,11 +110,13 @@ All Grouper objects will subclass from a Grouper object ```python import abc + class Grouper(abc.ABC): @abc.abstractmethod def factorize(self, by: DataArray): raise NotImplementedError + class CustomGrouper(Grouper): def factorize(self, by: DataArray): ... diff --git a/design_notes/named_array_design_doc.md b/design_notes/named_array_design_doc.md index 455ba72ef87..3c331c76f71 100644 --- a/design_notes/named_array_design_doc.md +++ b/design_notes/named_array_design_doc.md @@ -75,7 +75,6 @@ The named-array package is designed to be interoperable with other scientific Py - Delete the ExplicitIndexer objects (`BasicIndexer`, `VectorizedIndexer`, `OuterIndexer`) - Remove explicit support for `pd.Index`. When provided with a `pd.Index` object, Variable will coerce to an array using `np.array(pd.Index)`. For Xarray's purposes, Xarray can use `as_variable` to explicitly wrap these in PandasIndexingAdapter and pass them to `Variable.__init__`. 3. Define a minimal variable interface that the rest of Xarray can use: - 1. `dims`: tuple of dimension names 2. `data`: numpy/dask/duck arrays` 3. `attrs``: dictionary of attributes @@ -194,134 +193,132 @@ Questions: ```python # Sorting - Variable.argsort - Variable.searchsorted +Variable.argsort +Variable.searchsorted # NaN handling - Variable.fillna - Variable.isnull - Variable.notnull +Variable.fillna +Variable.isnull +Variable.notnull # Lazy data handling - Variable.chunk # Could instead have accessor interface and recommend users use `Variable.dask.chunk` and `Variable.cubed.chunk`? - Variable.to_numpy() - Variable.as_numpy() +Variable.chunk # Could instead have accessor interface and recommend users use `Variable.dask.chunk` and `Variable.cubed.chunk`? 
+Variable.to_numpy() +Variable.as_numpy() # Xarray-specific - Variable.get_axis_num - Variable.isel - Variable.to_dict +Variable.get_axis_num +Variable.isel +Variable.to_dict # Reductions - Variable.reduce - Variable.all - Variable.any - Variable.argmax - Variable.argmin - Variable.count - Variable.max - Variable.mean - Variable.median - Variable.min - Variable.prod - Variable.quantile - Variable.std - Variable.sum - Variable.var +Variable.reduce +Variable.all +Variable.any +Variable.argmax +Variable.argmin +Variable.count +Variable.max +Variable.mean +Variable.median +Variable.min +Variable.prod +Variable.quantile +Variable.std +Variable.sum +Variable.var # Accumulate - Variable.cumprod - Variable.cumsum +Variable.cumprod +Variable.cumsum # numpy-like Methods - Variable.astype - Variable.copy - Variable.clip - Variable.round - Variable.item - Variable.where +Variable.astype +Variable.copy +Variable.clip +Variable.round +Variable.item +Variable.where # Reordering/Reshaping - Variable.squeeze - Variable.pad - Variable.roll - Variable.shift - +Variable.squeeze +Variable.pad +Variable.roll +Variable.shift ``` #### methods to be renamed from xarray.Variable ```python # Xarray-specific - Variable.concat # create two functions, one as the equivalent of `np.stack` and other for `np.concat` +Variable.concat # create two functions, one as the equivalent of `np.stack` and other for `np.concat` - # Given how niche these are, these would be better as functions than methods. - # We could also keep these in Xarray, at least for now. If we don't think people will use functionality outside of Xarray it probably is not worth the trouble of porting it (including documentation, etc). - Variable.coarsen # This should probably be called something like coarsen_reduce. - Variable.coarsen_reshape - Variable.rolling_window +# Given how niche these are, these would be better as functions than methods. +# We could also keep these in Xarray, at least for now. If we don't think people will use functionality outside of Xarray it probably is not worth the trouble of porting it (including documentation, etc). +Variable.coarsen # This should probably be called something like coarsen_reduce. +Variable.coarsen_reshape +Variable.rolling_window - Variable.set_dims # split this into broadcast_to and expand_dims +Variable.set_dims # split this into broadcast_to and expand_dims # Reordering/Reshaping - Variable.stack # To avoid confusion with np.stack, let's call this stack_dims. - Variable.transpose # Could consider calling this permute_dims, like the [array API standard](https://data-apis.org/array-api/2022.12/API_specification/manipulation_functions.html#objects-in-api) - Variable.unstack # Likewise, maybe call this unstack_dims? +Variable.stack # To avoid confusion with np.stack, let's call this stack_dims. +Variable.transpose # Could consider calling this permute_dims, like the [array API standard](https://data-apis.org/array-api/2022.12/API_specification/manipulation_functions.html#objects-in-api) +Variable.unstack # Likewise, maybe call this unstack_dims? ``` #### methods to be removed from xarray.Variable ```python # Testing - Variable.broadcast_equals - Variable.equals - Variable.identical - Variable.no_conflicts +Variable.broadcast_equals +Variable.equals +Variable.identical +Variable.no_conflicts # Lazy data handling - Variable.compute # We can probably omit this method for now, too, given that dask.compute() uses a protocol. 
The other concern is that different array libraries have different notions of "compute" and this one is rather Dask specific, including conversion from Dask to NumPy arrays. For example, in JAX every operation executes eagerly, but in a non-blocking fashion, and you need to call jax.block_until_ready() to ensure computation is finished. - Variable.load # Could remove? compute vs load is a common source of confusion. +Variable.compute # We can probably omit this method for now, too, given that dask.compute() uses a protocol. The other concern is that different array libraries have different notions of "compute" and this one is rather Dask specific, including conversion from Dask to NumPy arrays. For example, in JAX every operation executes eagerly, but in a non-blocking fashion, and you need to call jax.block_until_ready() to ensure computation is finished. +Variable.load # Could remove? compute vs load is a common source of confusion. # Xarray-specific - Variable.to_index - Variable.to_index_variable - Variable.to_variable - Variable.to_base_variable - Variable.to_coord +Variable.to_index +Variable.to_index_variable +Variable.to_variable +Variable.to_base_variable +Variable.to_coord - Variable.rank # Uses bottleneck. Delete? Could use https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.rankdata.html instead +Variable.rank # Uses bottleneck. Delete? Could use https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.rankdata.html instead # numpy-like Methods - Variable.conjugate # .conj is enough - Variable.__array_wrap__ # This is a very old NumPy protocol for duck arrays. We don't need it now that we have `__array_ufunc__` and `__array_function__` +Variable.conjugate # .conj is enough +Variable.__array_wrap__ # This is a very old NumPy protocol for duck arrays. We don't need it now that we have `__array_ufunc__` and `__array_function__` # Encoding - Variable.reset_encoding - +Variable.reset_encoding ``` #### Attributes to be preserved from xarray.Variable ```python # Properties - Variable.attrs - Variable.chunks - Variable.data - Variable.dims - Variable.dtype - - Variable.nbytes - Variable.ndim - Variable.shape - Variable.size - Variable.sizes - - Variable.T - Variable.real - Variable.imag - Variable.conj +Variable.attrs +Variable.chunks +Variable.data +Variable.dims +Variable.dtype + +Variable.nbytes +Variable.ndim +Variable.shape +Variable.size +Variable.sizes + +Variable.T +Variable.real +Variable.imag +Variable.conj ``` #### Attributes to be renamed from xarray.Variable @@ -333,12 +330,10 @@ Questions: #### Attributes to be removed from xarray.Variable ```python - - Variable.values # Probably also remove -- this is a legacy from before Xarray supported dask arrays. ".data" is enough. +Variable.values # Probably also remove -- this is a legacy from before Xarray supported dask arrays. ".data" is enough. 
# Encoding - Variable.encoding - +Variable.encoding ``` ### Appendix: Implementation Details @@ -347,17 +342,16 @@ Questions: ```python class VariableArithmetic( - ImplementsArrayReduce, - IncludeReduceMethods, - IncludeCumMethods, - IncludeNumpySameMethods, - SupportsArithmetic, - VariableOpsMixin, + ImplementsArrayReduce, + IncludeReduceMethods, + IncludeCumMethods, + IncludeNumpySameMethods, + SupportsArithmetic, + VariableOpsMixin, ): - __slots__ = () - # prioritize our operations over those of numpy.ndarray (priority=0) - __array_priority__ = 50 - + __slots__ = () + # prioritize our operations over those of numpy.ndarray (priority=0) + __array_priority__ = 50 ``` - Move over `_typed_ops.VariableOpsMixin` @@ -369,7 +363,6 @@ class VariableArithmetic( - The Variable constructor will need to be rewritten to no longer accept tuples, encodings, etc. These details should be handled at the Xarray data structure level. - What happens to `duck_array_ops?` - What about Variable.chunk and "chunk managers"? - - Could this functionality be left in Xarray proper for now? Alternative array types like JAX also have some notion of "chunks" for parallel arrays, but the details differ in a number of ways from the Dask/Cubed. - Perhaps variable.chunk/load methods should become functions defined in xarray that convert Variable objects. This is easy so long as xarray can reach in and replace .data diff --git a/doc/contribute/contributing.rst b/doc/contribute/contributing.rst index 339050a7f8a..6afd844f84b 100644 --- a/doc/contribute/contributing.rst +++ b/doc/contribute/contributing.rst @@ -72,6 +72,7 @@ If you are reporting a bug, please use the provided template which includes the ```python import xarray as xr + ds = xr.Dataset(...) ... @@ -82,6 +83,7 @@ If you are reporting a bug, please use the provided template which includes the ```python import xarray as xr + xr.show_versions() ... From 37dbae129071b116d18dfee955c497bf297395b2 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Tue, 8 Jul 2025 15:39:12 +0200 Subject: [PATCH 13/29] doc: add Pandas(Multi)Index to api reference (#10515) --- doc/api.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/api.rst b/doc/api.rst index 0d722a4bec9..c578919dcce 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -1704,6 +1704,9 @@ Advanced API Default, pandas-backed indexes built-in Xarray: +.. autosummary:: + :toctree: generated/ + indexes.PandasIndex indexes.PandasMultiIndex From 3679a5df71ccaf3a37a62bcd483870d5d1ae9da2 Mon Sep 17 00:00:00 2001 From: Benoit Bovy Date: Tue, 8 Jul 2025 21:20:19 +0200 Subject: [PATCH 14/29] Allow setting (or skipping) new indexes in open_dataset (#8051) Co-authored-by: Deepak Cherian Co-authored-by: Tom Nicholas Co-authored-by: Justus Magin Co-authored-by: Justus Magin Co-authored-by: Deepak Cherian --- doc/whats-new.rst | 3 +- xarray/backends/api.py | 69 +++++++++++++++++++++++++++---- xarray/backends/store.py | 17 +++++++- xarray/backends/zarr.py | 9 ++++ xarray/tests/test_backends.py | 61 +++++++++++++++++++++++++++ xarray/tests/test_backends_api.py | 36 ++++++++++++++++ 6 files changed, 185 insertions(+), 10 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e8b602e9dc9..1de857032d0 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -12,7 +12,8 @@ v2025.07.1 (unreleased) New Features ~~~~~~~~~~~~ - +- Allow skipping the creation of default indexes when opening datasets (:pull:`8051`). + By `Benoit Bovy `_ and `Justus Magin `_. 
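For illustration, a minimal sketch of the new option described in the entry above. The file name `data.nc` is a hypothetical stand-in for any netCDF file, and the empty-index assertion assumes a backend (such as netCDF) that creates no indexes of its own, mirroring the tests added in this patch:

```python
import xarray as xr

# "data.nc" is a hypothetical placeholder path, for illustration only.
# Skip building pandas indexes for dimension coordinates: coordinate
# values are then not loaded into memory up front.
ds = xr.open_dataset("data.nc", create_default_indexes=False)
assert len(ds.xindexes) == 0  # assuming the backend created no indexes itself

# Default behaviour: each dimension coordinate gets a PandasIndex.
ds_default = xr.open_dataset("data.nc")
```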
Breaking changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/backends/api.py b/xarray/backends/api.py index b80ec927b1e..cfd3ff7fc0f 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -36,6 +36,7 @@ from xarray.backends.locks import _get_scheduler from xarray.coders import CFDatetimeCoder, CFTimedeltaCoder from xarray.core import indexing +from xarray.core.coordinates import Coordinates from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset from xarray.core.datatree import DataTree @@ -379,6 +380,15 @@ def _chunk_ds( return backend_ds._replace(variables) +def _maybe_create_default_indexes(ds): + to_index = { + name: coord.variable + for name, coord in ds.coords.items() + if coord.dims == (name,) and name not in ds.xindexes + } + return ds.assign_coords(Coordinates(to_index)) + + def _dataset_from_backend_dataset( backend_ds, filename_or_obj, @@ -389,6 +399,7 @@ def _dataset_from_backend_dataset( inline_array, chunked_array_type, from_array_kwargs, + create_default_indexes, **extra_tokens, ): if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}: @@ -397,11 +408,15 @@ def _dataset_from_backend_dataset( ) _protect_dataset_variables_inplace(backend_ds, cache) - if chunks is None: - ds = backend_ds + + if create_default_indexes: + ds = _maybe_create_default_indexes(backend_ds) else: + ds = backend_ds + + if chunks is not None: ds = _chunk_ds( - backend_ds, + ds, filename_or_obj, engine, chunks, @@ -434,6 +449,7 @@ def _datatree_from_backend_datatree( inline_array, chunked_array_type, from_array_kwargs, + create_default_indexes, **extra_tokens, ): if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}: @@ -442,9 +458,11 @@ def _datatree_from_backend_datatree( ) _protect_datatree_variables_inplace(backend_tree, cache) - if chunks is None: - tree = backend_tree + if create_default_indexes: + tree = backend_tree.map_over_datasets(_maybe_create_default_indexes) else: + tree = backend_tree + if chunks is not None: tree = DataTree.from_dict( { path: _chunk_ds( @@ -459,11 +477,12 @@ def _datatree_from_backend_datatree( node=path, **extra_tokens, ) - for path, [node] in group_subtrees(backend_tree) + for path, [node] in group_subtrees(tree) }, - name=backend_tree.name, + name=tree.name, ) + if create_default_indexes or chunks is not None: for path, [node] in group_subtrees(backend_tree): tree[path].set_close(node._close) @@ -497,6 +516,7 @@ def open_dataset( concat_characters: bool | Mapping[str, bool] | None = None, decode_coords: Literal["coordinates", "all"] | bool | None = None, drop_variables: str | Iterable[str] | None = None, + create_default_indexes: bool = True, inline_array: bool = False, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, @@ -610,6 +630,13 @@ def open_dataset( A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or inconsistent values. + create_default_indexes : bool, default: True + If True, create pandas indexes for :term:`dimension coordinates `, + which loads the coordinate data into memory. Set it to False if you want to avoid loading + data into memory. + + Note that backends can still choose to create other indexes. If you want to control that, + please refer to the backend's documentation. inline_array: bool, default: False How to include the array in the dask task graph. 
By default(``inline_array=False``) the array is included in a task by @@ -702,6 +729,7 @@ def open_dataset( chunked_array_type, from_array_kwargs, drop_variables=drop_variables, + create_default_indexes=create_default_indexes, **decoders, **kwargs, ) @@ -725,6 +753,7 @@ def open_dataarray( concat_characters: bool | None = None, decode_coords: Literal["coordinates", "all"] | bool | None = None, drop_variables: str | Iterable[str] | None = None, + create_default_indexes: bool = True, inline_array: bool = False, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, @@ -833,6 +862,13 @@ def open_dataarray( A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or inconsistent values. + create_default_indexes : bool, default: True + If True, create pandas indexes for :term:`dimension coordinates `, + which loads the coordinate data into memory. Set it to False if you want to avoid loading + data into memory. + + Note that backends can still choose to create other indexes. If you want to control that, + please refer to the backend's documentation. inline_array: bool, default: False How to include the array in the dask task graph. By default(``inline_array=False``) the array is included in a task by @@ -890,6 +926,7 @@ def open_dataarray( chunks=chunks, cache=cache, drop_variables=drop_variables, + create_default_indexes=create_default_indexes, inline_array=inline_array, chunked_array_type=chunked_array_type, from_array_kwargs=from_array_kwargs, @@ -946,6 +983,7 @@ def open_datatree( concat_characters: bool | Mapping[str, bool] | None = None, decode_coords: Literal["coordinates", "all"] | bool | None = None, drop_variables: str | Iterable[str] | None = None, + create_default_indexes: bool = True, inline_array: bool = False, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, @@ -1055,6 +1093,13 @@ def open_datatree( A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or inconsistent values. + create_default_indexes : bool, default: True + If True, create pandas indexes for :term:`dimension coordinates `, + which loads the coordinate data into memory. Set it to False if you want to avoid loading + data into memory. + + Note that backends can still choose to create other indexes. If you want to control that, + please refer to the backend's documentation. inline_array: bool, default: False How to include the array in the dask task graph. By default(``inline_array=False``) the array is included in a task by @@ -1148,6 +1193,7 @@ def open_datatree( chunked_array_type, from_array_kwargs, drop_variables=drop_variables, + create_default_indexes=create_default_indexes, **decoders, **kwargs, ) @@ -1175,6 +1221,7 @@ def open_groups( concat_characters: bool | Mapping[str, bool] | None = None, decode_coords: Literal["coordinates", "all"] | bool | None = None, drop_variables: str | Iterable[str] | None = None, + create_default_indexes: bool = True, inline_array: bool = False, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, @@ -1286,6 +1333,13 @@ def open_groups( A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or inconsistent values. 
+ create_default_indexes : bool, default: True + If True, create pandas indexes for :term:`dimension coordinates `, + which loads the coordinate data into memory. Set it to False if you want to avoid loading + data into memory. + + Note that backends can still choose to create other indexes. If you want to control that, + please refer to the backend's documentation. inline_array: bool, default: False How to include the array in the dask task graph. By default(``inline_array=False``) the array is included in a task by @@ -1381,6 +1435,7 @@ def open_groups( chunked_array_type, from_array_kwargs, drop_variables=drop_variables, + create_default_indexes=create_default_indexes, **decoders, **kwargs, ) diff --git a/xarray/backends/store.py b/xarray/backends/store.py index b1b3956ca8e..de52aa193ed 100644 --- a/xarray/backends/store.py +++ b/xarray/backends/store.py @@ -9,6 +9,7 @@ AbstractDataStore, BackendEntrypoint, ) +from xarray.core.coordinates import Coordinates from xarray.core.dataset import Dataset if TYPE_CHECKING: @@ -36,6 +37,7 @@ def open_dataset( concat_characters=True, decode_coords=True, drop_variables: str | Iterable[str] | None = None, + set_indexes: bool = True, use_cftime=None, decode_timedelta=None, ) -> Dataset: @@ -56,8 +58,19 @@ def open_dataset( decode_timedelta=decode_timedelta, ) - ds = Dataset(vars, attrs=attrs) - ds = ds.set_coords(coord_names.intersection(vars)) + # split data and coordinate variables (promote dimension coordinates) + data_vars = {} + coord_vars = {} + for name, var in vars.items(): + if name in coord_names or var.dims == (name,): + coord_vars[name] = var + else: + data_vars[name] = var + + # explicit Coordinates object with no index passed + coords = Coordinates(coord_vars, indexes={}) + + ds = Dataset(data_vars, coords=coords, attrs=attrs) ds.set_close(filename_or_obj.close) ds.encoding = encoding diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 48405b906cd..8b26a6b40ec 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1347,6 +1347,7 @@ def open_zarr( use_zarr_fill_value_as_mask=None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, + create_default_indexes=True, **kwargs, ): """Load and decode a dataset from a Zarr store. @@ -1457,6 +1458,13 @@ def open_zarr( chunked arrays, via whichever chunk manager is specified through the ``chunked_array_type`` kwarg. Defaults to ``{'manager': 'dask'}``, meaning additional kwargs will be passed eventually to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + create_default_indexes : bool, default: True + If True, create pandas indexes for :term:`dimension coordinates `, + which loads the coordinate data into memory. Set it to False if you want to avoid loading + data into memory. + + Note that backends can still choose to create other indexes. If you want to control that, + please refer to the backend's documentation. 
Returns ------- @@ -1513,6 +1521,7 @@ def open_zarr( engine="zarr", chunks=chunks, drop_variables=drop_variables, + create_default_indexes=create_default_indexes, chunked_array_type=chunked_array_type, from_array_kwargs=from_array_kwargs, backend_kwargs=backend_kwargs, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 785b06a26fd..a9063c4dcc9 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -55,6 +55,7 @@ from xarray.coding.variables import SerializationWarning from xarray.conventions import encode_dataset_coordinates from xarray.core import indexing +from xarray.core.indexes import PandasIndex from xarray.core.options import set_options from xarray.core.types import PDDatetimeUnitOptions from xarray.core.utils import module_available @@ -2066,6 +2067,26 @@ def test_encoding_enum__error_multiple_variable_with_changing_enum(self): with self.roundtrip(original): pass + @pytest.mark.parametrize("create_default_indexes", [True, False]) + def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None: + store_path = tmp_path / "tmp.nc" + original_ds = xr.Dataset( + {"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]} + ) + original_ds.to_netcdf(store_path, engine=self.engine, mode="w") + + with open_dataset( + store_path, + engine=self.engine, + create_default_indexes=create_default_indexes, + ) as loaded_ds: + if create_default_indexes: + assert list(loaded_ds.xindexes) == ["x"] and isinstance( + loaded_ds.xindexes["x"], PandasIndex + ) + else: + assert len(loaded_ds.xindexes) == 0 + @requires_netCDF4 class TestNetCDF4Data(NetCDF4Base): @@ -4063,6 +4084,26 @@ def test_pickle(self) -> None: def test_pickle_dataarray(self) -> None: pass + @pytest.mark.parametrize("create_default_indexes", [True, False]) + def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None: + store_path = tmp_path / "tmp.nc" + original_ds = xr.Dataset( + {"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]} + ) + original_ds.to_netcdf(store_path, engine=self.engine, mode="w") + + with open_dataset( + store_path, + engine=self.engine, + create_default_indexes=create_default_indexes, + ) as loaded_ds: + if create_default_indexes: + assert list(loaded_ds.xindexes) == ["x"] and isinstance( + loaded_ds.xindexes["x"], PandasIndex + ) + else: + assert len(loaded_ds.xindexes) == 0 + @requires_scipy class TestScipyFilePath(CFEncodedBase, NetCDF3Only): @@ -6434,6 +6475,26 @@ def test_zarr_closing_internal_zip_store(): assert_identical(original_da, loaded_da) +@requires_zarr +@pytest.mark.parametrize("create_default_indexes", [True, False]) +def test_zarr_create_default_indexes(tmp_path, create_default_indexes) -> None: + from xarray.core.indexes import PandasIndex + + store_path = tmp_path / "tmp.zarr" + original_ds = xr.Dataset({"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]}) + original_ds.to_zarr(store_path, mode="w") + + with open_dataset( + store_path, engine="zarr", create_default_indexes=create_default_indexes + ) as loaded_ds: + if create_default_indexes: + assert list(loaded_ds.xindexes) == ["x"] and isinstance( + loaded_ds.xindexes["x"], PandasIndex + ) + else: + assert len(loaded_ds.xindexes) == 0 + + @requires_zarr @pytest.mark.usefixtures("default_zarr_format") def test_raises_key_error_on_invalid_zarr_store(tmp_path): diff --git a/xarray/tests/test_backends_api.py b/xarray/tests/test_backends_api.py index 9342423b727..778e800ec67 100644 --- a/xarray/tests/test_backends_api.py +++ 
b/xarray/tests/test_backends_api.py @@ -201,3 +201,39 @@ def test_join_chunks(self, shape, pref_chunks, req_chunks): chunks=dict(zip(initial[self.var_name].dims, req_chunks, strict=True)), ) self.check_dataset(initial, final, explicit_chunks(req_chunks, shape)) + + @pytest.mark.parametrize("create_default_indexes", [True, False]) + def test_default_indexes(self, create_default_indexes): + """Create default indexes if the backend does not create them.""" + coords = xr.Coordinates({"x": ("x", [0, 1]), "y": list("abc")}, indexes={}) + initial = xr.Dataset({"a": ("x", [1, 2])}, coords=coords) + + with assert_no_warnings(): + final = xr.open_dataset( + initial, + engine=PassThroughBackendEntrypoint, + create_default_indexes=create_default_indexes, + ) + + if create_default_indexes: + assert all(name in final.xindexes for name in ["x", "y"]) + else: + assert len(final.xindexes) == 0 + + @pytest.mark.parametrize("create_default_indexes", [True, False]) + def test_default_indexes_passthrough(self, create_default_indexes): + """Allow creating indexes in the backend.""" + + initial = xr.Dataset( + {"a": (["x", "y"], [[1, 2, 3], [4, 5, 6]])}, + coords={"x": ("x", [0, 1]), "y": ("y", list("abc"))}, + ).stack(z=["x", "y"]) + + with assert_no_warnings(): + final = xr.open_dataset( + initial, + engine=PassThroughBackendEntrypoint, + create_default_indexes=create_default_indexes, + ) + + assert initial.coords.equals(final.coords) From d85185bc8cf2fdca7c7143d4cda360514bf5a0a6 Mon Sep 17 00:00:00 2001 From: Dhruva Kumar Kaushal <120594589+dhruvak001@users.noreply.github.com> Date: Wed, 9 Jul 2025 07:44:27 -0700 Subject: [PATCH 15/29] Raise if `Index.create_variables` returns more variables than passed in through `set_xindex` (#10503) Co-authored-by: DHRUVA KUMAR KAUSHAL Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- doc/whats-new.rst | 6 +++++ xarray/core/dataset.py | 14 ++++++++++ xarray/tests/test_indexes.py | 51 ++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 1de857032d0..f97f12ffc9f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -26,6 +26,12 @@ Deprecations Bug fixes ~~~~~~~~~ +- :py:meth:`Dataset.set_xindex` now raises a helpful error when a custom index + creates extra variables that don't match the provided coordinate names, instead + of silently ignoring them. The error message suggests using the factory method + pattern with :py:meth:`xarray.Coordinates.from_xindex` and + :py:meth:`Dataset.assign_coords` for advanced use cases (:issue:`10499`). + By `Dhruva Kumar Kaushal `_. Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 6de626a159b..ac4bfc32df5 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4940,6 +4940,20 @@ def set_xindex( if isinstance(index, PandasMultiIndex): coord_names = [index.dim] + list(coord_names) + # Check for extra variables that don't match the coordinate names + extra_vars = set(new_coord_vars) - set(coord_names) + if extra_vars: + extra_vars_str = ", ".join(f"'{name}'" for name in extra_vars) + coord_names_str = ", ".join(f"'{name}'" for name in coord_names) + raise ValueError( + f"The index created extra variables {extra_vars_str} that are not " + f"in the list of coordinates {coord_names_str}. 
" + f"Use a factory method pattern instead:\n" + f" index = {index_cls.__name__}.from_variables(ds, {list(coord_names)!r})\n" + f" coords = xr.Coordinates.from_xindex(index)\n" + f" ds = ds.assign_coords(coords)" + ) + variables: dict[Hashable, Variable] indexes: dict[Hashable, Index] diff --git a/xarray/tests/test_indexes.py b/xarray/tests/test_indexes.py index 2b7900d9c89..9f2eea48260 100644 --- a/xarray/tests/test_indexes.py +++ b/xarray/tests/test_indexes.py @@ -729,3 +729,54 @@ def test_restore_dtype_on_multiindexes(dtype: str) -> None: foo = xr.Dataset(coords={"bar": ("bar", np.array([0, 1], dtype=dtype))}) foo = foo.stack(baz=("bar",)) assert str(foo["bar"].values.dtype) == dtype + + +class IndexWithExtraVariables(Index): + @classmethod + def from_variables(cls, variables, *, options=None): + return cls() + + def create_variables(self, variables=None): + if variables is None: + # For Coordinates.from_xindex(), return all variables the index can create + return { + "time": Variable(dims=("time",), data=[1, 2, 3]), + "valid_time": Variable( + dims=("time",), + data=[2, 3, 4], # time + 1 + attrs={"description": "time + 1"}, + ), + } + + result = dict(variables) + if "time" in variables: + result["valid_time"] = Variable( + dims=("time",), + data=variables["time"].data + 1, + attrs={"description": "time + 1"}, + ) + return result + + +def test_set_xindex_with_extra_variables() -> None: + """Test that set_xindex raises an error when custom index creates extra variables.""" + + ds = xr.Dataset(coords={"time": [1, 2, 3]}).reset_index("time") + + # Test that set_xindex raises error for extra variables + with pytest.raises(ValueError, match="extra variables 'valid_time'"): + ds.set_xindex("time", IndexWithExtraVariables) + + +def test_set_xindex_factory_method_pattern() -> None: + ds = xr.Dataset(coords={"time": [1, 2, 3]}).reset_index("time") + + # Test the recommended factory method pattern + coord_vars = {"time": ds._variables["time"]} + index = IndexWithExtraVariables.from_variables(coord_vars) + coords = xr.Coordinates.from_xindex(index) + result = ds.assign_coords(coords) + + assert "time" in result.variables + assert "valid_time" in result.variables + assert_array_equal(result.valid_time.data, result.time.data + 1) From c43a3749b3ae0136d759c550ccc56428660bbc7a Mon Sep 17 00:00:00 2001 From: Dimitri Papadopoulos Orfanos <3234522+DimitriPapadopoulos@users.noreply.github.com> Date: Wed, 9 Jul 2025 16:44:57 +0200 Subject: [PATCH 16/29] Expression not assigned (#10507) --- xarray/coding/variables.py | 10 +++------- xarray/tests/test_dask.py | 3 ++- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 3b7be898ccf..eff08c74500 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -164,10 +164,8 @@ def _check_fill_values(attrs, name, dtype): Issue SerializationWarning if appropriate. 
""" raw_fill_dict = {} - [ + for attr in ("missing_value", "_FillValue"): pop_to(attrs, raw_fill_dict, attr, name=name) - for attr in ("missing_value", "_FillValue") - ] encoded_fill_values = set() for k in list(raw_fill_dict): v = raw_fill_dict[k] @@ -376,11 +374,9 @@ def decode(self, variable: Variable, name: T_Name = None): dims, data, attrs, encoding = unpack_for_decoding(variable) - # Even if _Unsigned is use, retain on-disk _FillValue - [ + # Even if _Unsigned is used, retain on-disk _FillValue + for attr, value in raw_fill_dict.items(): safe_setitem(encoding, attr, value, name=name) - for attr, value in raw_fill_dict.items() - ] if "_Unsigned" in attrs: unsigned = pop_to(attrs, encoding, "_Unsigned") diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index ccb832ee522..9024f2ae677 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1129,7 +1129,8 @@ def test_unify_chunks(map_ds): def test_unify_chunks_shallow_copy(obj, transform): obj = transform(obj) unified = obj.unify_chunks() - assert_identical(obj, unified) and obj is not obj.unify_chunks() + assert_identical(obj, unified) + # assert obj is not unified @pytest.mark.parametrize("obj", [make_da()]) From 271ebe957990eb5c9cfb65c001ac440e0323af36 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 9 Jul 2025 10:07:55 -0700 Subject: [PATCH 17/29] Update Custom Indexes section in api.rst (#10517) --- doc/api-hidden.rst | 16 ------ doc/api.rst | 80 ++++++++++++++++++++--------- xarray/core/coordinate_transform.py | 5 +- xarray/core/dataarray.py | 2 +- xarray/core/dataset.py | 2 +- xarray/core/indexes.py | 5 +- 6 files changed, 65 insertions(+), 45 deletions(-) diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst index 9a6037cf3c4..5b9fa70d6b7 100644 --- a/doc/api-hidden.rst +++ b/doc/api-hidden.rst @@ -515,22 +515,6 @@ CFTimeIndex.values CFTimeIndex.year - Index.from_variables - Index.concat - Index.stack - Index.unstack - Index.create_variables - Index.should_add_coord_to_array - Index.to_pandas_index - Index.isel - Index.sel - Index.join - Index.reindex_like - Index.equals - Index.roll - Index.rename - Index.copy - indexes.RangeIndex.start indexes.RangeIndex.stop indexes.RangeIndex.step diff --git a/doc/api.rst b/doc/api.rst index c578919dcce..f4a6dc6677d 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -978,6 +978,40 @@ and DataTree objects, respectively. core.coordinates.DataArrayCoordinates core.coordinates.DataTreeCoordinates +Indexes +======= + +Default, pandas-backed indexes built-in to Xarray: + +.. autosummary:: + :toctree: generated/ + + indexes.PandasIndex + indexes.PandasMultiIndex + + +More complex indexes built-in to Xarray: + +.. autosummary:: + :toctree: generated/ + + CFTimeIndex + indexes.RangeIndex + indexes.NDPointIndex + + +Creating indexes +---------------- +.. autosummary:: + :toctree: generated/ + + cftime_range + date_range + date_range_like + indexes.RangeIndex.arange + indexes.RangeIndex.linspace + + Universal functions =================== @@ -1571,31 +1605,39 @@ Custom Indexes ============== .. currentmodule:: xarray +Building custom indexes +----------------------- + +These classes are building blocks for more complex Indexes: + .. autosummary:: :toctree: generated/ - CFTimeIndex - indexes.RangeIndex + indexes.CoordinateTransform indexes.CoordinateTransformIndex indexes.NDPointIndex -Creating custom indexes ------------------------ -.. 
autosummary:: - :toctree: generated/ - - cftime_range - date_range - date_range_like - indexes.RangeIndex.arange - indexes.RangeIndex.linspace +The Index base class for building custom indexes: -Building custom indexes ------------------------ .. autosummary:: :toctree: generated/ - indexes.CoordinateTransform + Index.from_variables + Index.concat + Index.stack + Index.unstack + Index.create_variables + Index.should_add_coord_to_array + Index.to_pandas_index + Index.isel + Index.sel + Index.join + Index.reindex_like + Index.equals + Index.roll + Index.rename + Index.copy + Tutorial ======== @@ -1702,14 +1744,6 @@ Advanced API .. Missing: .. ``DataTree.set_close`` -Default, pandas-backed indexes built-in Xarray: - -.. autosummary:: - :toctree: generated/ - - indexes.PandasIndex - indexes.PandasMultiIndex - These backends provide a low-level interface for lazily loading data from external file-formats or protocols, and can be manually invoked to create arguments for the ``load_store`` and ``dump_to_store`` Dataset methods: diff --git a/xarray/core/coordinate_transform.py b/xarray/core/coordinate_transform.py index d1e434c3d64..02cbbc11caa 100644 --- a/xarray/core/coordinate_transform.py +++ b/xarray/core/coordinate_transform.py @@ -9,8 +9,9 @@ class CoordinateTransform: """Abstract coordinate transform with dimension & coordinate names. - EXPERIMENTAL (not ready for public use yet). - + .. caution:: + This API is experimental and subject to change. Please report any bugs or surprising + behaviour you encounter. """ coord_names: tuple[Hashable, ...] diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 0bfb0b7ab1c..73b0eb19a64 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -6915,7 +6915,7 @@ def groupby( :ref:`groupby` Users guide explanation of how to group and bin data. - :doc:`xarray-tutorial:intermediate/01-high-level-computation-patterns` + :doc:`xarray-tutorial:intermediate/computation/01-high-level-computation-patterns` Tutorial on :py:func:`~xarray.DataArray.Groupby` for windowed computation :doc:`xarray-tutorial:fundamentals/03.2_groupby_with_xarray` diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index ac4bfc32df5..0f2dd266129 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -9955,7 +9955,7 @@ def groupby( :ref:`groupby` Users guide explanation of how to group and bin data. - :doc:`xarray-tutorial:intermediate/01-high-level-computation-patterns` + :doc:`xarray-tutorial:intermediate/computation/01-high-level-computation-patterns` Tutorial on :py:func:`~xarray.Dataset.Groupby` for windowed computation. :doc:`xarray-tutorial:fundamentals/03.2_groupby_with_xarray` diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 9684f371e00..c233c6911e4 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -1455,14 +1455,15 @@ def rename(self, name_dict, dims_dict): class CoordinateTransformIndex(Index): """Helper class for creating Xarray indexes based on coordinate transforms. - EXPERIMENTAL (not ready for public use yet). - - wraps a :py:class:`CoordinateTransform` instance - takes care of creating the index (lazy) coordinates - supports point-wise label-based selection - supports exact alignment only, by comparing indexes based on their transform (not on their explicit coordinate labels) + .. caution:: + This API is experimental and subject to change. Please report any bugs or surprising + behaviour you encounter. 
""" transform: CoordinateTransform From 5f3907f5a4f08e2355dd3c87bc3da03d6f529870 Mon Sep 17 00:00:00 2001 From: Ian Hunt-Isaak Date: Wed, 9 Jul 2025 20:32:39 -0400 Subject: [PATCH 18/29] Updates for Zarr 3 Dtypes (#10456) Co-authored-by: Deepak Cherian --- xarray/tests/__init__.py | 14 ++++++++++++++ xarray/tests/test_backends.py | 12 ++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index fe76df75fa0..4de9e422761 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -130,6 +130,20 @@ def _importorskip( has_rasterio, requires_rasterio = _importorskip("rasterio") has_zarr, requires_zarr = _importorskip("zarr") has_zarr_v3, requires_zarr_v3 = _importorskip("zarr", "3.0.0") +has_zarr_v3_dtypes, requires_zarr_v3_dtypes = _importorskip("zarr", "3.1.0") +if has_zarr_v3: + import zarr + + # manual update by checking attrs for now + # TODO: use version specifier + # installing from git main is giving me a lower version than the + # most recently released zarr + has_zarr_v3_dtypes = hasattr(zarr.core, "dtype") + + requires_zarr_v3_dtypes = pytest.mark.skipif( + not has_zarr_v3_dtypes, reason="requires zarr>3.1.0" + ) + has_fsspec, requires_fsspec = _importorskip("fsspec") has_iris, requires_iris = _importorskip("iris") has_numbagg, requires_numbagg = _importorskip("numbagg") diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a9063c4dcc9..32ebe52c6ea 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -73,6 +73,7 @@ has_scipy, has_zarr, has_zarr_v3, + has_zarr_v3_dtypes, mock, network, requires_cftime, @@ -2437,7 +2438,7 @@ def test_read_non_consolidated_warning(self) -> None: def test_non_existent_store(self) -> None: with pytest.raises( FileNotFoundError, - match="(No such file or directory|Unable to find group|No group found)", + match="(No such file or directory|Unable to find group|No group found in store)", ): xr.open_zarr(f"{uuid.uuid4()}") @@ -2519,6 +2520,7 @@ def test_manual_chunk(self) -> None: assert_identical(actual.load(), auto.load()) @requires_dask + @pytest.mark.filterwarnings("ignore:.*does not have a Zarr V3 specification.*") def test_warning_on_bad_chunks(self) -> None: original = create_test_data().chunk({"dim1": 4, "dim2": 3, "dim3": 3}) @@ -2927,7 +2929,9 @@ def test_append_with_existing_encoding_raises(self) -> None: @pytest.mark.parametrize("dtype", ["U", "S"]) def test_append_string_length_mismatch_raises(self, dtype) -> None: - skip_if_zarr_format_3("This actually works fine with Zarr format 3") + if has_zarr_v3 and not has_zarr_v3_dtypes: + skip_if_zarr_format_3("This actually works fine with Zarr format 3") + ds, ds_to_append = create_append_string_length_mismatch_test_data(dtype) with self.create_zarr_target() as store_target: ds.to_zarr(store_target, mode="w", **self.version_kwargs) @@ -2940,8 +2944,12 @@ def test_append_string_length_mismatch_raises(self, dtype) -> None: def test_append_string_length_mismatch_works(self, dtype) -> None: skip_if_zarr_format_2("This doesn't work with Zarr format 2") # ...but it probably would if we used object dtype + if has_zarr_v3_dtypes: + pytest.skip("This works on pre ZDtype Zarr-Python, but fails after.") + ds, ds_to_append = create_append_string_length_mismatch_test_data(dtype) expected = xr.concat([ds, ds_to_append], dim="time") + with self.create_zarr_target() as store_target: ds.to_zarr(store_target, mode="w", **self.version_kwargs) ds_to_append.to_zarr(store_target, 
append_dim="time", **self.version_kwargs) From 1424972aec3d345a77daea182f6ca0239cb3497d Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 9 Jul 2025 18:23:08 -0700 Subject: [PATCH 19/29] Add release notes for v2025.07.1 (#10520) --- doc/api.rst | 1 + doc/whats-new.rst | 28 ++++++++++++++-------------- xarray/indexes/__init__.py | 3 ++- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index f4a6dc6677d..b46d807e8d4 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -1616,6 +1616,7 @@ These classes are building blocks for more complex Indexes: indexes.CoordinateTransform indexes.CoordinateTransformIndex indexes.NDPointIndex + indexes.TreeAdapter The Index base class for building custom indexes: diff --git a/doc/whats-new.rst b/doc/whats-new.rst index f97f12ffc9f..b00215ad74e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -7,22 +7,25 @@ What's New .. _whats-new.2025.07.1: -v2025.07.1 (unreleased) ------------------------ +v2025.07.1 (July 09, 2025) +-------------------------- + +This release brings a lot of improvements to flexible indexes functionality, including new classes +to ease building of new indexes with custom coordinate transforms (:py:class:`indexes.CoordinateTransformIndex`) +and tree-like index structures (:py:class:`indexes.NDPointIndex`). +See a `new gallery `_ showing off the possibilities enabled by flexible indexes. + +Thanks to the 7 contributors to this release: +Benoit Bovy, Deepak Cherian, Dhruva Kumar Kaushal, Dimitri Papadopoulos Orfanos, Illviljan, Justus Magin and Tom Nicholas New Features ~~~~~~~~~~~~ +- New :py:class:`xarray.indexes.NDPointIndex`, which by default uses :py:class:`scipy.spatial.KDTree` under the hood for + the selection of irregular, n-dimensional data (:pull:`10478`). + By `Benoit Bovy `_. - Allow skipping the creation of default indexes when opening datasets (:pull:`8051`). By `Benoit Bovy `_ and `Justus Magin `_. -Breaking changes -~~~~~~~~~~~~~~~~ - - -Deprecations -~~~~~~~~~~~~ - - Bug fixes ~~~~~~~~~ @@ -35,7 +38,7 @@ Bug fixes Documentation ~~~~~~~~~~~~~ - +- A `new gallery `_ showing off the possibilities enabled by flexible indexes. Internal Changes ~~~~~~~~~~~~~~~~ @@ -63,9 +66,6 @@ New Features - Expose :py:class:`~xarray.indexes.RangeIndex`, and :py:class:`~xarray.indexes.CoordinateTransformIndex` as public api under the ``xarray.indexes`` namespace. By `Deepak Cherian `_. -- New :py:class:`xarray.indexes.NDPointIndex`, which by default uses :py:class:`scipy.spatial.KDTree` under the hood for - the selection of irregular, n-dimensional data (:pull:`10478`). - By `Benoit Bovy `_. - Support zarr-python's new ``.supports_consolidated_metadata`` store property (:pull:`10457``). by `Tom Nicholas `_. - Better error messages when encoding data to be written to disk fails (:pull:`10464`). 
diff --git a/xarray/indexes/__init__.py b/xarray/indexes/__init__.py
index 2cba69607f3..20aa5a75929 100644
--- a/xarray/indexes/__init__.py
+++ b/xarray/indexes/__init__.py
@@ -10,7 +10,7 @@
     PandasIndex,
     PandasMultiIndex,
 )
-from xarray.indexes.nd_point_index import NDPointIndex
+from xarray.indexes.nd_point_index import NDPointIndex, TreeAdapter
 from xarray.indexes.range_index import RangeIndex

 __all__ = [
@@ -21,4 +21,5 @@
     "PandasIndex",
     "PandasMultiIndex",
     "RangeIndex",
+    "TreeAdapter",
 ]

From 89c09b670b91beddca33eaa5cc17a423ef0b298a Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Thu, 10 Jul 2025 10:40:02 -0700
Subject: [PATCH 20/29] Add dev whats-new (#10522)

---
 doc/whats-new.rst | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index b00215ad74e..327603ba415 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -5,6 +5,35 @@
 What's New
 ==========

+.. _whats-new.2025.07.2:
+
+v2025.07.2 (unreleased)
+-----------------------
+
+New Features
+~~~~~~~~~~~~
+
+
+Breaking changes
+~~~~~~~~~~~~~~~~
+
+
+Deprecations
+~~~~~~~~~~~~
+
+
+Bug fixes
+~~~~~~~~~
+
+
+Documentation
+~~~~~~~~~~~~~
+
+
+Internal Changes
+~~~~~~~~~~~~~~~~
+
+
 .. _whats-new.2025.07.1:

 v2025.07.1 (July 09, 2025)
 --------------------------

From 30743945538ca2d276fc28eb221afa7bcb03978a Mon Sep 17 00:00:00 2001
From: Stephan Hoyer
Date: Thu, 10 Jul 2025 14:51:13 -0700
Subject: [PATCH 21/29] Improve warning message and tests for timedelta
 decoding (#10508)

* Improve warning message and tests for timedelta decoding

The new warning message gives users clearer and simpler guidance on
how to silence the warning.

The new tests verify that `dtype` attributes on disk like
`timedelta64[s]` will be decoded properly.

* Update xarray/tests/test_coding_times.py

Co-authored-by: Spencer Clark

---------

Co-authored-by: Spencer Clark
---
 xarray/coding/times.py            | 24 ++++++++++++------------
 xarray/tests/test_coding_times.py | 30 ++++++++++++++++++++++--------
 2 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/xarray/coding/times.py b/xarray/coding/times.py
index d6567ba4c61..49a2747510a 100644
--- a/xarray/coding/times.py
+++ b/xarray/coding/times.py
@@ -1517,20 +1517,20 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
                 time_unit = self.time_unit
         else:
             if self._emit_decode_timedelta_future_warning:
+                var_string = f"the variable {name!r}" if name else ""
                 emit_user_level_warning(
                     "In a future version, xarray will not decode "
-                    "timedelta values based on the presence of a "
-                    "timedelta-like units attribute by default. Instead "
-                    "it will rely on the presence of a timedelta64 dtype "
-                    "attribute, which is now xarray's default way of "
-                    "encoding timedelta64 values. To continue decoding "
-                    "timedeltas based on the presence of a timedelta-like "
-                    "units attribute, users will need to explicitly "
-                    "opt-in by passing True or "
-                    "CFTimedeltaCoder(decode_via_units=True) to "
-                    "decode_timedelta. To silence this warning, set "
-                    "decode_timedelta to True, False, or a "
-                    "'CFTimedeltaCoder' instance.",
+                    f"{var_string} into a timedelta64 dtype based on the "
+                    "presence of a timedelta-like 'units' attribute by "
+                    "default. Instead it will rely on the presence of a "
+                    "timedelta64 'dtype' attribute, which is now xarray's "
+                    "default way of encoding timedelta64 values.\n"
+                    "To continue decoding into a timedelta64 dtype, either "
+                    "set `decode_timedelta=True` when opening this "
+                    "dataset, or add the attribute "
+                    "`dtype='timedelta64[ns]'` to this variable on disk.\n"
+                    "To opt-in to future behavior, set "
+                    "`decode_timedelta=False`.",
                     FutureWarning,
                 )
             if self.time_unit is None:
diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py
index af29716fec0..322ff96b332 100644
--- a/xarray/tests/test_coding_times.py
+++ b/xarray/tests/test_coding_times.py
@@ -1867,7 +1867,10 @@ def test_decode_timedelta_via_units(
     var = Variable(["time"], timedeltas, encoding=attrs)
     encoded = Variable(["time"], np.array([0, 1, 2]), attrs=attrs)
     if warns:
-        with pytest.warns(FutureWarning, match="decode_timedelta"):
+        with pytest.warns(
+            FutureWarning,
+            match="xarray will not decode the variable 'foo' into a timedelta64 dtype",
+        ):
             decoded = conventions.decode_cf_variable(
                 "foo",
                 encoded,
@@ -1886,45 +1889,56 @@
 _DECODE_TIMEDELTA_VIA_DTYPE_TESTS = {
-    "default": (True, None, np.dtype("timedelta64[ns]")),
-    "decode_timedelta=False": (True, False, np.dtype("int64")),
-    "decode_timedelta=True": (True, True, np.dtype("timedelta64[ns]")),
+    "default": (True, None, "ns", np.dtype("timedelta64[ns]")),
+    "decode_timedelta=False": (True, False, "ns", np.dtype("int64")),
+    "decode_timedelta=True": (True, True, "ns", np.dtype("timedelta64[ns]")),
+    "use-original-units": (True, True, "s", np.dtype("timedelta64[s]")),
     "inherit-time_unit-from-decode_times": (
         CFDatetimeCoder(time_unit="s"),
         None,
+        "ns",
         np.dtype("timedelta64[s]"),
     ),
     "set-time_unit-via-CFTimedeltaCoder-decode_times=True": (
         True,
         CFTimedeltaCoder(time_unit="s"),
+        "ns",
         np.dtype("timedelta64[s]"),
     ),
     "set-time_unit-via-CFTimedeltaCoder-decode_times=False": (
         False,
         CFTimedeltaCoder(time_unit="s"),
+        "ns",
         np.dtype("timedelta64[s]"),
     ),
     "override-time_unit-from-decode_times": (
         CFDatetimeCoder(time_unit="ns"),
         CFTimedeltaCoder(time_unit="s"),
+        "ns",
         np.dtype("timedelta64[s]"),
     ),
+    "decode-different-units": (
+        True,
+        CFTimedeltaCoder(time_unit="us"),
+        "s",
+        np.dtype("timedelta64[us]"),
+    ),
 }


 @pytest.mark.parametrize(
-    ("decode_times", "decode_timedelta", "expected_dtype"),
+    ("decode_times", "decode_timedelta", "original_unit", "expected_dtype"),
     list(_DECODE_TIMEDELTA_VIA_DTYPE_TESTS.values()),
     ids=list(_DECODE_TIMEDELTA_VIA_DTYPE_TESTS.keys()),
 )
 def test_decode_timedelta_via_dtype(
-    decode_times, decode_timedelta, expected_dtype
+    decode_times, decode_timedelta, original_unit, expected_dtype
 ) -> None:
-    timedeltas = pd.timedelta_range(0, freq="D", periods=3)
+    timedeltas = pd.timedelta_range(0, freq="D", periods=3, unit=original_unit)  # type: ignore[call-arg]
     encoding = {"units": "days"}
     var = Variable(["time"], timedeltas, encoding=encoding)
     encoded = conventions.encode_cf_variable(var)
-    assert encoded.attrs["dtype"] == "timedelta64[ns]"
+    assert encoded.attrs["dtype"] == f"timedelta64[{original_unit}]"
     assert encoded.attrs["units"] == encoding["units"]
     decoded = conventions.decode_cf_variable(
         "foo", encoded, decode_times=decode_times, decode_timedelta=decode_timedelta

From d8c37a9a905c08a13067c1ce13c140cb0aaeb7d9 Mon Sep 17 00:00:00 2001
From: Nick Hodgskin <36369090+VeckoTheGecko@users.noreply.github.com>
Date: Fri, 11 Jul 2025 16:53:09 +0200
Subject: [PATCH 22/29] Remove test skip (#10523)

---
 xarray/tests/test_backends.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 32ebe52c6ea..6997be200b1 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -3647,10 +3647,6 @@ def test_append(self) -> None:
         )

     @requires_dask
-    @pytest.mark.skipif(
-        sys.version_info < (3, 11),
-        reason="zarr too old",
-    )
     def test_region_write(self) -> None:
         ds = Dataset({"foo": ("x", [1, 2, 3])}, coords={"x": [1, 2, 3]}).chunk()
         with self.create_zarr_target() as store:

From d79fc9d6b6ecd11fa4f45c3eacda2d7804b39b8c Mon Sep 17 00:00:00 2001
From: Miguel Jimenez
Date: Fri, 11 Jul 2025 11:45:31 -0700
Subject: [PATCH 23/29] Pydap flaky tests (#10525)

---
 doc/whats-new.rst                      | 3 +++
 xarray/tests/test_backends_datatree.py | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 327603ba415..605175e32d2 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -25,6 +25,9 @@ Deprecations
 Bug fixes
 ~~~~~~~~~

+- Fix flaky Pydap Datatree backend tests. Tests now compare elements as (unordered) sets instead of (ordered) lists (:pull:`10525`).
+  By `Miguel Jimenez-Urias `_.
+
 Documentation
 ~~~~~~~~~~~~~

diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py
index 518758a0efb..9e61fe6b6cb 100644
--- a/xarray/tests/test_backends_datatree.py
+++ b/xarray/tests/test_backends_datatree.py
@@ -495,7 +495,7 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None:
         |    Salinity   (time, Z, Y, X) float32 ...
         """
         tree = open_datatree(url, engine=self.engine)
-        assert list(tree.dims) == ["time", "Z", "nv"]
+        assert set(tree.dims) == set(["time", "Z", "nv"])
         assert tree["/SimpleGroup"].coords["time"].dims == ("time",)
         assert tree["/SimpleGroup"].coords["Z"].dims == ("Z",)
         assert tree["/SimpleGroup"].coords["Y"].dims == ("Y",)

From 02e3ca2d0f6b6447d21c4fe20169d6c4dbd1fd36 Mon Sep 17 00:00:00 2001
From: Mathias Hauser
Date: Sun, 13 Jul 2025 20:55:41 +0200
Subject: [PATCH 24/29] drop_vars: use emit_user_level_warning (#10528)

---
 xarray/core/dataset.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 0f2dd266129..26db282c3df 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -5807,11 +5807,10 @@ def drop_vars(
                     other_names.update(idx_other_names)
         if other_names:
             names_set |= set(other_names)
-            warnings.warn(
+            emit_user_level_warning(
                 f"Deleting a single level of a MultiIndex is deprecated. Previously, this deleted all levels of a MultiIndex. 
" f"Please also drop the following variables: {other_names!r} to avoid an error in the future.", DeprecationWarning, - stacklevel=2, ) assert_no_index_corrupted(self.xindexes, names_set) From 1f161de82686123bf53c47092ea10299e1a7a73b Mon Sep 17 00:00:00 2001 From: Nick Hodgskin <36369090+VeckoTheGecko@users.noreply.github.com> Date: Mon, 14 Jul 2025 16:50:23 +0200 Subject: [PATCH 25/29] meta: Fix labeler topic-documentation (#10524) --- .github/labeler.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/labeler.yml b/.github/labeler.yml index 19107595753..ad750815f8f 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -45,11 +45,10 @@ topic-DataTree: - xarray/core/datatree* topic-documentation: - - changed-files: - - any-glob-to-any-file: - - doc/* - - "!doc/whats-new.rst" - - doc/**/* + - all: + - changed-files: + - any-glob-to-any-file: "doc/**/*" + - all-globs-to-all-files: "!doc/whats-new.rst" topic-groupby: - changed-files: From 699d8957ec174f118108005aeb6ba99c1920167a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Brigitta=20Sip=C5=91cz?= Date: Mon, 14 Jul 2025 12:29:24 -0700 Subject: [PATCH 26/29] CI: update actions location (#10529) --- .github/workflows/hypothesis.yaml | 2 +- .github/workflows/upstream-dev-ci.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/hypothesis.yaml b/.github/workflows/hypothesis.yaml index bf3a1be550d..2b39f129d1a 100644 --- a/.github/workflows/hypothesis.yaml +++ b/.github/workflows/hypothesis.yaml @@ -110,7 +110,7 @@ jobs: && steps.status.outcome == 'failure' && github.event_name == 'schedule' && github.repository_owner == 'pydata' - uses: xarray-contrib/issue-from-pytest-log@v1 + uses: scientific-python/issue-from-pytest-log-action@v1 with: log-path: output-${{ matrix.python-version }}-log.jsonl issue-title: "Nightly Hypothesis tests failed" diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index 5e74c85e319..484f7414bba 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -92,7 +92,7 @@ jobs: && steps.status.outcome == 'failure' && github.event_name == 'schedule' && github.repository_owner == 'pydata' - uses: xarray-contrib/issue-from-pytest-log@v1 + uses: scientific-python/issue-from-pytest-log-action@v1 with: log-path: output-${{ matrix.python-version }}-log.jsonl From 57b3c175452efc44a7a555431b444abfd0db2b91 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Thu, 17 Jul 2025 06:34:57 -0400 Subject: [PATCH 27/29] Call super().__init__() in st.SearchStrategy subclasses (#10543) In the next version of `hypothesis` subclasses of `hypothesis.strategies.SearchStrategy` will be required to call `super().__init__()` in their `__init__` method (https://github.com/HypothesisWorks/hypothesis/pull/4473). This PR addresses this in the two subclasses in our codebase: `CFTimeStrategy` and `CFTimeStrategyISO8601`. Apparently this kind of subclassing is not actually part of the public API ([link](https://github.com/HypothesisWorks/hypothesis/pull/4473/files#diff-9abc0311b216f25f0b71cfff6b7043b22071d09a58cb949f6bc5022ddeaa8e7f)), so maybe we should adjust the approach here long term, but this at least gets the tests passing for now. 
- [x] Closes #10541

---
 xarray/testing/strategies.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/xarray/testing/strategies.py b/xarray/testing/strategies.py
index 84f37e5568a..13973b9f550 100644
--- a/xarray/testing/strategies.py
+++ b/xarray/testing/strategies.py
@@ -479,6 +479,7 @@ def unique_subset_of(

 class CFTimeStrategy(st.SearchStrategy):
     def __init__(self, min_value, max_value):
+        super().__init__()
         self.min_value = min_value
         self.max_value = max_value

@@ -495,6 +496,7 @@ class CFTimeStrategyISO8601(st.SearchStrategy):
     def __init__(self):
         from xarray.tests.test_coding_times import _all_cftime_date_types

+        super().__init__()
         self.date_types = _all_cftime_date_types()
         self.calendars = list(self.date_types)

From 57358aed84196f9b67b4def2d503fe91785bee11 Mon Sep 17 00:00:00 2001
From: Deepak Cherian
Date: Thu, 17 Jul 2025 07:40:15 -0600
Subject: [PATCH 28/29] Fix kerchunk error in docs build (#10545)

---
 doc/combined.json | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/doc/combined.json b/doc/combined.json
index f37a0aa72b8..f1245b57291 100644
--- a/doc/combined.json
+++ b/doc/combined.json
@@ -10,9 +10,6 @@
     "x/0": ["saved_on_disk.h5", 8352, 32],
     "y/.zarray": "{\"chunks\":[5],\"compressor\":null,\"dtype\":\"
Date: Thu, 17 Jul 2025 16:02:34 +0200
Subject: [PATCH 29/29] Fix convert calendar with different dimension name
 (#10544)

* Fix convert calendar with different dimension name

* Update whats-new.rst

---------

Co-authored-by: Spencer Clark

---
 doc/whats-new.rst                 |  2 ++
 xarray/coding/calendar_ops.py     |  2 +-
 xarray/tests/test_calendar_ops.py | 12 ++++++++++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 605175e32d2..add40bb6b81 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -27,6 +27,8 @@ Bug fixes
 - Fix Pydap Datatree backend testing. Testing now compares elements of (unordered) two sets (before, lists) (:pull:`10525`).
   By `Miguel Jimenez-Urias `_.
+- Fix ``KeyError`` when passing a ``dim`` argument different from the default to ``convert_calendar`` (:pull:`10544`).
+  By `Eric Jansen `_.


 Documentation

diff --git a/xarray/coding/calendar_ops.py b/xarray/coding/calendar_ops.py
index 5fdd106e179..a6f0254a42d 100644
--- a/xarray/coding/calendar_ops.py
+++ b/xarray/coding/calendar_ops.py
@@ -213,7 +213,7 @@ def convert_calendar(
         out[dim] = new_times

         # Remove NaN that where put on invalid dates in target calendar
-        out = out.sel(time=out[dim].notnull())
+        out = out.sel({dim: out[dim].notnull()})

diff --git a/xarray/tests/test_calendar_ops.py b/xarray/tests/test_calendar_ops.py
index 8dc1c2a503b..4ec45e4113b 100644
--- a/xarray/tests/test_calendar_ops.py
+++ b/xarray/tests/test_calendar_ops.py
@@ -239,6 +239,18 @@ def test_convert_calendar_errors():
         convert_calendar(da, "standard", dim="x")


+def test_convert_calendar_dimension_name():
+    src = DataArray(
+        date_range("2004-01-01", "2004-01-31", freq="D", calendar="noleap"),
+        dims=("date",),
+        name="date",
+    )
+
+    out = convert_calendar(src, "proleptic_gregorian", dim="date")
+
+    np.testing.assert_array_equal(src, out)
+
+
 def test_convert_calendar_same_calendar():
     src = DataArray(
         date_range("2000-01-01", periods=12, freq="6h", use_cftime=False),
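As a usage illustration of the fix in this last patch, here is a hedged
sketch with made-up data (`convert_calendar` is also exposed as a
``DataArray`` method):

```python
import numpy as np
import xarray as xr

# Hypothetical data: a noleap-calendar time coordinate whose dimension
# is named "date" rather than the default "time".
dates = xr.date_range(
    "2004-01-01", "2004-01-31", freq="D", calendar="noleap", use_cftime=True
)
da = xr.DataArray(np.arange(dates.size), dims=("date",), coords={"date": dates})

# Before the fix, a non-default dim name raised KeyError because the
# NaN-dropping step hard-coded time=...; it now honors the `dim` argument.
out = da.convert_calendar("proleptic_gregorian", dim="date")
```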