diff --git a/.github/labeler.yml b/.github/labeler.yml index 19107595753..ad750815f8f 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -45,11 +45,10 @@ topic-DataTree: - xarray/core/datatree* topic-documentation: - - changed-files: - - any-glob-to-any-file: - - doc/* - - "!doc/whats-new.rst" - - doc/**/* + - all: + - changed-files: + - any-glob-to-any-file: "doc/**/*" + - all-globs-to-all-files: "!doc/whats-new.rst" topic-groupby: - changed-files: diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index e8d411ec927..b1c1a0828aa 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest env: ASV_DIR: "./asv_bench" - CONDA_ENV_FILE: ci/requirements/environment.yml + CONDA_ENV_FILE: ci/requirements/environment-benchmark.yml steps: # We need the full repo to avoid this issue @@ -29,7 +29,7 @@ jobs: with: micromamba-version: "1.5.10-0" environment-file: ${{env.CONDA_ENV_FILE}} - environment-name: xarray-tests + environment-name: xarray-benchmark cache-environment: true cache-environment-key: "${{runner.os}}-${{runner.arch}}-py${{env.PYTHON_VERSION}}-${{env.TODAY}}-${{hashFiles(env.CONDA_ENV_FILE)}}-benchmark" # add "build" because of https://github.com/airspeed-velocity/asv/issues/1385 diff --git a/.github/workflows/hypothesis.yaml b/.github/workflows/hypothesis.yaml index bf3a1be550d..2b39f129d1a 100644 --- a/.github/workflows/hypothesis.yaml +++ b/.github/workflows/hypothesis.yaml @@ -110,7 +110,7 @@ jobs: && steps.status.outcome == 'failure' && github.event_name == 'schedule' && github.repository_owner == 'pydata' - uses: xarray-contrib/issue-from-pytest-log@v1 + uses: scientific-python/issue-from-pytest-log-action@v1 with: log-path: output-${{ matrix.python-version }}-log.jsonl issue-title: "Nightly Hypothesis tests failed" diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index 5e74c85e319..484f7414bba 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -92,7 +92,7 @@ jobs: && steps.status.outcome == 'failure' && github.event_name == 'schedule' && github.repository_owner == 'pydata' - uses: xarray-contrib/issue-from-pytest-log@v1 + uses: scientific-python/issue-from-pytest-log-action@v1 with: log-path: output-${{ matrix.python-version }}-log.jsonl diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e7d5a8567c7..eef1cc97da2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,24 +24,24 @@ repos: - id: rst-inline-touching-normal - id: text-unicode-replacement-char - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.12.1 + rev: v0.12.2 hooks: - - id: ruff-format - - id: ruff + - id: ruff-check args: ["--fix", "--show-fixes"] + - id: ruff-format - repo: https://github.com/keewis/blackdoc - rev: v0.3.9 + rev: v0.4.1 hooks: - id: blackdoc exclude: "generate_aggregations.py" additional_dependencies: ["black==24.8.0"] - repo: https://github.com/rbubley/mirrors-prettier - rev: v3.5.3 + rev: v3.6.2 hooks: - id: prettier args: [--cache-location=.prettier_cache/cache] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.16.0 + rev: v1.16.1 hooks: - id: mypy # Copied from setup.cfg diff --git a/HOW_TO_RELEASE.md b/HOW_TO_RELEASE.md index e775d63871d..786ef8f2f18 100644 --- a/HOW_TO_RELEASE.md +++ b/HOW_TO_RELEASE.md @@ -52,6 +52,7 @@ upstream https://github.com/pydata/xarray (push) 6. 
After merging, again ensure your main branch is synced to upstream: ```sh + git switch main git pull upstream main ``` 7. If you have any doubts, run the full test suite one final time! @@ -98,17 +99,17 @@ upstream https://github.com/pydata/xarray (push) ``` -12. Commit your changes and push to main again: +12. Make a PR with these changes and merge it: ```sh - git commit -am 'New whatsnew section' - git push upstream main + git checkout -b empty-whatsnew-YYYY.MM.X+1 + git commit -am "empty whatsnew" + git push ``` - You're done pushing to main! + (Note that repo branch restrictions prevent pushing to `main`, so you have to just-self-merge this.) 13. Update the version available on pyodide: - - Open the PyPI page for [Xarray downloads](https://pypi.org/project/xarray/#files) - Edit [`pyodide/packages/xarray/meta.yaml`](https://github.com/pyodide/pyodide/blob/main/packages/xarray/meta.yaml) to update the - version number @@ -119,7 +120,6 @@ upstream https://github.com/pydata/xarray (push) 14. Issue the release announcement to mailing lists & Twitter (X). For bug fix releases, I usually only email xarray@googlegroups.com. For major/feature releases, I will email a broader list (no more than once every 3-6 months): - - pydata@googlegroups.com - xarray@googlegroups.com - numpy-discussion@scipy.org diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 20c873540de..b377542e402 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -60,7 +60,7 @@ // }, "matrix": { "setuptools_scm": [""], // GH6609 - "numpy": [""], + "numpy": ["2.2"], "pandas": [""], "netcdf4": [""], "scipy": [""], diff --git a/asv_bench/benchmarks/README_CI.md b/asv_bench/benchmarks/README_CI.md index 9c35e8a93b2..8461b5cd548 100644 --- a/asv_bench/benchmarks/README_CI.md +++ b/asv_bench/benchmarks/README_CI.md @@ -115,8 +115,10 @@ To minimize the time required to run the full suite, we trimmed the parameter ma ```python from . 
import _skip_slow # this function is defined in benchmarks.__init__ + def time_something_slow(): pass + time_something.setup = _skip_slow ``` diff --git a/asv_bench/benchmarks/repr.py b/asv_bench/benchmarks/repr.py index aa4b6cb7df1..68a082fcc4f 100644 --- a/asv_bench/benchmarks/repr.py +++ b/asv_bench/benchmarks/repr.py @@ -57,3 +57,31 @@ def time_repr(self): def time_repr_html(self): self.da._repr_html_() + + +class ReprPandasRangeIndex: + # display a memory-saving pandas.RangeIndex shouldn't trigger memory + # expensive conversion into a numpy array + def setup(self): + index = xr.indexes.PandasIndex(pd.RangeIndex(1_000_000), "x") + self.ds = xr.Dataset(coords=xr.Coordinates.from_xindex(index)) + + def time_repr(self): + repr(self.ds.x) + + def time_repr_html(self): + self.ds.x._repr_html_() + + +class ReprXarrayRangeIndex: + # display an Xarray RangeIndex shouldn't trigger memory expensive conversion + # of its lazy coordinate into a numpy array + def setup(self): + index = xr.indexes.RangeIndex.arange(1_000_000, dim="x") + self.ds = xr.Dataset(coords=xr.Coordinates.from_xindex(index)) + + def time_repr(self): + repr(self.ds.x) + + def time_repr_html(self): + self.ds.x._repr_html_() diff --git a/ci/requirements/environment-benchmark.yml b/ci/requirements/environment-benchmark.yml new file mode 100644 index 00000000000..0e5c7f4b489 --- /dev/null +++ b/ci/requirements/environment-benchmark.yml @@ -0,0 +1,23 @@ +name: xarray-benchmark +channels: + - conda-forge + - nodefaults +dependencies: + - bottleneck + - cftime + - dask-core + - distributed + - flox + - netcdf4 + - numba + - numbagg + - numexpr + - numpy>=2.2,<2.3 # https://github.com/numba/numba/issues/10105 + - opt_einsum + - packaging + - pandas + - pyarrow # pandas raises a deprecation warning without this, breaking doctests + - sparse + - scipy + - toolz + - zarr diff --git a/design_notes/flexible_indexes_notes.md b/design_notes/flexible_indexes_notes.md index 382911c18de..2a3a1cccc40 100644 --- a/design_notes/flexible_indexes_notes.md +++ b/design_notes/flexible_indexes_notes.md @@ -97,12 +97,12 @@ The new `indexes` argument of Dataset/DataArray constructors may be used to spec ```python >>> da = xr.DataArray( ... data=[[275.2, 273.5], [270.8, 278.6]], -... dims=('x', 'y'), +... dims=("x", "y"), ... coords={ -... 'lat': (('x', 'y'), [[45.6, 46.5], [50.2, 51.6]]), -... 'lon': (('x', 'y'), [[5.7, 10.5], [6.2, 12.8]]), +... "lat": (("x", "y"), [[45.6, 46.5], [50.2, 51.6]]), +... "lon": (("x", "y"), [[5.7, 10.5], [6.2, 12.8]]), ... }, -... indexes={('lat', 'lon'): SpatialIndex}, +... indexes={("lat", "lon"): SpatialIndex}, ... ) array([[275.2, 273.5], @@ -120,7 +120,7 @@ More formally, `indexes` would accept `Mapping[CoordinateNames, IndexSpec]` wher Currently index objects like `pandas.MultiIndex` can be passed directly to `coords`, which in this specific case results in the implicit creation of virtual coordinates. With the new `indexes` argument this behavior may become even more confusing than it currently is. 
For the sake of clarity, it would be appropriate to eventually drop support for this specific behavior and treat any given mapping value given in `coords` as an array that can be wrapped into an Xarray variable, i.e., in the case of a multi-index: ```python ->>> xr.DataArray([1.0, 2.0], dims='x', coords={'x': midx}) +>>> xr.DataArray([1.0, 2.0], dims="x", coords={"x": midx}) array([1., 2.]) Coordinates: @@ -169,8 +169,8 @@ Like for the indexes, explicit coordinate creation should be preferred over impl For example, it is currently possible to pass a `pandas.MultiIndex` object as a coordinate to the Dataset/DataArray constructor: ```python ->>> midx = pd.MultiIndex.from_arrays([['a', 'b'], [0, 1]], names=['lvl1', 'lvl2']) ->>> da = xr.DataArray([1.0, 2.0], dims='x', coords={'x': midx}) +>>> midx = pd.MultiIndex.from_arrays([["a", "b"], [0, 1]], names=["lvl1", "lvl2"]) +>>> da = xr.DataArray([1.0, 2.0], dims="x", coords={"x": midx}) >>> da array([1., 2.]) @@ -201,7 +201,9 @@ Besides `pandas.MultiIndex`, there may be other situations where we would like t The example given here is quite confusing, though: this is not an easily predictable behavior. We could entirely avoid the implicit creation of coordinates, e.g., using a helper function that generates coordinate + index dictionaries that we could then pass directly to the DataArray/Dataset constructor: ```python ->>> coords_dict, index_dict = create_coords_from_index(midx, dims='x', include_dim_coord=True) +>>> coords_dict, index_dict = create_coords_from_index( +... midx, dims="x", include_dim_coord=True +... ) >>> coords_dict {'x': array([('a', 0), ('b', 1)], dtype=object), @@ -211,7 +213,7 @@ The example given here is quite confusing, though: this is not an easily predict array([0, 1])} >>> index_dict {('lvl1', 'lvl2'): midx} ->>> xr.DataArray([1.0, 2.0], dims='x', coords=coords_dict, indexes=index_dict) +>>> xr.DataArray([1.0, 2.0], dims="x", coords=coords_dict, indexes=index_dict) array([1., 2.]) Coordinates: diff --git a/design_notes/grouper_objects.md b/design_notes/grouper_objects.md index ca6f099377f..f702dc17d0b 100644 --- a/design_notes/grouper_objects.md +++ b/design_notes/grouper_objects.md @@ -8,7 +8,7 @@ I propose the addition of Grouper objects to Xarray's public API so that ```python -Dataset.groupby(x=BinGrouper(bins=np.arange(10, 2)))) +Dataset.groupby(x=BinGrouper(bins=np.arange(10, 2))) ``` is identical to today's syntax: @@ -27,7 +27,7 @@ results = [] for element in unique_labels: subset = ds.sel(x=(ds.x == element)) # split # subset = ds.where(ds.x == element, drop=True) # alternative - result = subset.mean() # apply + result = subset.mean() # apply results.append(result) xr.concat(results) # combine @@ -36,7 +36,7 @@ xr.concat(results) # combine to ```python -ds.groupby('x').mean() # splits, applies, and combines +ds.groupby("x").mean() # splits, applies, and combines ``` Efficient vectorized implementations of this pattern are implemented in numpy's [`ufunc.at`](https://numpy.org/doc/stable/reference/generated/numpy.ufunc.at.html), [`ufunc.reduceat`](https://numpy.org/doc/stable/reference/generated/numpy.ufunc.reduceat.html), [`numbagg.grouped`](https://github.com/numbagg/numbagg/blob/main/numbagg/grouped.py), [`numpy_groupies`](https://github.com/ml31415/numpy-groupies), and probably more. 
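For illustration only (this sketch is not part of the proposal), the vectorized pattern those libraries implement can be reduced to: factorize the labels into integer codes once, then combine every group in a single `np.add.at` call instead of looping in Python:

```python
# Minimal sketch of vectorized split-apply-combine, assuming plain numpy data.
import numpy as np

values = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
labels = np.array(["a", "b", "a", "b", "a"])

# "factorize": map each label to an integer code (a -> 0, b -> 1)
uniques, codes = np.unique(labels, return_inverse=True)

# grouped sums in one shot via ufunc.at -- no per-group Python loop
sums = np.zeros(len(uniques))
np.add.at(sums, codes, values)

counts = np.bincount(codes, minlength=len(uniques))
means = sums / counts  # group "a": (1+3+5)/3 = 3.0, group "b": (2+4)/2 = 3.0
```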
@@ -110,11 +110,13 @@ All Grouper objects will subclass from a Grouper object ```python import abc + class Grouper(abc.ABC): @abc.abstractmethod def factorize(self, by: DataArray): raise NotImplementedError + class CustomGrouper(Grouper): def factorize(self, by: DataArray): ... diff --git a/design_notes/named_array_design_doc.md b/design_notes/named_array_design_doc.md index 455ba72ef87..3c331c76f71 100644 --- a/design_notes/named_array_design_doc.md +++ b/design_notes/named_array_design_doc.md @@ -75,7 +75,6 @@ The named-array package is designed to be interoperable with other scientific Py - Delete the ExplicitIndexer objects (`BasicIndexer`, `VectorizedIndexer`, `OuterIndexer`) - Remove explicit support for `pd.Index`. When provided with a `pd.Index` object, Variable will coerce to an array using `np.array(pd.Index)`. For Xarray's purposes, Xarray can use `as_variable` to explicitly wrap these in PandasIndexingAdapter and pass them to `Variable.__init__`. 3. Define a minimal variable interface that the rest of Xarray can use: - 1. `dims`: tuple of dimension names 2. `data`: numpy/dask/duck arrays` 3. `attrs``: dictionary of attributes @@ -194,134 +193,132 @@ Questions: ```python # Sorting - Variable.argsort - Variable.searchsorted +Variable.argsort +Variable.searchsorted # NaN handling - Variable.fillna - Variable.isnull - Variable.notnull +Variable.fillna +Variable.isnull +Variable.notnull # Lazy data handling - Variable.chunk # Could instead have accessor interface and recommend users use `Variable.dask.chunk` and `Variable.cubed.chunk`? - Variable.to_numpy() - Variable.as_numpy() +Variable.chunk # Could instead have accessor interface and recommend users use `Variable.dask.chunk` and `Variable.cubed.chunk`? +Variable.to_numpy() +Variable.as_numpy() # Xarray-specific - Variable.get_axis_num - Variable.isel - Variable.to_dict +Variable.get_axis_num +Variable.isel +Variable.to_dict # Reductions - Variable.reduce - Variable.all - Variable.any - Variable.argmax - Variable.argmin - Variable.count - Variable.max - Variable.mean - Variable.median - Variable.min - Variable.prod - Variable.quantile - Variable.std - Variable.sum - Variable.var +Variable.reduce +Variable.all +Variable.any +Variable.argmax +Variable.argmin +Variable.count +Variable.max +Variable.mean +Variable.median +Variable.min +Variable.prod +Variable.quantile +Variable.std +Variable.sum +Variable.var # Accumulate - Variable.cumprod - Variable.cumsum +Variable.cumprod +Variable.cumsum # numpy-like Methods - Variable.astype - Variable.copy - Variable.clip - Variable.round - Variable.item - Variable.where +Variable.astype +Variable.copy +Variable.clip +Variable.round +Variable.item +Variable.where # Reordering/Reshaping - Variable.squeeze - Variable.pad - Variable.roll - Variable.shift - +Variable.squeeze +Variable.pad +Variable.roll +Variable.shift ``` #### methods to be renamed from xarray.Variable ```python # Xarray-specific - Variable.concat # create two functions, one as the equivalent of `np.stack` and other for `np.concat` +Variable.concat # create two functions, one as the equivalent of `np.stack` and other for `np.concat` - # Given how niche these are, these would be better as functions than methods. - # We could also keep these in Xarray, at least for now. If we don't think people will use functionality outside of Xarray it probably is not worth the trouble of porting it (including documentation, etc). - Variable.coarsen # This should probably be called something like coarsen_reduce. 
- Variable.coarsen_reshape - Variable.rolling_window +# Given how niche these are, these would be better as functions than methods. +# We could also keep these in Xarray, at least for now. If we don't think people will use functionality outside of Xarray it probably is not worth the trouble of porting it (including documentation, etc). +Variable.coarsen # This should probably be called something like coarsen_reduce. +Variable.coarsen_reshape +Variable.rolling_window - Variable.set_dims # split this into broadcast_to and expand_dims +Variable.set_dims # split this into broadcast_to and expand_dims # Reordering/Reshaping - Variable.stack # To avoid confusion with np.stack, let's call this stack_dims. - Variable.transpose # Could consider calling this permute_dims, like the [array API standard](https://data-apis.org/array-api/2022.12/API_specification/manipulation_functions.html#objects-in-api) - Variable.unstack # Likewise, maybe call this unstack_dims? +Variable.stack # To avoid confusion with np.stack, let's call this stack_dims. +Variable.transpose # Could consider calling this permute_dims, like the [array API standard](https://data-apis.org/array-api/2022.12/API_specification/manipulation_functions.html#objects-in-api) +Variable.unstack # Likewise, maybe call this unstack_dims? ``` #### methods to be removed from xarray.Variable ```python # Testing - Variable.broadcast_equals - Variable.equals - Variable.identical - Variable.no_conflicts +Variable.broadcast_equals +Variable.equals +Variable.identical +Variable.no_conflicts # Lazy data handling - Variable.compute # We can probably omit this method for now, too, given that dask.compute() uses a protocol. The other concern is that different array libraries have different notions of "compute" and this one is rather Dask specific, including conversion from Dask to NumPy arrays. For example, in JAX every operation executes eagerly, but in a non-blocking fashion, and you need to call jax.block_until_ready() to ensure computation is finished. - Variable.load # Could remove? compute vs load is a common source of confusion. +Variable.compute # We can probably omit this method for now, too, given that dask.compute() uses a protocol. The other concern is that different array libraries have different notions of "compute" and this one is rather Dask specific, including conversion from Dask to NumPy arrays. For example, in JAX every operation executes eagerly, but in a non-blocking fashion, and you need to call jax.block_until_ready() to ensure computation is finished. +Variable.load # Could remove? compute vs load is a common source of confusion. # Xarray-specific - Variable.to_index - Variable.to_index_variable - Variable.to_variable - Variable.to_base_variable - Variable.to_coord +Variable.to_index +Variable.to_index_variable +Variable.to_variable +Variable.to_base_variable +Variable.to_coord - Variable.rank # Uses bottleneck. Delete? Could use https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.rankdata.html instead +Variable.rank # Uses bottleneck. Delete? Could use https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.rankdata.html instead # numpy-like Methods - Variable.conjugate # .conj is enough - Variable.__array_wrap__ # This is a very old NumPy protocol for duck arrays. We don't need it now that we have `__array_ufunc__` and `__array_function__` +Variable.conjugate # .conj is enough +Variable.__array_wrap__ # This is a very old NumPy protocol for duck arrays. 
We don't need it now that we have `__array_ufunc__` and `__array_function__` # Encoding - Variable.reset_encoding - +Variable.reset_encoding ``` #### Attributes to be preserved from xarray.Variable ```python # Properties - Variable.attrs - Variable.chunks - Variable.data - Variable.dims - Variable.dtype - - Variable.nbytes - Variable.ndim - Variable.shape - Variable.size - Variable.sizes - - Variable.T - Variable.real - Variable.imag - Variable.conj +Variable.attrs +Variable.chunks +Variable.data +Variable.dims +Variable.dtype + +Variable.nbytes +Variable.ndim +Variable.shape +Variable.size +Variable.sizes + +Variable.T +Variable.real +Variable.imag +Variable.conj ``` #### Attributes to be renamed from xarray.Variable @@ -333,12 +330,10 @@ Questions: #### Attributes to be removed from xarray.Variable ```python - - Variable.values # Probably also remove -- this is a legacy from before Xarray supported dask arrays. ".data" is enough. +Variable.values # Probably also remove -- this is a legacy from before Xarray supported dask arrays. ".data" is enough. # Encoding - Variable.encoding - +Variable.encoding ``` ### Appendix: Implementation Details @@ -347,17 +342,16 @@ Questions: ```python class VariableArithmetic( - ImplementsArrayReduce, - IncludeReduceMethods, - IncludeCumMethods, - IncludeNumpySameMethods, - SupportsArithmetic, - VariableOpsMixin, + ImplementsArrayReduce, + IncludeReduceMethods, + IncludeCumMethods, + IncludeNumpySameMethods, + SupportsArithmetic, + VariableOpsMixin, ): - __slots__ = () - # prioritize our operations over those of numpy.ndarray (priority=0) - __array_priority__ = 50 - + __slots__ = () + # prioritize our operations over those of numpy.ndarray (priority=0) + __array_priority__ = 50 ``` - Move over `_typed_ops.VariableOpsMixin` @@ -369,7 +363,6 @@ class VariableArithmetic( - The Variable constructor will need to be rewritten to no longer accept tuples, encodings, etc. These details should be handled at the Xarray data structure level. - What happens to `duck_array_ops?` - What about Variable.chunk and "chunk managers"? - - Could this functionality be left in Xarray proper for now? Alternative array types like JAX also have some notion of "chunks" for parallel arrays, but the details differ in a number of ways from the Dask/Cubed. - Perhaps variable.chunk/load methods should become functions defined in xarray that convert Variable objects. This is easy so long as xarray can reach in and replace .data diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst index 9a6037cf3c4..5b9fa70d6b7 100644 --- a/doc/api-hidden.rst +++ b/doc/api-hidden.rst @@ -515,22 +515,6 @@ CFTimeIndex.values CFTimeIndex.year - Index.from_variables - Index.concat - Index.stack - Index.unstack - Index.create_variables - Index.should_add_coord_to_array - Index.to_pandas_index - Index.isel - Index.sel - Index.join - Index.reindex_like - Index.equals - Index.roll - Index.rename - Index.copy - indexes.RangeIndex.start indexes.RangeIndex.stop indexes.RangeIndex.step diff --git a/doc/api.rst b/doc/api.rst index df6e87c0cf8..b46d807e8d4 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -978,6 +978,40 @@ and DataTree objects, respectively. core.coordinates.DataArrayCoordinates core.coordinates.DataTreeCoordinates +Indexes +======= + +Default, pandas-backed indexes built-in to Xarray: + +.. autosummary:: + :toctree: generated/ + + indexes.PandasIndex + indexes.PandasMultiIndex + + +More complex indexes built-in to Xarray: + +.. 
autosummary:: + :toctree: generated/ + + CFTimeIndex + indexes.RangeIndex + indexes.NDPointIndex + + +Creating indexes +---------------- +.. autosummary:: + :toctree: generated/ + + cftime_range + date_range + date_range_like + indexes.RangeIndex.arange + indexes.RangeIndex.linspace + + Universal functions =================== @@ -1571,30 +1605,40 @@ Custom Indexes ============== .. currentmodule:: xarray -.. autosummary:: - :toctree: generated/ +Building custom indexes +----------------------- - CFTimeIndex - indexes.RangeIndex - indexes.CoordinateTransformIndex +These classes are building blocks for more complex Indexes: -Creating custom indexes ------------------------ .. autosummary:: :toctree: generated/ - cftime_range - date_range - date_range_like - indexes.RangeIndex.arange - indexes.RangeIndex.linspace + indexes.CoordinateTransform + indexes.CoordinateTransformIndex + indexes.NDPointIndex + indexes.TreeAdapter + +The Index base class for building custom indexes: -Building custom indexes ------------------------ .. autosummary:: :toctree: generated/ - indexes.CoordinateTransform + Index.from_variables + Index.concat + Index.stack + Index.unstack + Index.create_variables + Index.should_add_coord_to_array + Index.to_pandas_index + Index.isel + Index.sel + Index.join + Index.reindex_like + Index.equals + Index.roll + Index.rename + Index.copy + Tutorial ======== @@ -1701,11 +1745,6 @@ Advanced API .. Missing: .. ``DataTree.set_close`` -Default, pandas-backed indexes built-in Xarray: - - indexes.PandasIndex - indexes.PandasMultiIndex - These backends provide a low-level interface for lazily loading data from external file-formats or protocols, and can be manually invoked to create arguments for the ``load_store`` and ``dump_to_store`` Dataset methods: diff --git a/doc/combined.json b/doc/combined.json index f37a0aa72b8..f1245b57291 100644 --- a/doc/combined.json +++ b/doc/combined.json @@ -10,9 +10,6 @@ "x/0": ["saved_on_disk.h5", 8352, 32], "y/.zarray": "{\"chunks\":[5],\"compressor\":null,\"dtype\":\"`_. +- Fix ``KeyError`` when passing a ``dim`` argument different from the default to ``convert_calendar`` (:pull:`10544`). + By `Eric Jansen `_. + + +Documentation +~~~~~~~~~~~~~ + + +Internal Changes +~~~~~~~~~~~~~~~~ + + +.. _whats-new.2025.07.1: + +v2025.07.1 (July 09, 2025) +-------------------------- + +This release brings a lot of improvements to flexible indexes functionality, including new classes +to ease building of new indexes with custom coordinate transforms (:py:class:`indexes.CoordinateTransformIndex`) +and tree-like index structures (:py:class:`indexes.NDPointIndex`). +See a `new gallery `_ showing off the possibilities enabled by flexible indexes. + +Thanks to the 7 contributors to this release: +Benoit Bovy, Deepak Cherian, Dhruva Kumar Kaushal, Dimitri Papadopoulos Orfanos, Illviljan, Justus Magin and Tom Nicholas + +New Features +~~~~~~~~~~~~ +- New :py:class:`xarray.indexes.NDPointIndex`, which by default uses :py:class:`scipy.spatial.KDTree` under the hood for + the selection of irregular, n-dimensional data (:pull:`10478`). + By `Benoit Bovy `_. +- Allow skipping the creation of default indexes when opening datasets (:pull:`8051`). + By `Benoit Bovy `_ and `Justus Magin `_. + +Bug fixes +~~~~~~~~~ + +- :py:meth:`Dataset.set_xindex` now raises a helpful error when a custom index + creates extra variables that don't match the provided coordinate names, instead + of silently ignoring them. 
The error message suggests using the factory method + pattern with :py:meth:`xarray.Coordinates.from_xindex` and + :py:meth:`Dataset.assign_coords` for advanced use cases (:issue:`10499`). + By `Dhruva Kumar Kaushal `_. + +Documentation +~~~~~~~~~~~~~ +- A `new gallery `_ showing off the possibilities enabled by flexible indexes. + +Internal Changes +~~~~~~~~~~~~~~~~ + +- Refactored the ``PandasIndexingAdapter`` and + ``CoordinateTransformIndexingAdapter`` internal indexing classes. Coordinate + variables that wrap a :py:class:`pandas.RangeIndex`, a + :py:class:`pandas.MultiIndex` or a + :py:class:`xarray.indexes.CoordinateTransform` are now displayed as lazy variables + in the Xarray data reprs (:pull:`10355`). + By `Benoit Bovy `_. + .. _whats-new.2025.07.0: v2025.07.0 (Jul 3, 2025) diff --git a/pyproject.toml b/pyproject.toml index 8cfbb6851b3..5e5fd00328b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -262,6 +262,7 @@ extend-select = [ "PIE", # flake8-pie "TID", # flake8-tidy-imports (absolute imports) "PYI", # flake8-pyi + "SIM", # flake8-simplify "FLY", # flynt "I", # isort "PERF", # Perflint @@ -283,6 +284,11 @@ ignore = [ "PIE790", # unnecessary pass statement "PYI019", # use `Self` instead of custom TypeVar "PYI041", # use `float` instead of `int | float` + "SIM102", # use a single `if` statement instead of nested `if` statements + "SIM108", # use ternary operator instead of `if`-`else`-block + "SIM117", # use a single `with` statement instead of nested `with` statements + "SIM118", # use `key in dict` instead of `key in dict.keys()` + "SIM300", # yoda condition detected "PERF203", # try-except within a loop incurs performance overhead "E402", # module level import not at top of file "E731", # do not assign a lambda expression, use a def diff --git a/xarray/backends/api.py b/xarray/backends/api.py index b80ec927b1e..cfd3ff7fc0f 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -36,6 +36,7 @@ from xarray.backends.locks import _get_scheduler from xarray.coders import CFDatetimeCoder, CFTimedeltaCoder from xarray.core import indexing +from xarray.core.coordinates import Coordinates from xarray.core.dataarray import DataArray from xarray.core.dataset import Dataset from xarray.core.datatree import DataTree @@ -379,6 +380,15 @@ def _chunk_ds( return backend_ds._replace(variables) +def _maybe_create_default_indexes(ds): + to_index = { + name: coord.variable + for name, coord in ds.coords.items() + if coord.dims == (name,) and name not in ds.xindexes + } + return ds.assign_coords(Coordinates(to_index)) + + def _dataset_from_backend_dataset( backend_ds, filename_or_obj, @@ -389,6 +399,7 @@ def _dataset_from_backend_dataset( inline_array, chunked_array_type, from_array_kwargs, + create_default_indexes, **extra_tokens, ): if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}: @@ -397,11 +408,15 @@ def _dataset_from_backend_dataset( ) _protect_dataset_variables_inplace(backend_ds, cache) - if chunks is None: - ds = backend_ds + + if create_default_indexes: + ds = _maybe_create_default_indexes(backend_ds) else: + ds = backend_ds + + if chunks is not None: ds = _chunk_ds( - backend_ds, + ds, filename_or_obj, engine, chunks, @@ -434,6 +449,7 @@ def _datatree_from_backend_datatree( inline_array, chunked_array_type, from_array_kwargs, + create_default_indexes, **extra_tokens, ): if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}: @@ -442,9 +458,11 @@ def _datatree_from_backend_datatree( ) _protect_datatree_variables_inplace(backend_tree, 
cache) - if chunks is None: - tree = backend_tree + if create_default_indexes: + tree = backend_tree.map_over_datasets(_maybe_create_default_indexes) else: + tree = backend_tree + if chunks is not None: tree = DataTree.from_dict( { path: _chunk_ds( @@ -459,11 +477,12 @@ def _datatree_from_backend_datatree( node=path, **extra_tokens, ) - for path, [node] in group_subtrees(backend_tree) + for path, [node] in group_subtrees(tree) }, - name=backend_tree.name, + name=tree.name, ) + if create_default_indexes or chunks is not None: for path, [node] in group_subtrees(backend_tree): tree[path].set_close(node._close) @@ -497,6 +516,7 @@ def open_dataset( concat_characters: bool | Mapping[str, bool] | None = None, decode_coords: Literal["coordinates", "all"] | bool | None = None, drop_variables: str | Iterable[str] | None = None, + create_default_indexes: bool = True, inline_array: bool = False, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, @@ -610,6 +630,13 @@ def open_dataset( A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or inconsistent values. + create_default_indexes : bool, default: True + If True, create pandas indexes for :term:`dimension coordinates `, + which loads the coordinate data into memory. Set it to False if you want to avoid loading + data into memory. + + Note that backends can still choose to create other indexes. If you want to control that, + please refer to the backend's documentation. inline_array: bool, default: False How to include the array in the dask task graph. By default(``inline_array=False``) the array is included in a task by @@ -702,6 +729,7 @@ def open_dataset( chunked_array_type, from_array_kwargs, drop_variables=drop_variables, + create_default_indexes=create_default_indexes, **decoders, **kwargs, ) @@ -725,6 +753,7 @@ def open_dataarray( concat_characters: bool | None = None, decode_coords: Literal["coordinates", "all"] | bool | None = None, drop_variables: str | Iterable[str] | None = None, + create_default_indexes: bool = True, inline_array: bool = False, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, @@ -833,6 +862,13 @@ def open_dataarray( A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or inconsistent values. + create_default_indexes : bool, default: True + If True, create pandas indexes for :term:`dimension coordinates `, + which loads the coordinate data into memory. Set it to False if you want to avoid loading + data into memory. + + Note that backends can still choose to create other indexes. If you want to control that, + please refer to the backend's documentation. inline_array: bool, default: False How to include the array in the dask task graph. 
By default(``inline_array=False``) the array is included in a task by @@ -890,6 +926,7 @@ def open_dataarray( chunks=chunks, cache=cache, drop_variables=drop_variables, + create_default_indexes=create_default_indexes, inline_array=inline_array, chunked_array_type=chunked_array_type, from_array_kwargs=from_array_kwargs, @@ -946,6 +983,7 @@ def open_datatree( concat_characters: bool | Mapping[str, bool] | None = None, decode_coords: Literal["coordinates", "all"] | bool | None = None, drop_variables: str | Iterable[str] | None = None, + create_default_indexes: bool = True, inline_array: bool = False, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, @@ -1055,6 +1093,13 @@ def open_datatree( A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or inconsistent values. + create_default_indexes : bool, default: True + If True, create pandas indexes for :term:`dimension coordinates `, + which loads the coordinate data into memory. Set it to False if you want to avoid loading + data into memory. + + Note that backends can still choose to create other indexes. If you want to control that, + please refer to the backend's documentation. inline_array: bool, default: False How to include the array in the dask task graph. By default(``inline_array=False``) the array is included in a task by @@ -1148,6 +1193,7 @@ def open_datatree( chunked_array_type, from_array_kwargs, drop_variables=drop_variables, + create_default_indexes=create_default_indexes, **decoders, **kwargs, ) @@ -1175,6 +1221,7 @@ def open_groups( concat_characters: bool | Mapping[str, bool] | None = None, decode_coords: Literal["coordinates", "all"] | bool | None = None, drop_variables: str | Iterable[str] | None = None, + create_default_indexes: bool = True, inline_array: bool = False, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, @@ -1286,6 +1333,13 @@ def open_groups( A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or inconsistent values. + create_default_indexes : bool, default: True + If True, create pandas indexes for :term:`dimension coordinates `, + which loads the coordinate data into memory. Set it to False if you want to avoid loading + data into memory. + + Note that backends can still choose to create other indexes. If you want to control that, + please refer to the backend's documentation. inline_array: bool, default: False How to include the array in the dask task graph. 
By default(``inline_array=False``) the array is included in a task by @@ -1381,6 +1435,7 @@ def open_groups( chunked_array_type, from_array_kwargs, drop_variables=drop_variables, + create_default_indexes=create_default_indexes, **decoders, **kwargs, ) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index f478c2b882c..8b56c8a2bf9 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -560,11 +560,10 @@ def _infer_dtype(array, name=None): native_dtypes = set(np.vectorize(type, otypes=[object])(array.ravel())) if len(native_dtypes) > 1 and native_dtypes != {bytes, str}: + native_dtype_names = ", ".join(x.__name__ for x in native_dtypes) raise ValueError( - "unable to infer dtype on variable {!r}; object array " - "contains mixed native types: {}".format( - name, ", ".join(x.__name__ for x in native_dtypes) - ) + f"unable to infer dtype on variable {name!r}; object array " + f"contains mixed native types: {native_dtype_names}" ) element = array[(0,) * array.ndim] diff --git a/xarray/backends/store.py b/xarray/backends/store.py index b1b3956ca8e..de52aa193ed 100644 --- a/xarray/backends/store.py +++ b/xarray/backends/store.py @@ -9,6 +9,7 @@ AbstractDataStore, BackendEntrypoint, ) +from xarray.core.coordinates import Coordinates from xarray.core.dataset import Dataset if TYPE_CHECKING: @@ -36,6 +37,7 @@ def open_dataset( concat_characters=True, decode_coords=True, drop_variables: str | Iterable[str] | None = None, + set_indexes: bool = True, use_cftime=None, decode_timedelta=None, ) -> Dataset: @@ -56,8 +58,19 @@ def open_dataset( decode_timedelta=decode_timedelta, ) - ds = Dataset(vars, attrs=attrs) - ds = ds.set_coords(coord_names.intersection(vars)) + # split data and coordinate variables (promote dimension coordinates) + data_vars = {} + coord_vars = {} + for name, var in vars.items(): + if name in coord_names or var.dims == (name,): + coord_vars[name] = var + else: + data_vars[name] = var + + # explicit Coordinates object with no index passed + coords = Coordinates(coord_vars, indexes={}) + + ds = Dataset(data_vars, coords=coords, attrs=attrs) ds.set_close(filename_or_obj.close) ds.encoding = encoding diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 48405b906cd..8b26a6b40ec 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1347,6 +1347,7 @@ def open_zarr( use_zarr_fill_value_as_mask=None, chunked_array_type: str | None = None, from_array_kwargs: dict[str, Any] | None = None, + create_default_indexes=True, **kwargs, ): """Load and decode a dataset from a Zarr store. @@ -1457,6 +1458,13 @@ def open_zarr( chunked arrays, via whichever chunk manager is specified through the ``chunked_array_type`` kwarg. Defaults to ``{'manager': 'dask'}``, meaning additional kwargs will be passed eventually to :py:func:`dask.array.from_array`. Experimental API that should not be relied upon. + create_default_indexes : bool, default: True + If True, create pandas indexes for :term:`dimension coordinates `, + which loads the coordinate data into memory. Set it to False if you want to avoid loading + data into memory. + + Note that backends can still choose to create other indexes. If you want to control that, + please refer to the backend's documentation. 
Returns ------- @@ -1513,6 +1521,7 @@ def open_zarr( engine="zarr", chunks=chunks, drop_variables=drop_variables, + create_default_indexes=create_default_indexes, chunked_array_type=chunked_array_type, from_array_kwargs=from_array_kwargs, backend_kwargs=backend_kwargs, diff --git a/xarray/coding/calendar_ops.py b/xarray/coding/calendar_ops.py index 5fdd106e179..a6f0254a42d 100644 --- a/xarray/coding/calendar_ops.py +++ b/xarray/coding/calendar_ops.py @@ -213,7 +213,7 @@ def convert_calendar( out[dim] = new_times # Remove NaN that where put on invalid dates in target calendar - out = out.sel(time=out[dim].notnull()) + out = out.sel({dim: out[dim].notnull()}) if use_cftime: # Reassign times to ensure time index of output is a CFTimeIndex diff --git a/xarray/coding/times.py b/xarray/coding/times.py index d6567ba4c61..49a2747510a 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -1517,20 +1517,20 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: time_unit = self.time_unit else: if self._emit_decode_timedelta_future_warning: + var_string = f"the variable {name!r}" if name else "" emit_user_level_warning( "In a future version, xarray will not decode " - "timedelta values based on the presence of a " - "timedelta-like units attribute by default. Instead " - "it will rely on the presence of a timedelta64 dtype " - "attribute, which is now xarray's default way of " - "encoding timedelta64 values. To continue decoding " - "timedeltas based on the presence of a timedelta-like " - "units attribute, users will need to explicitly " - "opt-in by passing True or " - "CFTimedeltaCoder(decode_via_units=True) to " - "decode_timedelta. To silence this warning, set " - "decode_timedelta to True, False, or a " - "'CFTimedeltaCoder' instance.", + f"{var_string} into a timedelta64 dtype based on the " + "presence of a timedelta-like 'units' attribute by " + "default. Instead it will rely on the presence of a " + "timedelta64 'dtype' attribute, which is now xarray's " + "default way of encoding timedelta64 values.\n" + "To continue decoding into a timedelta64 dtype, either " + "set `decode_timedelta=True` when opening this " + "dataset, or add the attribute " + "`dtype='timedelta64[ns]'` to this variable on disk.\n" + "To opt-in to future behavior, set " + "`decode_timedelta=False`.", FutureWarning, ) if self.time_unit is None: diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 3b7be898ccf..eff08c74500 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -164,10 +164,8 @@ def _check_fill_values(attrs, name, dtype): Issue SerializationWarning if appropriate. 
""" raw_fill_dict = {} - [ + for attr in ("missing_value", "_FillValue"): pop_to(attrs, raw_fill_dict, attr, name=name) - for attr in ("missing_value", "_FillValue") - ] encoded_fill_values = set() for k in list(raw_fill_dict): v = raw_fill_dict[k] @@ -376,11 +374,9 @@ def decode(self, variable: Variable, name: T_Name = None): dims, data, attrs, encoding = unpack_for_decoding(variable) - # Even if _Unsigned is use, retain on-disk _FillValue - [ + # Even if _Unsigned is used, retain on-disk _FillValue + for attr, value in raw_fill_dict.items(): safe_setitem(encoding, attr, value, name=name) - for attr, value in raw_fill_dict.items() - ] if "_Unsigned" in attrs: unsigned = pop_to(attrs, encoding, "_Unsigned") diff --git a/xarray/computation/apply_ufunc.py b/xarray/computation/apply_ufunc.py index 678c702f3f3..00a06e12d63 100644 --- a/xarray/computation/apply_ufunc.py +++ b/xarray/computation/apply_ufunc.py @@ -141,8 +141,13 @@ def __repr__(self): return f"{type(self).__name__}({list(self.input_core_dims)!r}, {list(self.output_core_dims)!r})" def __str__(self): - lhs = ",".join("({})".format(",".join(dims)) for dims in self.input_core_dims) - rhs = ",".join("({})".format(",".join(dims)) for dims in self.output_core_dims) + comma_separated = ",".join + lhs = comma_separated( + f"({comma_separated(dims)})" for dims in self.input_core_dims + ) + rhs = comma_separated( + f"({comma_separated(dims)})" for dims in self.output_core_dims + ) return f"{lhs}->{rhs}" def to_gufunc_string(self, exclude_dims=frozenset()): diff --git a/xarray/computation/rolling.py b/xarray/computation/rolling.py index e7718560559..adb8a5e6380 100644 --- a/xarray/computation/rolling.py +++ b/xarray/computation/rolling.py @@ -132,7 +132,7 @@ def __repr__(self) -> str: """provide a nice str repr of our rolling object""" attrs = ",".join( - "{k}->{v}{c}".format(k=k, v=w, c="(center)" if c else "") + f"{k}->{w}{'(center)' if c else ''}" for k, w, c in zip(self.dim, self.window, self.center, strict=True) ) return f"{self.__class__.__name__} [{attrs}]" diff --git a/xarray/core/accessor_str.py b/xarray/core/accessor_str.py index 0bab92963a5..f16dbe02f32 100644 --- a/xarray/core/accessor_str.py +++ b/xarray/core/accessor_str.py @@ -662,10 +662,11 @@ def format( """ args = tuple(self._stringify(x) for x in args) kwargs = {key: self._stringify(val) for key, val in kwargs.items()} - func = lambda x, *args, **kwargs: self._obj.dtype.type.format( - x, *args, **kwargs + return self._apply( + func=self._obj.dtype.type.format, + func_args=args, + func_kwargs={"kwargs": kwargs}, ) - return self._apply(func=func, func_args=args, func_kwargs={"kwargs": kwargs}) def capitalize(self) -> T_DataArray: """ diff --git a/xarray/core/coordinate_transform.py b/xarray/core/coordinate_transform.py index 94b3b109e1e..02cbbc11caa 100644 --- a/xarray/core/coordinate_transform.py +++ b/xarray/core/coordinate_transform.py @@ -9,8 +9,9 @@ class CoordinateTransform: """Abstract coordinate transform with dimension & coordinate names. - EXPERIMENTAL (not ready for public use yet). - + .. caution:: + This API is experimental and subject to change. Please report any bugs or surprising + behaviour you encounter. """ coord_names: tuple[Hashable, ...] @@ -80,7 +81,7 @@ def equals(self, other: CoordinateTransform, **kwargs) -> bool: Parameters ---------- other : CoordinateTransform - The other Index object to compare with this object. + The other CoordinateTransform object to compare with this object. 
exclude : frozenset of hashable, optional Dimensions excluded from checking. It is None by default, (i.e., when this method is not called in the context of alignment). For a diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 0bfb0b7ab1c..73b0eb19a64 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -6915,7 +6915,7 @@ def groupby( :ref:`groupby` Users guide explanation of how to group and bin data. - :doc:`xarray-tutorial:intermediate/01-high-level-computation-patterns` + :doc:`xarray-tutorial:intermediate/computation/01-high-level-computation-patterns` Tutorial on :py:func:`~xarray.DataArray.Groupby` for windowed computation :doc:`xarray-tutorial:fundamentals/03.2_groupby_with_xarray` diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 6de626a159b..26db282c3df 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4940,6 +4940,20 @@ def set_xindex( if isinstance(index, PandasMultiIndex): coord_names = [index.dim] + list(coord_names) + # Check for extra variables that don't match the coordinate names + extra_vars = set(new_coord_vars) - set(coord_names) + if extra_vars: + extra_vars_str = ", ".join(f"'{name}'" for name in extra_vars) + coord_names_str = ", ".join(f"'{name}'" for name in coord_names) + raise ValueError( + f"The index created extra variables {extra_vars_str} that are not " + f"in the list of coordinates {coord_names_str}. " + f"Use a factory method pattern instead:\n" + f" index = {index_cls.__name__}.from_variables(ds, {list(coord_names)!r})\n" + f" coords = xr.Coordinates.from_xindex(index)\n" + f" ds = ds.assign_coords(coords)" + ) + variables: dict[Hashable, Variable] indexes: dict[Hashable, Index] @@ -5793,11 +5807,10 @@ def drop_vars( other_names.update(idx_other_names) if other_names: names_set |= set(other_names) - warnings.warn( + emit_user_level_warning( f"Deleting a single level of a MultiIndex is deprecated. Previously, this deleted all levels of a MultiIndex. " f"Please also drop the following variables: {other_names!r} to avoid an error in the future.", DeprecationWarning, - stacklevel=2, ) assert_no_index_corrupted(self.xindexes, names_set) @@ -9941,7 +9954,7 @@ def groupby( :ref:`groupby` Users guide explanation of how to group and bin data. - :doc:`xarray-tutorial:intermediate/01-high-level-computation-patterns` + :doc:`xarray-tutorial:intermediate/computation/01-high-level-computation-patterns` Tutorial on :py:func:`~xarray.Dataset.Groupby` for windowed computation. 
:doc:`xarray-tutorial:fundamentals/03.2_groupby_with_xarray` diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 14e70b0550c..3a06cf18542 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -20,7 +20,11 @@ from xarray.core.datatree_render import RenderDataTree from xarray.core.duck_array_ops import array_all, array_any, array_equiv, astype, ravel from xarray.core.extension_array import PandasExtensionArray -from xarray.core.indexing import MemoryCachedArray +from xarray.core.indexing import ( + BasicIndexer, + ExplicitlyIndexed, + MemoryCachedArray, +) from xarray.core.options import OPTIONS, _get_boolean_with_default from xarray.core.treenode import group_subtrees from xarray.core.utils import is_duck_array @@ -87,6 +91,8 @@ def first_n_items(array, n_desired): if n_desired < array.size: indexer = _get_indexer_at_least_n_items(array.shape, n_desired, from_end=False) + if isinstance(array, ExplicitlyIndexed): + indexer = BasicIndexer(indexer) array = array[indexer] # We pass variable objects in to handle indexing @@ -111,6 +117,8 @@ def last_n_items(array, n_desired): if n_desired < array.size: indexer = _get_indexer_at_least_n_items(array.shape, n_desired, from_end=True) + if isinstance(array, ExplicitlyIndexed): + indexer = BasicIndexer(indexer) array = array[indexer] # We pass variable objects in to handle indexing @@ -340,17 +348,17 @@ def summarize_variable( first_col = pretty_print(first_col, col_width) if variable.dims: - dims_str = "({}) ".format(", ".join(map(str, variable.dims))) + dims_str = ", ".join(map(str, variable.dims)) + dims_str = f"({dims_str}) " else: dims_str = "" - nbytes_str = f" {render_human_readable_nbytes(variable.nbytes)}" - front_str = f"{first_col}{dims_str}{variable.dtype}{nbytes_str} " + front_str = f"{first_col}{dims_str}{variable.dtype} {render_human_readable_nbytes(variable.nbytes)} " values_width = max_width - len(front_str) values_str = inline_variable_array_repr(variable, values_width) - return front_str + values_str + return f"{front_str}{values_str}" def summarize_attr(key, value, col_width=None): @@ -659,6 +667,7 @@ def short_array_repr(array): def short_data_repr(array): """Format "data" for DataArray and Variable.""" internal_data = getattr(array, "variable", array)._data + if isinstance(array, np.ndarray): return short_array_repr(array) elif is_duck_array(internal_data): @@ -989,9 +998,10 @@ def diff_array_repr(a, b, compat): ): summary.append(coords_diff) - if compat == "identical": - if attrs_diff := diff_attrs_repr(a.attrs, b.attrs, compat): - summary.append(attrs_diff) + if compat == "identical" and ( + attrs_diff := diff_attrs_repr(a.attrs, b.attrs, compat) + ): + summary.append(attrs_diff) return "\n".join(summary) @@ -1029,9 +1039,10 @@ def diff_dataset_repr(a, b, compat): ): summary.append(data_diff) - if compat == "identical": - if attrs_diff := diff_attrs_repr(a.attrs, b.attrs, compat): - summary.append(attrs_diff) + if compat == "identical" and ( + attrs_diff := diff_attrs_repr(a.attrs, b.attrs, compat) + ): + summary.append(attrs_diff) return "\n".join(summary) diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index 1756fb54c1b..c233c6911e4 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -768,10 +768,12 @@ def concat( if not indexes: coord_dtype = None - elif len(set(idx.coord_dtype for idx in indexes)) == 1: - coord_dtype = indexes[0].coord_dtype else: - coord_dtype = np.result_type(*[idx.coord_dtype for idx in indexes]) + indexes_coord_dtypes = 
{idx.coord_dtype for idx in indexes} + if len(indexes_coord_dtypes) == 1: + coord_dtype = next(iter(indexes_coord_dtypes)) + else: + coord_dtype = np.result_type(*indexes_coord_dtypes) return cls(new_pd_index, dim=dim, coord_dtype=coord_dtype) @@ -1247,7 +1249,7 @@ def create_variables( level = name dtype = self.level_coords_dtype[name] # type: ignore[index] # TODO: are Hashables ok? - var = variables.get(name, None) + var = variables.get(name) if var is not None: attrs = var.attrs encoding = var.encoding @@ -1453,14 +1455,15 @@ def rename(self, name_dict, dims_dict): class CoordinateTransformIndex(Index): """Helper class for creating Xarray indexes based on coordinate transforms. - EXPERIMENTAL (not ready for public use yet). - - wraps a :py:class:`CoordinateTransform` instance - takes care of creating the index (lazy) coordinates - supports point-wise label-based selection - supports exact alignment only, by comparing indexes based on their transform (not on their explicit coordinate labels) + .. caution:: + This API is experimental and subject to change. Please report any bugs or surprising + behaviour you encounter. """ transform: CoordinateTransform diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 35278efdeaf..8e4458fb88f 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -9,7 +9,6 @@ from contextlib import suppress from dataclasses import dataclass, field from datetime import timedelta -from html import escape from typing import TYPE_CHECKING, Any, cast, overload import numpy as np @@ -20,7 +19,6 @@ from xarray.core import duck_array_ops from xarray.core.coordinate_transform import CoordinateTransform from xarray.core.nputils import NumpyVIndexAdapter -from xarray.core.options import OPTIONS from xarray.core.types import T_Xarray from xarray.core.utils import ( NDArrayMixin, @@ -1775,10 +1773,25 @@ def __init__( else: self._dtype = np.dtype(cast(DTypeLike, dtype)) + @property + def _in_memory(self) -> bool: + # prevent costly conversion of a memory-saving pd.RangeIndex into a + # large numpy array. + return not isinstance(self.array, pd.RangeIndex) + @property def dtype(self) -> np.dtype | pd.api.extensions.ExtensionDtype: # type: ignore[override] return self._dtype + def _get_numpy_dtype(self, dtype: np.typing.DTypeLike | None = None) -> np.dtype: + if dtype is None: + if is_valid_numpy_dtype(self.dtype): + return cast(np.dtype, self.dtype) + else: + return get_valid_numpy_dtype(self.array) + else: + return np.dtype(dtype) + def __array__( self, dtype: np.typing.DTypeLike | None = None, @@ -1786,11 +1799,9 @@ def __array__( *, copy: bool | None = None, ) -> np.ndarray: - if dtype is None and is_valid_numpy_dtype(self.dtype): - dtype = cast(np.dtype, self.dtype) - else: - dtype = get_valid_numpy_dtype(self.array) + dtype = self._get_numpy_dtype(dtype) array = self.array + if isinstance(array, pd.PeriodIndex): with suppress(AttributeError): # this might not be public API @@ -1834,98 +1845,61 @@ def _convert_scalar(self, item) -> np.ndarray: # numpy fails to convert pd.Timestamp to np.datetime64[ns] item = np.asarray(item.to_datetime64()) elif self.dtype != object: - dtype = self.dtype - if pd.api.types.is_extension_array_dtype(dtype): - dtype = get_valid_numpy_dtype(self.array) - item = np.asarray(item, dtype=cast(np.dtype, dtype)) + dtype = self._get_numpy_dtype() + item = np.asarray(item, dtype=dtype) # as for numpy.ndarray indexing, we always want the result to be # a NumPy array. 
return to_0d_array(item) - def _prepare_key(self, key: Any | tuple[Any, ...]) -> tuple[Any, ...]: - if isinstance(key, tuple) and len(key) == 1: + def _index_get( + self, indexer: ExplicitIndexer, func_name: str + ) -> PandasIndexingAdapter | np.ndarray: + key = indexer.tuple + + if len(key) == 1: # unpack key so it can index a pandas.Index object (pandas.Index # objects don't like tuples) (key,) = key - return key + # if multidimensional key, convert the index to numpy array and index the latter + if getattr(key, "ndim", 0) > 1: + indexable = NumpyIndexingAdapter(np.asarray(self)) + return getattr(indexable, func_name)(indexer) + + # otherwise index the pandas index then re-wrap or convert the result + result = self.array[key] - def _handle_result( - self, result: Any - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): if isinstance(result, pd.Index): return type(self)(result, dtype=self.dtype) else: return self._convert_scalar(result) - def _oindex_get( - self, indexer: OuterIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): - key = self._prepare_key(indexer.tuple) - - if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional - indexable = NumpyIndexingAdapter(np.asarray(self)) - return indexable.oindex[indexer] - - result = self.array[key] - - return self._handle_result(result) + def _oindex_get(self, indexer: OuterIndexer) -> PandasIndexingAdapter | np.ndarray: + return self._index_get(indexer, "_oindex_get") def _vindex_get( self, indexer: VectorizedIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): + ) -> PandasIndexingAdapter | np.ndarray: _assert_not_chunked_indexer(indexer.tuple) - key = self._prepare_key(indexer.tuple) - - if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional - indexable = NumpyIndexingAdapter(np.asarray(self)) - return indexable.vindex[indexer] - - result = self.array[key] - - return self._handle_result(result) + return self._index_get(indexer, "_vindex_get") def __getitem__( self, indexer: ExplicitIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): - key = self._prepare_key(indexer.tuple) - - if getattr(key, "ndim", 0) > 1: # Return np-array if multidimensional - indexable = NumpyIndexingAdapter(np.asarray(self)) - return indexable[indexer] - - result = self.array[key] - - return self._handle_result(result) + ) -> PandasIndexingAdapter | np.ndarray: + return self._index_get(indexer, "__getitem__") def transpose(self, order) -> pd.Index: return self.array # self.array should be always one-dimensional + def _repr_inline_(self, max_width: int) -> str: + # we want to display values in the inline repr for lazy coordinates too + # (pd.RangeIndex and pd.MultiIndex). `format_array_flat` prevents loading + # the whole array in memory. 
+ from xarray.core.formatting import format_array_flat + + return format_array_flat(self, max_width) + def __repr__(self) -> str: return f"{type(self).__name__}(array={self.array!r}, dtype={self.dtype!r})" @@ -1944,7 +1918,9 @@ def copy(self, deep: bool = True) -> Self: def nbytes(self) -> int: if pd.api.types.is_extension_array_dtype(self.dtype): return self.array.nbytes - return cast(np.dtype, self.dtype).itemsize * len(self.array) + + dtype = self._get_numpy_dtype() + return dtype.itemsize * len(self.array) class PandasMultiIndexingAdapter(PandasIndexingAdapter): @@ -1977,8 +1953,8 @@ def __array__( *, copy: bool | None = None, ) -> np.ndarray: - if dtype is None: - dtype = cast(np.dtype, self.dtype) + dtype = self._get_numpy_dtype(dtype) + if self.level is not None: return np.asarray( self.array.get_level_values(self.level).values, dtype=dtype @@ -1986,47 +1962,28 @@ def __array__( else: return super().__array__(dtype, copy=copy) - def _convert_scalar(self, item): + @property + def _in_memory(self) -> bool: + # The pd.MultiIndex's data is fully in memory, but it has a different + # layout than the level and dimension coordinate arrays. Marking this + # adapter class as a "lazy" array will prevent costly conversion when, + # e.g., formatting the Xarray reprs. + return False + + def _convert_scalar(self, item: Any): if isinstance(item, tuple) and self.level is not None: idx = tuple(self.array.names).index(self.level) item = item[idx] return super()._convert_scalar(item) - def _oindex_get( - self, indexer: OuterIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): - result = super()._oindex_get(indexer) - if isinstance(result, type(self)): - result.level = self.level - return result - - def _vindex_get( - self, indexer: VectorizedIndexer - ) -> ( - PandasIndexingAdapter - | NumpyIndexingAdapter - | np.ndarray - | np.datetime64 - | np.timedelta64 - ): - result = super()._vindex_get(indexer) + def _index_get( + self, indexer: ExplicitIndexer, func_name: str + ) -> PandasIndexingAdapter | np.ndarray: + result = super()._index_get(indexer, func_name) if isinstance(result, type(self)): result.level = self.level return result - def __getitem__(self, indexer: ExplicitIndexer): - result = super().__getitem__(indexer) - if isinstance(result, type(self)): - result.level = self.level - - return result - def __repr__(self) -> str: if self.level is None: return super().__repr__() @@ -2036,31 +1993,11 @@ def __repr__(self) -> str: ) return f"{type(self).__name__}{props}" - def _get_array_subset(self) -> np.ndarray: - # used to speed-up the repr for big multi-indexes - threshold = max(100, OPTIONS["display_values_threshold"] + 2) - if self.size > threshold: - pos = threshold // 2 - indices = np.concatenate([np.arange(0, pos), np.arange(-pos, 0)]) - subset = self[OuterIndexer((indices,))] - else: - subset = self - - return np.asarray(subset) - def _repr_inline_(self, max_width: int) -> str: - from xarray.core.formatting import format_array_flat - if self.level is None: return "MultiIndex" else: - return format_array_flat(self._get_array_subset(), max_width) - - def _repr_html_(self) -> str: - from xarray.core.formatting import short_array_repr - - array_repr = short_array_repr(self._get_array_subset()) - return f"
<pre>{escape(array_repr)}</pre>
" + return super()._repr_inline_(max_width=max_width) def copy(self, deep: bool = True) -> Self: # see PandasIndexingAdapter.copy @@ -2097,6 +2034,10 @@ def dtype(self) -> np.dtype: def shape(self) -> tuple[int, ...]: return tuple(self._transform.dim_size.values()) + @property + def _in_memory(self) -> bool: + return False + def get_duck_array(self) -> np.ndarray: all_coords = self._transform.generate_coords(dims=self._dims) return np.asarray(all_coords[self._coord_name]) @@ -2157,23 +2098,9 @@ def transpose(self, order: Iterable[int]) -> Self: def __repr__(self: Any) -> str: return f"{type(self).__name__}(transform={self._transform!r})" - def _get_array_subset(self) -> np.ndarray: - threshold = max(100, OPTIONS["display_values_threshold"] + 2) - if self.size > threshold: - pos = threshold // 2 - flat_indices = np.concatenate( - [np.arange(0, pos), np.arange(self.size - pos, self.size)] - ) - subset = self.vindex[ - VectorizedIndexer(np.unravel_index(flat_indices, self.shape)) - ] - else: - subset = self - - return np.asarray(subset) - def _repr_inline_(self, max_width: int) -> str: - """Good to see some labels even for a lazy coordinate.""" + # we want to display values in the inline repr for this lazy coordinate + # `format_array_flat` prevents loading the whole array in memory. from xarray.core.formatting import format_array_flat - return format_array_flat(self._get_array_subset(), max_width) + return format_array_flat(self, max_width) diff --git a/xarray/core/resample_cftime.py b/xarray/core/resample_cftime.py index 210cea2c76a..9b636f6fc81 100644 --- a/xarray/core/resample_cftime.py +++ b/xarray/core/resample_cftime.py @@ -84,22 +84,16 @@ def __init__( self.freq = to_offset(freq) self.origin = origin - if isinstance(self.freq, MonthEnd | QuarterEnd | YearEnd): - if closed is None: - self.closed = "right" - else: - self.closed = closed - if label is None: - self.label = "right" - else: - self.label = label - # The backward resample sets ``closed`` to ``'right'`` by default - # since the last value should be considered as the edge point for - # the last bin. When origin in "end" or "end_day", the value for a - # specific ``cftime.datetime`` index stands for the resample result - # from the current ``cftime.datetime`` minus ``freq`` to the current - # ``cftime.datetime`` with a right close. - elif self.origin in ["end", "end_day"]: + if isinstance(self.freq, MonthEnd | QuarterEnd | YearEnd) or self.origin in [ + "end", + "end_day", + ]: + # The backward resample sets ``closed`` to ``'right'`` by default + # since the last value should be considered as the edge point for + # the last bin. When origin in "end" or "end_day", the value for a + # specific ``cftime.datetime`` index stands for the resample result + # from the current ``cftime.datetime`` minus ``freq`` to the current + # ``cftime.datetime`` with a right close. 
if closed is None: self.closed = "right" else: diff --git a/xarray/core/treenode.py b/xarray/core/treenode.py index df58f7aed6f..a38ec6b2d2b 100644 --- a/xarray/core/treenode.py +++ b/xarray/core/treenode.py @@ -752,7 +752,7 @@ def relative_to(self: NamedNode, other: NamedNode) -> str: ) this_path = NodePath(self.path) - if other.path in list(parent.path for parent in (self, *self.parents)): + if any(other.path == parent.path for parent in (self, *self.parents)): return str(this_path.relative_to(other.path)) else: common_ancestor = self.find_common_ancestor(other) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 00d97e868c4..bcc2ca4e460 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -23,6 +23,7 @@ from xarray.core.extension_array import PandasExtensionArray from xarray.core.indexing import ( BasicIndexer, + CoordinateTransformIndexingAdapter, OuterIndexer, PandasIndexingAdapter, VectorizedIndexer, @@ -403,10 +404,15 @@ def _new( return cls_(dims_, data, attrs_) @property - def _in_memory(self): + def _in_memory(self) -> bool: + if isinstance( + self._data, PandasIndexingAdapter | CoordinateTransformIndexingAdapter + ): + return self._data._in_memory + return isinstance( self._data, - np.ndarray | np.number | PandasIndexingAdapter | PandasExtensionArray, + np.ndarray | np.number | PandasExtensionArray, ) or ( isinstance(self._data, indexing.MemoryCachedArray) and isinstance(self._data.array, indexing.NumpyIndexingAdapter) diff --git a/xarray/groupers.py b/xarray/groupers.py index 3a27d725116..4424c65a94b 100644 --- a/xarray/groupers.py +++ b/xarray/groupers.py @@ -617,10 +617,10 @@ def season_to_month_tuple(seasons: Sequence[str]) -> tuple[tuple[int, ...], ...] ((12, 1, 2, 3), (9, 10, 11, 12)) """ initials = "JFMAMJJASOND" - starts = dict( - ("".join(s), i + 1) + starts = { + "".join(s): i + 1 for s, i in zip(sliding_window(2, initials + "J"), range(12), strict=True) - ) + } result: list[tuple[int, ...]] = [] for i, season in enumerate(seasons): if len(season) == 1: @@ -701,25 +701,23 @@ def find_independent_seasons(seasons: Sequence[str]) -> Sequence[SeasonsGroup]: grouped = defaultdict(list) codes = defaultdict(list) seen: set[tuple[int, ...]] = set() - idx = 0 # This is quadratic, but the number of seasons is at most 12 for i, current in enumerate(season_inds): # Start with a group if current not in seen: - grouped[idx].append(current) - codes[idx].append(i) + grouped[i].append(current) + codes[i].append(i) seen.add(current) # Loop through remaining groups, and look for overlaps for j, second in enumerate(season_inds[i:]): - if not (set(chain(*grouped[idx])) & set(second)) and second not in seen: - grouped[idx].append(second) - codes[idx].append(j + i) + if not (set(chain(*grouped[i])) & set(second)) and second not in seen: + grouped[i].append(second) + codes[i].append(j + i) seen.add(second) if len(seen) == len(seasons): break - # found all non-overlapping groups for this row, increment and start over - idx += 1 + # found all non-overlapping groups for this row start over grouped_ints = tuple(tuple(idx) for idx in grouped.values() if idx) return [ diff --git a/xarray/indexes/__init__.py b/xarray/indexes/__init__.py index c53a4b8c2ce..20aa5a75929 100644 --- a/xarray/indexes/__init__.py +++ b/xarray/indexes/__init__.py @@ -10,13 +10,16 @@ PandasIndex, PandasMultiIndex, ) +from xarray.indexes.nd_point_index import NDPointIndex, TreeAdapter from xarray.indexes.range_index import RangeIndex __all__ = [ "CoordinateTransform", 
"CoordinateTransformIndex", "Index", + "NDPointIndex", "PandasIndex", "PandasMultiIndex", "RangeIndex", + "TreeAdapter", ] diff --git a/xarray/indexes/nd_point_index.py b/xarray/indexes/nd_point_index.py new file mode 100644 index 00000000000..95af1dd0952 --- /dev/null +++ b/xarray/indexes/nd_point_index.py @@ -0,0 +1,398 @@ +from __future__ import annotations + +import abc +from collections.abc import Hashable, Iterable, Mapping +from typing import TYPE_CHECKING, Any, Generic, TypeVar + +import numpy as np + +from xarray.core.dataarray import DataArray +from xarray.core.indexes import Index +from xarray.core.indexing import IndexSelResult +from xarray.core.utils import is_scalar +from xarray.core.variable import Variable +from xarray.structure.alignment import broadcast + +if TYPE_CHECKING: + from scipy.spatial import KDTree + + from xarray.core.types import Self + + +class TreeAdapter(abc.ABC): + """Lightweight adapter abstract class for plugging in 3rd-party structures + like :py:class:`scipy.spatial.KDTree` or :py:class:`sklearn.neighbors.KDTree` + into :py:class:`~xarray.indexes.NDPointIndex`. + + """ + + @abc.abstractmethod + def __init__(self, points: np.ndarray, *, options: Mapping[str, Any]): + """ + Parameters + ---------- + points : ndarray of shape (n_points, n_coordinates) + Two-dimensional array of points/samples (rows) and their + corresponding coordinate labels (columns) to index. + """ + ... + + @abc.abstractmethod + def query(self, points: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + """Query points. + + Parameters + ---------- + points: ndarray of shape (n_points, n_coordinates) + Two-dimensional array of points/samples (rows) and their + corresponding coordinate labels (columns) to query. + + Returns + ------- + distances : ndarray of shape (n_points) + Distances to the nearest neighbors. + indices : ndarray of shape (n_points) + Indices of the nearest neighbors in the array of the indexed + points. + """ + ... + + def equals(self, other: Self) -> bool: + """Check equality with another TreeAdapter of the same kind. + + Parameters + ---------- + other : + The other TreeAdapter object to compare with this object. + + """ + raise NotImplementedError + + +class ScipyKDTreeAdapter(TreeAdapter): + """:py:class:`scipy.spatial.KDTree` adapter for :py:class:`~xarray.indexes.NDPointIndex`.""" + + _kdtree: KDTree + + def __init__(self, points: np.ndarray, options: Mapping[str, Any]): + from scipy.spatial import KDTree + + self._kdtree = KDTree(points, **options) + + def query(self, points: np.ndarray) -> tuple[np.ndarray, np.ndarray]: + return self._kdtree.query(points) + + def equals(self, other: Self) -> bool: + return np.array_equal(self._kdtree.data, other._kdtree.data) + + +def get_points(coords: Iterable[Variable | Any]) -> np.ndarray: + """Re-arrange data from a sequence of xarray coordinate variables or + labels into a 2-d array of shape (n_points, n_coordinates). + + """ + data = [c.values if isinstance(c, Variable | DataArray) else c for c in coords] + return np.stack([np.ravel(d) for d in data]).T + + +T_TreeAdapter = TypeVar("T_TreeAdapter", bound=TreeAdapter) + + +class NDPointIndex(Index, Generic[T_TreeAdapter]): + """Xarray index for irregular, n-dimensional data. + + This index may be associated with a set of coordinate variables representing + the arbitrary location of data points in an n-dimensional space. All + coordinates must have the same shape and dimensions. 
The number of
+    associated coordinate variables must correspond to the number of dimensions
+    of the space.
+
+    This index supports label-based selection (nearest neighbor lookup). It also
+    has limited support for alignment.
+
+    By default, this index relies on :py:class:`scipy.spatial.KDTree` for fast
+    lookup.
+
+    Do not use :py:meth:`~xarray.indexes.NDPointIndex.__init__` directly. Instead
+    use :py:meth:`xarray.Dataset.set_xindex` or
+    :py:meth:`xarray.DataArray.set_xindex` to create and set the index from
+    existing coordinates (see the example below).
+
+    Examples
+    --------
+    An example using a dataset with 2-dimensional coordinates.
+
+    >>> xx = [[1.0, 2.0], [3.0, 0.0]]
+    >>> yy = [[11.0, 21.0], [29.0, 9.0]]
+    >>> ds = xr.Dataset(coords={"xx": (("y", "x"), xx), "yy": (("y", "x"), yy)})
+    >>> ds
+    <xarray.Dataset> Size: 64B
+    Dimensions:  (y: 2, x: 2)
+    Coordinates:
+        xx       (y, x) float64 32B 1.0 2.0 3.0 0.0
+        yy       (y, x) float64 32B 11.0 21.0 29.0 9.0
+    Dimensions without coordinates: y, x
+    Data variables:
+        *empty*
+
+    Creation of a NDPointIndex from the "xx" and "yy" coordinate variables:
+
+    >>> ds = ds.set_xindex(("xx", "yy"), xr.indexes.NDPointIndex)
+    >>> ds
+    <xarray.Dataset> Size: 64B
+    Dimensions:  (y: 2, x: 2)
+    Coordinates:
+      * xx       (y, x) float64 32B 1.0 2.0 3.0 0.0
+      * yy       (y, x) float64 32B 11.0 21.0 29.0 9.0
+    Dimensions without coordinates: y, x
+    Data variables:
+        *empty*
+    Indexes:
+    ┌ xx       NDPointIndex (ScipyKDTreeAdapter)
+    └ yy
+
+    Point-wise (nearest-neighbor) data selection using Xarray's advanced
+    indexing, i.e., using arbitrary dimension(s) for the Variable objects passed
+    as labels:
+
+    >>> ds.sel(
+    ...     xx=xr.Variable("points", [1.9, 0.1]),
+    ...     yy=xr.Variable("points", [13.0, 8.0]),
+    ...     method="nearest",
+    ... )
+    <xarray.Dataset> Size: 32B
+    Dimensions:  (points: 2)
+    Coordinates:
+        xx       (points) float64 16B 1.0 0.0
+        yy       (points) float64 16B 11.0 9.0
+    Dimensions without coordinates: points
+    Data variables:
+        *empty*
+
+    Data selection with scalar labels:
+
+    >>> ds.sel(xx=1.9, yy=13.0, method="nearest")
+    <xarray.Dataset> Size: 16B
+    Dimensions:  ()
+    Coordinates:
+        xx       float64 8B 1.0
+        yy       float64 8B 11.0
+    Data variables:
+        *empty*
+
+    Data selection with broadcasting the input labels:
+
+    >>> ds.sel(xx=1.9, yy=xr.Variable("points", [13.0, 8.0]), method="nearest")
+    <xarray.Dataset> Size: 32B
+    Dimensions:  (points: 2)
+    Coordinates:
+        xx       (points) float64 16B 1.0 0.0
+        yy       (points) float64 16B 11.0 9.0
+    Dimensions without coordinates: points
+    Data variables:
+        *empty*
+
+    >>> da = xr.DataArray(
+    ...     [[45.1, 53.3], [65.4, 78.2]],
+    ...     coords={"u": [1.9, 0.1], "v": [13.0, 8.0]},
+    ...     dims=("u", "v"),
+    ... )
+    >>> ds.sel(xx=da.u, yy=da.v, method="nearest")
+    <xarray.Dataset> Size: 64B
+    Dimensions:  (u: 2, v: 2)
+    Coordinates:
+        xx       (u, v) float64 32B 1.0 0.0 1.0 0.0
+        yy       (u, v) float64 32B 11.0 9.0 11.0 9.0
+    Dimensions without coordinates: u, v
+    Data variables:
+        *empty*
+
+    Data selection with array-like labels (implicit dimensions):
+
+    >>> ds.sel(xx=[[1.9], [0.1]], yy=[[13.0], [8.0]], method="nearest")
+    <xarray.Dataset> Size: 32B
+    Dimensions:  (y: 2, x: 1)
+    Coordinates:
+        xx       (y, x) float64 16B 1.0 0.0
+        yy       (y, x) float64 16B 11.0 9.0
+    Dimensions without coordinates: y, x
+    Data variables:
+        *empty*
+
+    """
+
+    _tree_obj: T_TreeAdapter
+    _coord_names: tuple[Hashable, ...]
+    _dims: tuple[Hashable, ...]
+    _shape: tuple[int, ...]
+ + def __init__( + self, + tree_obj: T_TreeAdapter, + *, + coord_names: tuple[Hashable, ...], + dims: tuple[Hashable, ...], + shape: tuple[int, ...], + ): + # this constructor is "private" + assert isinstance(tree_obj, TreeAdapter) + self._tree_obj = tree_obj + + assert len(coord_names) == len(dims) == len(shape) + self._coord_names = coord_names + self._dims = dims + self._shape = shape + + @classmethod + def from_variables( + cls, + variables: Mapping[Any, Variable], + *, + options: Mapping[str, Any], + ) -> Self: + if len({var.dims for var in variables.values()}) > 1: + var_names = ",".join(vn for vn in variables) + raise ValueError( + f"variables {var_names} must all have the same dimensions and the same shape" + ) + + var0 = next(iter(variables.values())) + + if len(variables) != len(var0.dims): + raise ValueError( + f"the number of variables {len(variables)} doesn't match " + f"the number of dimensions {len(var0.dims)}" + ) + + opts = dict(options) + + tree_adapter_cls: type[T_TreeAdapter] = opts.pop("tree_adapter_cls", None) + if tree_adapter_cls is None: + tree_adapter_cls = ScipyKDTreeAdapter + + points = get_points(variables.values()) + + return cls( + tree_adapter_cls(points, options=opts), + coord_names=tuple(variables), + dims=var0.dims, + shape=var0.shape, + ) + + def create_variables( + self, variables: Mapping[Any, Variable] | None = None + ) -> dict[Any, Variable]: + if variables is not None: + for var in variables.values(): + # maybe re-sync variable dimensions with the index object + # returned by NDPointIndex.rename() + if var.dims != self._dims: + var.dims = self._dims + return dict(**variables) + else: + return {} + + def equals( + self, other: Index, *, exclude: frozenset[Hashable] | None = None + ) -> bool: + if not isinstance(other, NDPointIndex): + return False + if type(self._tree_obj) is not type(other._tree_obj): + return False + return self._tree_obj.equals(other._tree_obj) + + def _get_dim_indexers( + self, + indices: np.ndarray, + label_dims: tuple[Hashable, ...], + label_shape: tuple[int, ...], + ) -> dict[Hashable, Variable]: + """Returns dimension indexers based on the query results (indices) and + the original label dimensions and shape. + + 1. Unravel the flat indices returned from the query + 2. Reshape the unraveled indices according to indexers shapes + 3. Wrap the indices in xarray.Variable objects. + + """ + dim_indexers = {} + + u_indices = list(np.unravel_index(indices.ravel(), self._shape)) + + for dim, ind in zip(self._dims, u_indices, strict=False): + dim_indexers[dim] = Variable(label_dims, ind.reshape(label_shape)) + + return dim_indexers + + def sel( + self, labels: dict[Any, Any], method=None, tolerance=None + ) -> IndexSelResult: + if method != "nearest": + raise ValueError( + "NDPointIndex only supports selection with method='nearest'" + ) + + missing_labels = set(self._coord_names) - set(labels) + if missing_labels: + missing_labels_str = ",".join([f"{name}" for name in missing_labels]) + raise ValueError(f"missing labels for coordinate(s): {missing_labels_str}.") + + # maybe convert labels into xarray DataArray objects + xr_labels: dict[Any, DataArray] = {} + + for name, lbl in labels.items(): + if isinstance(lbl, DataArray): + xr_labels[name] = lbl + elif isinstance(lbl, Variable): + xr_labels[name] = DataArray(lbl) + elif is_scalar(lbl): + xr_labels[name] = DataArray(lbl, dims=()) + elif np.asarray(lbl).ndim == len(self._dims): + xr_labels[name] = DataArray(lbl, dims=self._dims) + else: + raise ValueError( + "invalid label value. 
NDPointIndex only supports advanced (point-wise) indexing " + "with the following label value kinds:\n" + "- xarray.DataArray or xarray.Variable objects\n" + "- scalar values\n" + "- unlabelled array-like objects with the same number of dimensions " + f"than the {self._coord_names} coordinate variables ({len(self._dims)})" + ) + + # broadcast xarray labels against one another and determine labels shape and dimensions + broadcasted = broadcast(*xr_labels.values()) + label_dims = broadcasted[0].dims + label_shape = broadcasted[0].shape + xr_labels = dict(zip(xr_labels, broadcasted, strict=True)) + + # get and return dimension indexers + points = get_points(xr_labels[name] for name in self._coord_names) + _, indices = self._tree_obj.query(points) + + dim_indexers = self._get_dim_indexers(indices, label_dims, label_shape) + + return IndexSelResult(dim_indexers=dim_indexers) + + def rename( + self, + name_dict: Mapping[Any, Hashable], + dims_dict: Mapping[Any, Hashable], + ) -> Self: + if not set(self._coord_names) & set(name_dict) and not set(self._dims) & set( + dims_dict + ): + return self + + new_coord_names = tuple(name_dict.get(n, n) for n in self._coord_names) + new_dims = tuple(dims_dict.get(d, d) for d in self._dims) + + return type(self)( + self._tree_obj, + coord_names=new_coord_names, + dims=new_dims, + shape=self._shape, + ) + + def _repr_inline_(self, max_width: int) -> str: + tree_obj_type = self._tree_obj.__class__.__name__ + return f"{self.__class__.__name__} ({tree_obj_type})" diff --git a/xarray/plot/facetgrid.py b/xarray/plot/facetgrid.py index e64348c7281..c5303e96274 100644 --- a/xarray/plot/facetgrid.py +++ b/xarray/plot/facetgrid.py @@ -549,7 +549,7 @@ def map_plot1d( ) if add_legend: - use_legend_elements = not func.__name__ == "hist" + use_legend_elements = func.__name__ != "hist" if use_legend_elements: self.add_legend( use_legend_elements=use_legend_elements, diff --git a/xarray/plot/utils.py b/xarray/plot/utils.py index fc4ca1532e5..a71613562a5 100644 --- a/xarray/plot/utils.py +++ b/xarray/plot/utils.py @@ -419,9 +419,10 @@ def _infer_xy_labels( _assert_valid_xy(darray, x, "x") _assert_valid_xy(darray, y, "y") - if darray._indexes.get(x, 1) is darray._indexes.get(y, 2): - if isinstance(darray._indexes[x], PandasMultiIndex): - raise ValueError("x and y cannot be levels of the same MultiIndex") + if darray._indexes.get(x, 1) is darray._indexes.get(y, 2) and isinstance( + darray._indexes[x], PandasMultiIndex + ): + raise ValueError("x and y cannot be levels of the same MultiIndex") return x, y @@ -1142,7 +1143,7 @@ def _get_color_and_size(value): # Labels are not numerical so modifying label_values is not # possible, instead filter the array with nicely distributed # indexes: - if type(num) == int: # noqa: E721 + if type(num) is int: loc = mpl.ticker.LinearLocator(num) else: raise ValueError("`num` only supports integers for non-numeric labels.") diff --git a/xarray/structure/merge.py b/xarray/structure/merge.py index 403186272b9..5c998075151 100644 --- a/xarray/structure/merge.py +++ b/xarray/structure/merge.py @@ -300,7 +300,7 @@ def merge_collected( variables = [variable for variable, _ in elements_list] try: merged_vars[name] = unique_variable( - name, variables, compat, equals.get(name, None) + name, variables, compat, equals.get(name) ) except MergeError: if compat != "minimal": diff --git a/xarray/testing/strategies.py b/xarray/testing/strategies.py index 84f37e5568a..13973b9f550 100644 --- a/xarray/testing/strategies.py +++ b/xarray/testing/strategies.py @@ 
-479,6 +479,7 @@ def unique_subset_of( class CFTimeStrategy(st.SearchStrategy): def __init__(self, min_value, max_value): + super().__init__() self.min_value = min_value self.max_value = max_value @@ -495,6 +496,7 @@ class CFTimeStrategyISO8601(st.SearchStrategy): def __init__(self): from xarray.tests.test_coding_times import _all_cftime_date_types + super().__init__() self.date_types = _all_cftime_date_types() self.calendars = list(self.date_types) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index fe76df75fa0..4de9e422761 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -130,6 +130,20 @@ def _importorskip( has_rasterio, requires_rasterio = _importorskip("rasterio") has_zarr, requires_zarr = _importorskip("zarr") has_zarr_v3, requires_zarr_v3 = _importorskip("zarr", "3.0.0") +has_zarr_v3_dtypes, requires_zarr_v3_dtypes = _importorskip("zarr", "3.1.0") +if has_zarr_v3: + import zarr + + # manual update by checking attrs for now + # TODO: use version specifier + # installing from git main is giving me a lower version than the + # most recently released zarr + has_zarr_v3_dtypes = hasattr(zarr.core, "dtype") + + requires_zarr_v3_dtypes = pytest.mark.skipif( + not has_zarr_v3_dtypes, reason="requires zarr>3.1.0" + ) + has_fsspec, requires_fsspec = _importorskip("fsspec") has_iris, requires_iris = _importorskip("iris") has_numbagg, requires_numbagg = _importorskip("numbagg") diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 785b06a26fd..6997be200b1 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -55,6 +55,7 @@ from xarray.coding.variables import SerializationWarning from xarray.conventions import encode_dataset_coordinates from xarray.core import indexing +from xarray.core.indexes import PandasIndex from xarray.core.options import set_options from xarray.core.types import PDDatetimeUnitOptions from xarray.core.utils import module_available @@ -72,6 +73,7 @@ has_scipy, has_zarr, has_zarr_v3, + has_zarr_v3_dtypes, mock, network, requires_cftime, @@ -2066,6 +2068,26 @@ def test_encoding_enum__error_multiple_variable_with_changing_enum(self): with self.roundtrip(original): pass + @pytest.mark.parametrize("create_default_indexes", [True, False]) + def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None: + store_path = tmp_path / "tmp.nc" + original_ds = xr.Dataset( + {"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]} + ) + original_ds.to_netcdf(store_path, engine=self.engine, mode="w") + + with open_dataset( + store_path, + engine=self.engine, + create_default_indexes=create_default_indexes, + ) as loaded_ds: + if create_default_indexes: + assert list(loaded_ds.xindexes) == ["x"] and isinstance( + loaded_ds.xindexes["x"], PandasIndex + ) + else: + assert len(loaded_ds.xindexes) == 0 + @requires_netCDF4 class TestNetCDF4Data(NetCDF4Base): @@ -2416,7 +2438,7 @@ def test_read_non_consolidated_warning(self) -> None: def test_non_existent_store(self) -> None: with pytest.raises( FileNotFoundError, - match="(No such file or directory|Unable to find group|No group found)", + match="(No such file or directory|Unable to find group|No group found in store)", ): xr.open_zarr(f"{uuid.uuid4()}") @@ -2498,6 +2520,7 @@ def test_manual_chunk(self) -> None: assert_identical(actual.load(), auto.load()) @requires_dask + @pytest.mark.filterwarnings("ignore:.*does not have a Zarr V3 specification.*") def test_warning_on_bad_chunks(self) -> None: original = 
create_test_data().chunk({"dim1": 4, "dim2": 3, "dim3": 3}) @@ -2906,7 +2929,9 @@ def test_append_with_existing_encoding_raises(self) -> None: @pytest.mark.parametrize("dtype", ["U", "S"]) def test_append_string_length_mismatch_raises(self, dtype) -> None: - skip_if_zarr_format_3("This actually works fine with Zarr format 3") + if has_zarr_v3 and not has_zarr_v3_dtypes: + skip_if_zarr_format_3("This actually works fine with Zarr format 3") + ds, ds_to_append = create_append_string_length_mismatch_test_data(dtype) with self.create_zarr_target() as store_target: ds.to_zarr(store_target, mode="w", **self.version_kwargs) @@ -2919,8 +2944,12 @@ def test_append_string_length_mismatch_raises(self, dtype) -> None: def test_append_string_length_mismatch_works(self, dtype) -> None: skip_if_zarr_format_2("This doesn't work with Zarr format 2") # ...but it probably would if we used object dtype + if has_zarr_v3_dtypes: + pytest.skip("This works on pre ZDtype Zarr-Python, but fails after.") + ds, ds_to_append = create_append_string_length_mismatch_test_data(dtype) expected = xr.concat([ds, ds_to_append], dim="time") + with self.create_zarr_target() as store_target: ds.to_zarr(store_target, mode="w", **self.version_kwargs) ds_to_append.to_zarr(store_target, append_dim="time", **self.version_kwargs) @@ -3618,10 +3647,6 @@ def test_append(self) -> None: ) @requires_dask - @pytest.mark.skipif( - sys.version_info < (3, 11), - reason="zarr too old", - ) def test_region_write(self) -> None: ds = Dataset({"foo": ("x", [1, 2, 3])}, coords={"x": [1, 2, 3]}).chunk() with self.create_zarr_target() as store: @@ -4063,6 +4088,26 @@ def test_pickle(self) -> None: def test_pickle_dataarray(self) -> None: pass + @pytest.mark.parametrize("create_default_indexes", [True, False]) + def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None: + store_path = tmp_path / "tmp.nc" + original_ds = xr.Dataset( + {"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]} + ) + original_ds.to_netcdf(store_path, engine=self.engine, mode="w") + + with open_dataset( + store_path, + engine=self.engine, + create_default_indexes=create_default_indexes, + ) as loaded_ds: + if create_default_indexes: + assert list(loaded_ds.xindexes) == ["x"] and isinstance( + loaded_ds.xindexes["x"], PandasIndex + ) + else: + assert len(loaded_ds.xindexes) == 0 + @requires_scipy class TestScipyFilePath(CFEncodedBase, NetCDF3Only): @@ -6434,6 +6479,26 @@ def test_zarr_closing_internal_zip_store(): assert_identical(original_da, loaded_da) +@requires_zarr +@pytest.mark.parametrize("create_default_indexes", [True, False]) +def test_zarr_create_default_indexes(tmp_path, create_default_indexes) -> None: + from xarray.core.indexes import PandasIndex + + store_path = tmp_path / "tmp.zarr" + original_ds = xr.Dataset({"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]}) + original_ds.to_zarr(store_path, mode="w") + + with open_dataset( + store_path, engine="zarr", create_default_indexes=create_default_indexes + ) as loaded_ds: + if create_default_indexes: + assert list(loaded_ds.xindexes) == ["x"] and isinstance( + loaded_ds.xindexes["x"], PandasIndex + ) + else: + assert len(loaded_ds.xindexes) == 0 + + @requires_zarr @pytest.mark.usefixtures("default_zarr_format") def test_raises_key_error_on_invalid_zarr_store(tmp_path): diff --git a/xarray/tests/test_backends_api.py b/xarray/tests/test_backends_api.py index 9342423b727..778e800ec67 100644 --- a/xarray/tests/test_backends_api.py +++ b/xarray/tests/test_backends_api.py @@ -201,3 +201,39 @@ 
def test_join_chunks(self, shape, pref_chunks, req_chunks): chunks=dict(zip(initial[self.var_name].dims, req_chunks, strict=True)), ) self.check_dataset(initial, final, explicit_chunks(req_chunks, shape)) + + @pytest.mark.parametrize("create_default_indexes", [True, False]) + def test_default_indexes(self, create_default_indexes): + """Create default indexes if the backend does not create them.""" + coords = xr.Coordinates({"x": ("x", [0, 1]), "y": list("abc")}, indexes={}) + initial = xr.Dataset({"a": ("x", [1, 2])}, coords=coords) + + with assert_no_warnings(): + final = xr.open_dataset( + initial, + engine=PassThroughBackendEntrypoint, + create_default_indexes=create_default_indexes, + ) + + if create_default_indexes: + assert all(name in final.xindexes for name in ["x", "y"]) + else: + assert len(final.xindexes) == 0 + + @pytest.mark.parametrize("create_default_indexes", [True, False]) + def test_default_indexes_passthrough(self, create_default_indexes): + """Allow creating indexes in the backend.""" + + initial = xr.Dataset( + {"a": (["x", "y"], [[1, 2, 3], [4, 5, 6]])}, + coords={"x": ("x", [0, 1]), "y": ("y", list("abc"))}, + ).stack(z=["x", "y"]) + + with assert_no_warnings(): + final = xr.open_dataset( + initial, + engine=PassThroughBackendEntrypoint, + create_default_indexes=create_default_indexes, + ) + + assert initial.coords.equals(final.coords) diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index 518758a0efb..9e61fe6b6cb 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -495,7 +495,7 @@ def test_inherited_coords(self, url=simplegroup_datatree_url) -> None: | Salinity (time, Z, Y, X) float32 ... """ tree = open_datatree(url, engine=self.engine) - assert list(tree.dims) == ["time", "Z", "nv"] + assert set(tree.dims) == set(["time", "Z", "nv"]) assert tree["/SimpleGroup"].coords["time"].dims == ("time",) assert tree["/SimpleGroup"].coords["Z"].dims == ("Z",) assert tree["/SimpleGroup"].coords["Y"].dims == ("Y",) diff --git a/xarray/tests/test_calendar_ops.py b/xarray/tests/test_calendar_ops.py index 8dc1c2a503b..4ec45e4113b 100644 --- a/xarray/tests/test_calendar_ops.py +++ b/xarray/tests/test_calendar_ops.py @@ -239,6 +239,18 @@ def test_convert_calendar_errors(): convert_calendar(da, "standard", dim="x") +def test_convert_calendar_dimension_name(): + src = DataArray( + date_range("2004-01-01", "2004-01-31", freq="D", calendar="noleap"), + dims=("date",), + name="date", + ) + + out = convert_calendar(src, "proleptic_gregorian", dim="date") + + np.testing.assert_array_equal(src, out) + + def test_convert_calendar_same_calendar(): src = DataArray( date_range("2000-01-01", periods=12, freq="6h", use_cftime=False), diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index acb32504948..24ef2c8397e 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -94,8 +94,8 @@ def test_coder_roundtrip() -> None: assert_identical(original, roundtripped) -@pytest.mark.parametrize("dtype", "u1 u2 i1 i2 f2 f4".split()) -@pytest.mark.parametrize("dtype2", "f4 f8".split()) +@pytest.mark.parametrize("dtype", ["u1", "u2", "i1", "i2", "f2", "f4"]) +@pytest.mark.parametrize("dtype2", ["f4", "f8"]) def test_scaling_converts_to_float(dtype: str, dtype2: str) -> None: dt = np.dtype(dtype2) original = xr.Variable( diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index af29716fec0..322ff96b332 100644 --- a/xarray/tests/test_coding_times.py 
+++ b/xarray/tests/test_coding_times.py @@ -1867,7 +1867,10 @@ def test_decode_timedelta_via_units( var = Variable(["time"], timedeltas, encoding=attrs) encoded = Variable(["time"], np.array([0, 1, 2]), attrs=attrs) if warns: - with pytest.warns(FutureWarning, match="decode_timedelta"): + with pytest.warns( + FutureWarning, + match="xarray will not decode the variable 'foo' into a timedelta64 dtype", + ): decoded = conventions.decode_cf_variable( "foo", encoded, @@ -1886,45 +1889,56 @@ def test_decode_timedelta_via_units( _DECODE_TIMEDELTA_VIA_DTYPE_TESTS = { - "default": (True, None, np.dtype("timedelta64[ns]")), - "decode_timedelta=False": (True, False, np.dtype("int64")), - "decode_timedelta=True": (True, True, np.dtype("timedelta64[ns]")), + "default": (True, None, "ns", np.dtype("timedelta64[ns]")), + "decode_timedelta=False": (True, False, "ns", np.dtype("int64")), + "decode_timedelta=True": (True, True, "ns", np.dtype("timedelta64[ns]")), + "use-original-units": (True, True, "s", np.dtype("timedelta64[s]")), "inherit-time_unit-from-decode_times": ( CFDatetimeCoder(time_unit="s"), None, + "ns", np.dtype("timedelta64[s]"), ), "set-time_unit-via-CFTimedeltaCoder-decode_times=True": ( True, CFTimedeltaCoder(time_unit="s"), + "ns", np.dtype("timedelta64[s]"), ), "set-time_unit-via-CFTimedeltaCoder-decode_times=False": ( False, CFTimedeltaCoder(time_unit="s"), + "ns", np.dtype("timedelta64[s]"), ), "override-time_unit-from-decode_times": ( CFDatetimeCoder(time_unit="ns"), CFTimedeltaCoder(time_unit="s"), + "ns", np.dtype("timedelta64[s]"), ), + "decode-different-units": ( + True, + CFTimedeltaCoder(time_unit="us"), + "s", + np.dtype("timedelta64[us]"), + ), } @pytest.mark.parametrize( - ("decode_times", "decode_timedelta", "expected_dtype"), + ("decode_times", "decode_timedelta", "original_unit", "expected_dtype"), list(_DECODE_TIMEDELTA_VIA_DTYPE_TESTS.values()), ids=list(_DECODE_TIMEDELTA_VIA_DTYPE_TESTS.keys()), ) def test_decode_timedelta_via_dtype( - decode_times, decode_timedelta, expected_dtype + decode_times, decode_timedelta, original_unit, expected_dtype ) -> None: - timedeltas = pd.timedelta_range(0, freq="D", periods=3) + timedeltas = pd.timedelta_range(0, freq="D", periods=3, unit=original_unit) # type: ignore[call-arg] encoding = {"units": "days"} var = Variable(["time"], timedeltas, encoding=encoding) encoded = conventions.encode_cf_variable(var) - assert encoded.attrs["dtype"] == "timedelta64[ns]" + assert encoded.attrs["dtype"] == f"timedelta64[{original_unit}]" assert encoded.attrs["units"] == encoding["units"] decoded = conventions.decode_cf_variable( "foo", encoded, decode_times=decode_times, decode_timedelta=decode_timedelta diff --git a/xarray/tests/test_coordinate_transform.py b/xarray/tests/test_coordinate_transform.py index 386ce426998..627063eb8cb 100644 --- a/xarray/tests/test_coordinate_transform.py +++ b/xarray/tests/test_coordinate_transform.py @@ -123,6 +123,17 @@ def test_coordinate_transform_variable_repr_inline() -> None: ) +def test_coordinate_transform_variable_repr() -> None: + var = create_coords(scale=2.0, shape=(2, 2))["x"].variable + + actual = repr(var) + expected = """ + Size: 32B +[4 values with dtype=float64] + """.strip() + assert actual == expected + + def test_coordinate_transform_variable_basic_outer_indexing() -> None: var = create_coords(scale=2.0, shape=(4, 4))["x"].variable diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index eefa3c2b4f8..9024f2ae677 100644 --- a/xarray/tests/test_dask.py +++ 
b/xarray/tests/test_dask.py
@@ -1129,7 +1129,8 @@ def test_unify_chunks(map_ds):
 def test_unify_chunks_shallow_copy(obj, transform):
     obj = transform(obj)
     unified = obj.unify_chunks()
-    assert_identical(obj, unified) and obj is not obj.unify_chunks()
+    assert_identical(obj, unified)
+    # assert obj is not unified
 
 
 @pytest.mark.parametrize("obj", [make_da()])
@@ -1636,7 +1637,7 @@ def test_normalize_token_with_backend(map_ds):
     with create_tmp_file(allow_cleanup_failure=ON_WINDOWS) as tmp_file:
         map_ds.to_netcdf(tmp_file)
         read = xr.open_dataset(tmp_file)
-        assert not dask.base.tokenize(map_ds) == dask.base.tokenize(read)
+        assert dask.base.tokenize(map_ds) != dask.base.tokenize(read)
         read.close()
diff --git a/xarray/tests/test_formatting.py b/xarray/tests/test_formatting.py
index 88c2c819405..c2ab1144e7b 100644
--- a/xarray/tests/test_formatting.py
+++ b/xarray/tests/test_formatting.py
@@ -1189,3 +1189,46 @@ def test_array_repr_dtypes():
 Dimensions without coordinates: x
         """.strip()
     assert actual == expected
+
+
+def test_repr_pandas_range_index() -> None:
+    # lazy data repr but values shown in inline repr
+    xidx = xr.indexes.PandasIndex(pd.RangeIndex(10), "x")
+    ds = xr.Dataset(coords=xr.Coordinates.from_xindex(xidx))
+    actual = repr(ds.x)
+    expected = """
+<xarray.DataArray 'x' (x: 10)> Size: 80B
+[10 values with dtype=int64]
+Coordinates:
+  * x        (x) int64 80B 0 1 2 3 4 5 6 7 8 9
+    """.strip()
+    assert actual == expected
+
+
+def test_repr_pandas_multi_index() -> None:
+    # lazy data repr but values shown in inline repr
+    midx = pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=["foo", "bar"])
+    coords = xr.Coordinates.from_pandas_multiindex(midx, "x")
+    ds = xr.Dataset(coords=coords)
+
+    actual = repr(ds.x)
+    expected = """
+<xarray.DataArray 'x' (x: 4)> Size: 32B
+[4 values with dtype=object]
+Coordinates:
+  * x        (x) object 32B MultiIndex
+  * foo      (x) object 32B 'a' 'a' 'b' 'b'
+  * bar      (x) int64 32B 1 2 1 2
+    """.strip()
+    assert actual == expected
+
+    actual = repr(ds.foo)
+    expected = """
+<xarray.DataArray 'foo' (x: 4)> Size: 32B
+[4 values with dtype=object]
+Coordinates:
+  * x        (x) object 32B MultiIndex
+  * foo      (x) object 32B 'a' 'a' 'b' 'b'
+  * bar      (x) int64 32B 1 2 1 2
+    """.strip()
+    assert actual == expected
diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py
index 54cc21b5d2c..731574baac9 100644
--- a/xarray/tests/test_groupby.py
+++ b/xarray/tests/test_groupby.py
@@ -3235,7 +3235,7 @@ def test_shuffle_simple() -> None:
     da = xr.DataArray(
         dims="x",
         data=dask.array.from_array([1, 2, 3, 4, 5, 6], chunks=2),
-        coords={"label": ("x", "a b c a b c".split(" "))},
+        coords={"label": ("x", ["a", "b", "c", "a", "b", "c"])},
     )
     actual = da.groupby(label=UniqueGrouper()).shuffle_to_chunks()
     expected = da.isel(x=[0, 3, 1, 4, 2, 5])
diff --git a/xarray/tests/test_indexes.py b/xarray/tests/test_indexes.py
index 2b7900d9c89..9f2eea48260 100644
--- a/xarray/tests/test_indexes.py
+++ b/xarray/tests/test_indexes.py
@@ -729,3 +729,54 @@ def test_restore_dtype_on_multiindexes(dtype: str) -> None:
     foo = xr.Dataset(coords={"bar": ("bar", np.array([0, 1], dtype=dtype))})
     foo = foo.stack(baz=("bar",))
     assert str(foo["bar"].values.dtype) == dtype
+
+
+class IndexWithExtraVariables(Index):
+    @classmethod
+    def from_variables(cls, variables, *, options=None):
+        return cls()
+
+    def create_variables(self, variables=None):
+        if variables is None:
+            # For Coordinates.from_xindex(), return all variables the index can create
+            return {
+                "time": Variable(dims=("time",), data=[1, 2, 3]),
+                "valid_time": Variable(
+                    dims=("time",),
+                    data=[2, 3, 4],  # time + 1
+                    attrs={"description": 
"time + 1"}, + ), + } + + result = dict(variables) + if "time" in variables: + result["valid_time"] = Variable( + dims=("time",), + data=variables["time"].data + 1, + attrs={"description": "time + 1"}, + ) + return result + + +def test_set_xindex_with_extra_variables() -> None: + """Test that set_xindex raises an error when custom index creates extra variables.""" + + ds = xr.Dataset(coords={"time": [1, 2, 3]}).reset_index("time") + + # Test that set_xindex raises error for extra variables + with pytest.raises(ValueError, match="extra variables 'valid_time'"): + ds.set_xindex("time", IndexWithExtraVariables) + + +def test_set_xindex_factory_method_pattern() -> None: + ds = xr.Dataset(coords={"time": [1, 2, 3]}).reset_index("time") + + # Test the recommended factory method pattern + coord_vars = {"time": ds._variables["time"]} + index = IndexWithExtraVariables.from_variables(coord_vars) + coords = xr.Coordinates.from_xindex(index) + result = ds.assign_coords(coords) + + assert "time" in result.variables + assert "valid_time" in result.variables + assert_array_equal(result.valid_time.data, result.time.data + 1) diff --git a/xarray/tests/test_nd_point_index.py b/xarray/tests/test_nd_point_index.py new file mode 100644 index 00000000000..eb497aa263f --- /dev/null +++ b/xarray/tests/test_nd_point_index.py @@ -0,0 +1,183 @@ +import numpy as np +import pytest + +import xarray as xr +from xarray.indexes import NDPointIndex +from xarray.tests import assert_identical + +pytest.importorskip("scipy") + + +def test_tree_index_init() -> None: + from xarray.indexes.nd_point_index import ScipyKDTreeAdapter + + xx, yy = np.meshgrid([1.0, 2.0], [3.0, 4.0]) + ds = xr.Dataset(coords={"xx": (("y", "x"), xx), "yy": (("y", "x"), yy)}) + + ds_indexed1 = ds.set_xindex(("xx", "yy"), NDPointIndex) + assert "xx" in ds_indexed1.xindexes + assert "yy" in ds_indexed1.xindexes + assert isinstance(ds_indexed1.xindexes["xx"], NDPointIndex) + assert ds_indexed1.xindexes["xx"] is ds_indexed1.xindexes["yy"] + + ds_indexed2 = ds.set_xindex( + ("xx", "yy"), NDPointIndex, tree_adapter_cls=ScipyKDTreeAdapter + ) + assert ds_indexed1.xindexes["xx"].equals(ds_indexed2.xindexes["yy"]) + + +def test_tree_index_init_errors() -> None: + xx, yy = np.meshgrid([1.0, 2.0], [3.0, 4.0]) + ds = xr.Dataset(coords={"xx": (("y", "x"), xx), "yy": (("y", "x"), yy)}) + + with pytest.raises(ValueError, match="number of variables"): + ds.set_xindex("xx", NDPointIndex) + + ds2 = ds.assign_coords(yy=(("u", "v"), [[3.0, 3.0], [4.0, 4.0]])) + + with pytest.raises(ValueError, match="same dimensions"): + ds2.set_xindex(("xx", "yy"), NDPointIndex) + + +def test_tree_index_sel() -> None: + xx, yy = np.meshgrid([1.0, 2.0], [3.0, 4.0]) + ds = xr.Dataset(coords={"xx": (("y", "x"), xx), "yy": (("y", "x"), yy)}).set_xindex( + ("xx", "yy"), NDPointIndex + ) + + # 1-dimensional labels + actual = ds.sel( + xx=xr.Variable("u", [1.1, 1.1, 1.1]), + yy=xr.Variable("u", [3.1, 3.1, 3.1]), + method="nearest", + ) + expected = xr.Dataset( + coords={"xx": ("u", [1.0, 1.0, 1.0]), "yy": ("u", [3.0, 3.0, 3.0])} + ) + assert_identical(actual, expected) + + # 2-dimensional labels + actual = ds.sel( + xx=xr.Variable(("u", "v"), [[1.1, 1.1, 1.1], [1.9, 1.9, 1.9]]), + yy=xr.Variable(("u", "v"), [[3.1, 3.1, 3.1], [3.9, 3.9, 3.9]]), + method="nearest", + ) + expected = xr.Dataset( + coords={ + "xx": (("u", "v"), [[1.0, 1.0, 1.0], [2.0, 2.0, 2.0]]), + "yy": (("u", "v"), [[3.0, 3.0, 3.0], [4.0, 4.0, 4.0]]), + }, + ) + assert_identical(actual, expected) + + # all scalar labels + actual = 
ds.sel(xx=1.1, yy=3.1, method="nearest") + expected = xr.Dataset(coords={"xx": 1.0, "yy": 3.0}) + assert_identical(actual, expected) + + # broadcast scalar to label shape and dimensions + actual = ds.sel(xx=1.1, yy=xr.Variable("u", [3.1, 3.1, 3.1]), method="nearest") + expected = ds.sel( + xx=xr.Variable("u", [1.1, 1.1, 1.1]), + yy=xr.Variable("u", [3.1, 3.1, 3.1]), + method="nearest", + ) + assert_identical(actual, expected) + + # broadcast orthogonal 1-dimensional labels + actual = ds.sel( + xx=xr.Variable("u", [1.1, 1.1]), + yy=xr.Variable("v", [3.1, 3.1]), + method="nearest", + ) + expected = xr.Dataset( + coords={ + "xx": (("u", "v"), [[1.0, 1.0], [1.0, 1.0]]), + "yy": (("u", "v"), [[3.0, 3.0], [3.0, 3.0]]), + }, + ) + assert_identical(actual, expected) + + # implicit dimension array-like labels + actual = ds.sel( + xx=[[1.1, 1.1, 1.1], [1.9, 1.9, 1.9]], + yy=[[3.1, 3.1, 3.1], [3.9, 3.9, 3.9]], + method="nearest", + ) + expected = ds.sel( + xx=xr.Variable(ds.xx.dims, [[1.1, 1.1, 1.1], [1.9, 1.9, 1.9]]), + yy=xr.Variable(ds.yy.dims, [[3.1, 3.1, 3.1], [3.9, 3.9, 3.9]]), + method="nearest", + ) + assert_identical(actual, expected) + + +def test_tree_index_sel_errors() -> None: + xx, yy = np.meshgrid([1.0, 2.0], [3.0, 4.0]) + ds = xr.Dataset(coords={"xx": (("y", "x"), xx), "yy": (("y", "x"), yy)}).set_xindex( + ("xx", "yy"), NDPointIndex + ) + + with pytest.raises(ValueError, match="method='nearest'"): + ds.sel(xx=1.1, yy=3.1) + + with pytest.raises(ValueError, match="missing labels"): + ds.sel(xx=1.1, method="nearest") + + with pytest.raises(ValueError, match="invalid label value"): + # invalid array-like dimensions + ds.sel(xx=[1.1, 1.9], yy=[3.1, 3.9], method="nearest") + + # error while trying to broadcast labels + with pytest.raises(xr.AlignmentError, match=".*conflicting dimension sizes"): + ds.sel( + xx=xr.Variable("u", [1.1, 1.1, 1.1]), + yy=xr.Variable("u", [3.1, 3.1]), + method="nearest", + ) + + +def test_tree_index_equals() -> None: + xx1, yy1 = np.meshgrid([1.0, 2.0], [3.0, 4.0]) + ds1 = xr.Dataset( + coords={"xx": (("y", "x"), xx1), "yy": (("y", "x"), yy1)} + ).set_xindex(("xx", "yy"), NDPointIndex) + + xx2, yy2 = np.meshgrid([1.0, 2.0], [3.0, 4.0]) + ds2 = xr.Dataset( + coords={"xx": (("y", "x"), xx2), "yy": (("y", "x"), yy2)} + ).set_xindex(("xx", "yy"), NDPointIndex) + + xx3, yy3 = np.meshgrid([10.0, 20.0], [30.0, 40.0]) + ds3 = xr.Dataset( + coords={"xx": (("y", "x"), xx3), "yy": (("y", "x"), yy3)} + ).set_xindex(("xx", "yy"), NDPointIndex) + + assert ds1.xindexes["xx"].equals(ds2.xindexes["xx"]) + assert not ds1.xindexes["xx"].equals(ds3.xindexes["xx"]) + + +def test_tree_index_rename() -> None: + xx, yy = np.meshgrid([1.0, 2.0], [3.0, 4.0]) + ds = xr.Dataset(coords={"xx": (("y", "x"), xx), "yy": (("y", "x"), yy)}).set_xindex( + ("xx", "yy"), NDPointIndex + ) + + ds_renamed = ds.rename_dims(y="u").rename_vars(yy="uu") + assert "uu" in ds_renamed.xindexes + assert isinstance(ds_renamed.xindexes["uu"], NDPointIndex) + assert ds_renamed.xindexes["xx"] is ds_renamed.xindexes["uu"] + + # test via sel() with implicit dimension array-like labels, which relies on + # NDPointIndex._coord_names and NDPointIndex._dims internal attrs + actual = ds_renamed.sel( + xx=[[1.1, 1.1, 1.1], [1.9, 1.9, 1.9]], + uu=[[3.1, 3.1, 3.1], [3.9, 3.9, 3.9]], + method="nearest", + ) + expected = ds_renamed.sel( + xx=xr.Variable(ds_renamed.xx.dims, [[1.1, 1.1, 1.1], [1.9, 1.9, 1.9]]), + uu=xr.Variable(ds_renamed.uu.dims, [[3.1, 3.1, 3.1], [3.9, 3.9, 3.9]]), + method="nearest", + ) + 
assert_identical(actual, expected)
diff --git a/xarray/tests/test_units.py b/xarray/tests/test_units.py
index d98d72d9876..7635dde3a69 100644
--- a/xarray/tests/test_units.py
+++ b/xarray/tests/test_units.py
@@ -234,9 +234,7 @@ def convert_units(obj, to):
     elif isinstance(obj, xr.DataArray):
         name = obj.name
 
-        new_units = (
-            to.get(name, None) or to.get("data", None) or to.get(None, None) or None
-        )
+        new_units = to.get(name) or to.get("data") or to.get(None) or None
         data = convert_units(obj.variable, {None: new_units})
 
         coords = {
@@ -5637,7 +5635,7 @@ def test_duck_array_ops(self):
         assert_units_equal(expected, actual)
 
         # Don't use isinstance b/c we don't want to allow subclasses through
-        assert type(expected.data) == type(actual.data)  # noqa: E721
+        assert type(expected.data) is type(actual.data)
 
 
 @requires_matplotlib
diff --git a/xarray/util/print_versions.py b/xarray/util/print_versions.py
index 8f3be73ee68..0be9713c3dc 100644
--- a/xarray/util/print_versions.py
+++ b/xarray/util/print_versions.py
@@ -20,7 +20,7 @@ def get_sys_info():
     if os.path.isdir(".git") and os.path.isdir("xarray"):
         try:
             pipe = subprocess.Popen(
-                'git log --format="%H" -n 1'.split(" "),
+                ("git", "log", '--format="%H"', "-n", "1"),
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
             )
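
A note on the extension point introduced above: `NDPointIndex` delegates all lookups to a `TreeAdapter`, so third-party tree structures can be plugged in via the `tree_adapter_cls` option of `set_xindex`. Below is a minimal sketch of such an adapter, assuming scikit-learn is installed; `SklearnKDTreeAdapter` is a hypothetical name and not part of this changeset:

```python
import numpy as np
import xarray as xr
from sklearn.neighbors import KDTree

from xarray.indexes import NDPointIndex, TreeAdapter


class SklearnKDTreeAdapter(TreeAdapter):
    """Hypothetical adapter backed by sklearn.neighbors.KDTree."""

    def __init__(self, points: np.ndarray, *, options):
        # points has shape (n_points, n_coordinates); any extra options are
        # forwarded to the sklearn KDTree constructor (e.g. leaf_size).
        self._points = np.asarray(points)
        self._kdtree = KDTree(self._points, **options)

    def query(self, points: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        # sklearn returns arrays of shape (n_points, k); flatten the k=1 axis
        # so the shapes match the TreeAdapter.query contract.
        distances, indices = self._kdtree.query(points, k=1)
        return distances.ravel(), indices.ravel()

    def equals(self, other) -> bool:
        return np.array_equal(self._points, other._points)


xx, yy = np.meshgrid([1.0, 2.0], [3.0, 4.0])
ds = xr.Dataset(coords={"xx": (("y", "x"), xx), "yy": (("y", "x"), yy)})
ds = ds.set_xindex(("xx", "yy"), NDPointIndex, tree_adapter_cls=SklearnKDTreeAdapter)
print(ds.sel(xx=1.1, yy=3.1, method="nearest"))
```

Because the adapter is chosen per index, different datasets can use different tree implementations side by side; `equals` only needs to be implemented when index comparison or alignment is required.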
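Similarly, the `create_default_indexes` argument to `open_dataset`, exercised by several of the new tests above, controls whether pandas indexes are built for one-dimensional coordinates at load time. A short usage sketch based on those tests; the `tmp.nc` filename is illustrative only:

```python
import numpy as np
import xarray as xr

ds = xr.Dataset({"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]})
ds.to_netcdf("tmp.nc")

# Default behaviour: a PandasIndex is created for the 1-D coordinate "x".
with xr.open_dataset("tmp.nc") as loaded:
    assert list(loaded.xindexes) == ["x"]

# Skipping default indexes avoids the cost of building pandas indexes up
# front; label-based selection along "x" then requires setting an index
# explicitly, e.g. with set_xindex.
with xr.open_dataset("tmp.nc", create_default_indexes=False) as loaded:
    assert len(loaded.xindexes) == 0
```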