Skip to content

Fix critical np.timedelta64 encoding bugs #10469

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jul 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,15 @@ Bug fixes
(:pull:`10352`). By `Spencer Clark <https://github.com/spencerkclark>`_.
- Avoid unsafe casts from float to unsigned int in CFMaskCoder (:issue:`9815`, :pull:`9964`).
By `Elliott Sales de Andrade <https://github.com/QuLogic>`_.
- Fix attribute overwriting bug when decoding encoded
:py:class:`numpy.timedelta64` values from disk with a dtype attribute
(:issue:`10468`, :pull:`10469`). By `Spencer Clark
<https://github.com/spencerkclark>`_.
- Fix default ``"_FillValue"`` dtype coercion bug when encoding
:py:class:`numpy.timedelta64` values to an on-disk format that only supports
32-bit integers (:issue:`10466`, :pull:`10469`). By `Spencer Clark
<https://github.com/spencerkclark>`_.


Performance
~~~~~~~~~~~
Expand Down
154 changes: 67 additions & 87 deletions xarray/coding/times.py
Original file line number Diff line number Diff line change
Expand Up @@ -1410,6 +1410,43 @@ def has_timedelta64_encoding_dtype(attrs_or_encoding: dict) -> bool:
return isinstance(dtype, str) and dtype.startswith("timedelta64")


def resolve_time_unit_from_attrs_dtype(
    attrs_dtype: str, name: T_Name
) -> PDDatetimeUnitOptions:
    """Map an on-disk ``dtype`` attribute (e.g. ``"timedelta64[D]"``) to a
    pandas-supported decoding resolution.

    Resolutions coarser than seconds clamp to ``"s"`` and finer than
    nanoseconds clamp to ``"ns"``; either clamp emits a user-level warning,
    since the data will not round trip with its original encoding dtype.
    """
    resolution, _ = np.datetime_data(np.dtype(attrs_dtype))
    resolution = cast(NPDatetimeUnitOptions, resolution)
    one_step = np.timedelta64(1, resolution)
    if one_step > np.timedelta64(1, "s"):
        # Coarser than seconds: clamp down; may overflow for large values.
        fallback, consequence = "s", "an OverflowError"
    elif one_step < np.timedelta64(1, "ns"):
        # Finer than nanoseconds: clamp up; may truncate sub-ns precision.
        fallback, consequence = "ns", "loss of precision"
    else:
        # Already one of 's', 'ms', 'us', 'ns' — usable as-is.
        return cast(PDDatetimeUnitOptions, resolution)
    emit_user_level_warning(
        f"Following pandas, xarray only supports decoding to timedelta64 "
        f"values with a resolution of 's', 'ms', 'us', or 'ns'. Encoded "
        f"values for variable {name!r} have a resolution of "
        f"{resolution!r}. Attempting to decode to a resolution of "
        f"{fallback!r}. Note, depending on the encoded values, this may "
        f"lead to {consequence}. Additionally, data will not be "
        f"identically round tripped; xarray will choose an encoding dtype "
        f"of 'timedelta64[{fallback}]' when re-encoding."
    )
    return cast(PDDatetimeUnitOptions, fallback)


class CFTimedeltaCoder(VariableCoder):
"""Coder for CF Timedelta coding.

Expand All @@ -1430,7 +1467,7 @@ class CFTimedeltaCoder(VariableCoder):

def __init__(
self,
time_unit: PDDatetimeUnitOptions = "ns",
time_unit: PDDatetimeUnitOptions | None = None,
decode_via_units: bool = True,
decode_via_dtype: bool = True,
) -> None:
Expand All @@ -1442,45 +1479,18 @@ def __init__(
def encode(self, variable: Variable, name: T_Name = None) -> Variable:
if np.issubdtype(variable.data.dtype, np.timedelta64):
dims, data, attrs, encoding = unpack_for_encoding(variable)
has_timedelta_dtype = has_timedelta64_encoding_dtype(encoding)
if ("units" in encoding or "dtype" in encoding) and not has_timedelta_dtype:
dtype = encoding.get("dtype", None)
units = encoding.pop("units", None)
dtype = encoding.get("dtype", None)
units = encoding.pop("units", None)

# in the case of packed data we need to encode into
# float first, the correct dtype will be established
# via CFScaleOffsetCoder/CFMaskCoder
if "add_offset" in encoding or "scale_factor" in encoding:
dtype = data.dtype if data.dtype.kind == "f" else "float64"
# in the case of packed data we need to encode into
# float first, the correct dtype will be established
# via CFScaleOffsetCoder/CFMaskCoder
if "add_offset" in encoding or "scale_factor" in encoding:
dtype = data.dtype if data.dtype.kind == "f" else "float64"

else:
resolution, _ = np.datetime_data(variable.dtype)
dtype = np.int64
attrs_dtype = f"timedelta64[{resolution}]"
units = _numpy_dtype_to_netcdf_timeunit(variable.dtype)
safe_setitem(attrs, "dtype", attrs_dtype, name=name)
# Remove dtype encoding if it exists to prevent it from
# interfering downstream in NonStringCoder.
encoding.pop("dtype", None)

if any(
k in encoding for k in _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS
):
raise ValueError(
f"Specifying 'add_offset' or 'scale_factor' is not "
f"supported when encoding the timedelta64 values of "
f"variable {name!r} with xarray's new default "
f"timedelta64 encoding approach. To encode {name!r} "
f"with xarray's previous timedelta64 encoding "
f"approach, which supports the 'add_offset' and "
f"'scale_factor' parameters, additionally set "
f"encoding['units'] to a unit of time, e.g. "
f"'seconds'. To proceed with encoding of {name!r} "
f"via xarray's new approach, remove any encoding "
f"entries for 'add_offset' or 'scale_factor'."
)
if "_FillValue" not in encoding and "missing_value" not in encoding:
encoding["_FillValue"] = np.iinfo(np.int64).min
resolution, _ = np.datetime_data(variable.dtype)
attrs_dtype = f"timedelta64[{resolution}]"
safe_setitem(attrs, "dtype", attrs_dtype, name=name)

data, units = encode_cf_timedelta(data, units, dtype)
safe_setitem(attrs, "units", units, name=name)
Expand All @@ -1499,54 +1509,13 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
):
dims, data, attrs, encoding = unpack_for_decoding(variable)
units = pop_to(attrs, encoding, "units")
if is_dtype_decodable and self.decode_via_dtype:
if any(
k in encoding for k in _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS
):
raise ValueError(
f"Decoding timedelta64 values via dtype is not "
f"supported when 'add_offset', or 'scale_factor' are "
f"present in encoding. Check the encoding parameters "
f"of variable {name!r}."
)
dtype = pop_to(attrs, encoding, "dtype", name=name)
dtype = np.dtype(dtype)
resolution, _ = np.datetime_data(dtype)
resolution = cast(NPDatetimeUnitOptions, resolution)
if np.timedelta64(1, resolution) > np.timedelta64(1, "s"):
time_unit = cast(PDDatetimeUnitOptions, "s")
dtype = np.dtype("timedelta64[s]")
message = (
f"Following pandas, xarray only supports decoding to "
f"timedelta64 values with a resolution of 's', 'ms', "
f"'us', or 'ns'. Encoded values for variable {name!r} "
f"have a resolution of {resolution!r}. Attempting to "
f"decode to a resolution of 's'. Note, depending on "
f"the encoded values, this may lead to an "
f"OverflowError. Additionally, data will not be "
f"identically round tripped; xarray will choose an "
f"encoding dtype of 'timedelta64[s]' when re-encoding."
)
emit_user_level_warning(message)
elif np.timedelta64(1, resolution) < np.timedelta64(1, "ns"):
time_unit = cast(PDDatetimeUnitOptions, "ns")
dtype = np.dtype("timedelta64[ns]")
message = (
f"Following pandas, xarray only supports decoding to "
f"timedelta64 values with a resolution of 's', 'ms', "
f"'us', or 'ns'. Encoded values for variable {name!r} "
f"have a resolution of {resolution!r}. Attempting to "
f"decode to a resolution of 'ns'. Note, depending on "
f"the encoded values, this may lead to loss of "
f"precision. Additionally, data will not be "
f"identically round tripped; xarray will choose an "
f"encoding dtype of 'timedelta64[ns]' "
f"when re-encoding."
)
emit_user_level_warning(message)
if is_dtype_decodable:
attrs_dtype = attrs.pop("dtype")
if self.time_unit is None:
time_unit = resolve_time_unit_from_attrs_dtype(attrs_dtype, name)
else:
time_unit = cast(PDDatetimeUnitOptions, resolution)
elif self.decode_via_units:
time_unit = self.time_unit
else:
if self._emit_decode_timedelta_future_warning:
emit_user_level_warning(
"In a future version, xarray will not decode "
Expand All @@ -1564,8 +1533,19 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
"'CFTimedeltaCoder' instance.",
FutureWarning,
)
dtype = np.dtype(f"timedelta64[{self.time_unit}]")
time_unit = self.time_unit
if self.time_unit is None:
time_unit = cast(PDDatetimeUnitOptions, "ns")
else:
time_unit = self.time_unit

# Handle edge case that decode_via_dtype=False and
# decode_via_units=True, and timedeltas were encoded with a
# dtype attribute. We need to remove the dtype attribute
# to prevent an error during round tripping.
if has_timedelta_dtype:
attrs.pop("dtype")

dtype = np.dtype(f"timedelta64[{time_unit}]")
transform = partial(decode_cf_timedelta, units=units, time_unit=time_unit)
data = lazy_elemwise_func(data, transform, dtype=dtype)
return Variable(dims, data, attrs, encoding, fastpath=True)
Expand Down
11 changes: 11 additions & 0 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
from xarray.conventions import encode_dataset_coordinates
from xarray.core import indexing
from xarray.core.options import set_options
from xarray.core.types import PDDatetimeUnitOptions
from xarray.core.utils import module_available
from xarray.namedarray.pycompat import array_type
from xarray.tests import (
Expand Down Expand Up @@ -642,6 +643,16 @@
) as actual:
assert_identical(expected, actual)

def test_roundtrip_timedelta_data_via_dtype(
    self, time_unit: PDDatetimeUnitOptions
) -> None:
    # Round trip timedelta64 values (including NaT, and a scalar
    # variable) encoded via the dtype attribute at each resolution.
    deltas = pd.to_timedelta(["1h", "2h", "NaT"]).as_unit(time_unit)  # type: ignore[arg-type, unused-ignore]
    original = Dataset({"td": ("td", deltas), "td0": deltas[0].to_numpy()})
    with self.roundtrip(original) as roundtripped:
        assert_identical(original, roundtripped)

def test_roundtrip_float64_data(self) -> None:
expected = Dataset({"x": ("y", np.array([1.0, 2.0, np.pi], dtype="float64"))})
with self.roundtrip(expected) as actual:
Expand Down Expand Up @@ -3546,7 +3557,7 @@
patches = self.make_patches(store)
with patch.multiple(KVStore, **patches):
original.to_zarr(store)
self.check_requests(expected, patches)

Check failure on line 3560 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.12 all-but-dask

TestInstrumentedZarrStore.test_append AssertionError: ('get', {'get': 8, 'list_dir': 2, 'list_prefix': 1, 'set': 16}) assert 8 <= 4

Check failure on line 3560 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.13

TestInstrumentedZarrStore.test_append AssertionError: ('get', {'get': 8, 'list_dir': 2, 'list_prefix': 1, 'set': 16}) assert 8 <= 4

Check failure on line 3560 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.13 all-but-numba

TestInstrumentedZarrStore.test_append AssertionError: ('get', {'get': 8, 'list_dir': 2, 'list_prefix': 1, 'set': 16}) assert 8 <= 4

Check failure on line 3560 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.11

TestInstrumentedZarrStore.test_append AssertionError: ('get', {'get': 8, 'list_dir': 2, 'list_prefix': 1, 'set': 16}) assert 8 <= 4

Check failure on line 3560 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.11

TestInstrumentedZarrStore.test_append AssertionError: ('get', {'get': 8, 'list_dir': 2, 'list_prefix': 1, 'set': 16}) assert 8 <= 4

Check failure on line 3560 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.13

TestInstrumentedZarrStore.test_append AssertionError: ('get', {'get': 8, 'list_dir': 2, 'list_prefix': 1, 'set': 16}) assert 8 <= 4

patches = self.make_patches(store)
# v2024.03.0: {'iter': 6, 'contains': 2, 'setitem': 5, 'getitem': 10, 'listdir': 6, 'list_prefix': 0}
Expand Down Expand Up @@ -3628,7 +3639,7 @@
patches = self.make_patches(store)
with patch.multiple(KVStore, **patches):
ds.to_zarr(store, mode="w", compute=False)
self.check_requests(expected, patches)

Check failure on line 3642 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.13

TestInstrumentedZarrStore.test_region_write AssertionError: ('get', {'get': 8, 'list_dir': 2, 'list_prefix': 4, 'set': 16}) assert 8 <= 2

Check failure on line 3642 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.13 all-but-numba

TestInstrumentedZarrStore.test_region_write AssertionError: ('get', {'get': 8, 'list_dir': 2, 'list_prefix': 4, 'set': 16}) assert 8 <= 2

Check failure on line 3642 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / macos-latest py3.11

TestInstrumentedZarrStore.test_region_write AssertionError: ('get', {'get': 8, 'list_dir': 2, 'list_prefix': 4, 'set': 16}) assert 8 <= 2

Check failure on line 3642 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.11

TestInstrumentedZarrStore.test_region_write AssertionError: ('get', {'get': 8, 'list_dir': 2, 'list_prefix': 4, 'set': 16}) assert 8 <= 2

Check failure on line 3642 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.13

TestInstrumentedZarrStore.test_region_write AssertionError: ('get', {'get': 8, 'list_dir': 2, 'list_prefix': 4, 'set': 16}) assert 8 <= 2

# v2024.03.0: {'iter': 5, 'contains': 2, 'setitem': 1, 'getitem': 6, 'listdir': 5, 'list_prefix': 0}
# 6057128b: {'iter': 4, 'contains': 2, 'setitem': 1, 'getitem': 5, 'listdir': 4, 'list_prefix': 0}
Expand Down Expand Up @@ -5445,7 +5456,7 @@

def test_cmp_local_file(self) -> None:
with self.create_datasets() as (actual, expected):
assert_equal(actual, expected)

Check failure on line 5459 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.13 all-but-numba

TestPydap.test_cmp_local_file AssertionError: Left and right Dataset objects are not equal Differing data variables: L bears (i, j) <U3 72B 'ind' 'ist' 'ing' 'uis' 'hab' 'le' R bears (i, j) |S4 24B b'ind' b'ist' b'ing' b'uis' b'hab' b'le'

# global attributes should be global attributes on the dataset
assert "NC_GLOBAL" not in actual.attrs
Expand Down Expand Up @@ -5492,7 +5503,7 @@
@requires_dask
def test_dask(self) -> None:
with self.create_datasets(chunks={"j": 2}) as (actual, expected):
assert_equal(actual, expected)

Check failure on line 5506 in xarray/tests/test_backends.py

View workflow job for this annotation

GitHub Actions / ubuntu-latest py3.13 all-but-numba

TestPydap.test_dask AssertionError: Left and right Dataset objects are not equal Differing data variables: L bears (i, j) <U3 72B 'ind' 'ist' 'ing' 'uis' 'hab' 'le' R bears (i, j) |S4 24B b'ind' b'ist' b'ing' b'uis' b'hab' b'le'


@network
Expand Down
Loading
Loading
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy