Skip to content

Commit 624625b

Browse files
committed
Unify timedelta64 coding logic between the old and new approaches
Always write a dtype attribute to disk regardless of how the timedeltas were decoded.
1 parent 4a581f4 commit 624625b

File tree

5 files changed

+159
-170
lines changed

5 files changed

+159
-170
lines changed

doc/whats-new.rst

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,10 +121,11 @@ Bug fixes
121121
(:pull:`10352`). By `Spencer Clark <https://github.com/spencerkclark>`_.
122122
- Avoid unsafe casts from float to unsigned int in CFMaskCoder (:issue:`9815`, :pull:`9964`).
123123
By ` Elliott Sales de Andrade <https://github.com/QuLogic>`_.
124-
- Fix attribute overwriting bug when decoding literally encoded
125-
:py:class:`numpy.timedelta64` values from disk (:issue:`10468`,
126-
:pull:`10469`). By `Spencer Clark <https://github.com/spencerkclark>`_.
127-
- Fix default ``"_FillValue"`` dtype coercion bug when literally encoding
124+
- Fix attribute overwriting bug when decoding encoded
125+
:py:class:`numpy.timedelta64` values from disk with a dtype attribute
126+
(:issue:`10468`, :pull:`10469`). By `Spencer Clark
127+
<https://github.com/spencerkclark>`_.
128+
- Fix default ``"_FillValue"`` dtype coercion bug when encoding
128129
:py:class:`numpy.timedelta64` values to an on-disk format that only supports
129130
32-bit integers (:issue:`10466`, :pull:`10469`). By `Spencer Clark
130131
<https://github.com/spencerkclark>`_.

xarray/backends/netcdf3.py

Lines changed: 3 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def encode_nc3_attrs(attrs):
103103
return {k: encode_nc3_attr_value(v) for k, v in attrs.items()}
104104

105105

106-
def _maybe_prepare_times(var, name=None):
106+
def _maybe_prepare_times(var):
107107
# checks for integer-based time-like and
108108
# replaces np.iinfo(np.int64).min with _FillValue or np.nan
109109
# this keeps backwards compatibility
@@ -112,21 +112,7 @@ def _maybe_prepare_times(var, name=None):
112112
if data.dtype.kind in "iu":
113113
units = var.attrs.get("units", None)
114114
if units is not None and coding.variables._is_time_like(units):
115-
default_int64_fill_value = np.iinfo(np.int64).min
116-
default_int32_fill_value = np.iinfo(np.int32).min
117-
mask = data == default_int64_fill_value
118-
119-
if var.attrs.get("_FillValue") == default_int64_fill_value:
120-
if (data == default_int32_fill_value).any():
121-
raise ValueError(
122-
f"Could not safely coerce default int64 _FillValue "
123-
f"({default_int64_fill_value}) to the analogous int32 "
124-
f"value ({default_int32_fill_value}), since it "
125-
f"already exists as non-missing within variable "
126-
f"{name!r}. Try explicitly setting "
127-
f"encoding['_FillValue'] to another int32 value."
128-
)
129-
var.attrs["_FillValue"] = default_int32_fill_value
115+
mask = data == np.iinfo(np.int64).min
130116
if mask.any():
131117
data = np.where(mask, var.attrs.get("_FillValue", np.nan), data)
132118
return data
@@ -138,7 +124,7 @@ def encode_nc3_variable(var, name=None):
138124
coding.strings.CharacterArrayCoder(),
139125
]:
140126
var = coder.encode(var, name=name)
141-
data = _maybe_prepare_times(var, name=name)
127+
data = _maybe_prepare_times(var)
142128
data = coerce_nc3_dtype(data)
143129
attrs = encode_nc3_attrs(var.attrs)
144130
return Variable(var.dims, data, attrs, var.encoding)

xarray/coding/times.py

Lines changed: 67 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -1410,6 +1410,43 @@ def has_timedelta64_encoding_dtype(attrs_or_encoding: dict) -> bool:
14101410
return isinstance(dtype, str) and dtype.startswith("timedelta64")
14111411

14121412

1413+
def resolve_time_unit_from_attrs_dtype(
1414+
attrs_dtype: str, name: T_Name
1415+
) -> PDDatetimeUnitOptions:
1416+
dtype = np.dtype(attrs_dtype)
1417+
resolution, _ = np.datetime_data(dtype)
1418+
resolution = cast(NPDatetimeUnitOptions, resolution)
1419+
if np.timedelta64(1, resolution) > np.timedelta64(1, "s"):
1420+
time_unit = cast(PDDatetimeUnitOptions, "s")
1421+
message = (
1422+
f"Following pandas, xarray only supports decoding to timedelta64 "
1423+
f"values with a resolution of 's', 'ms', 'us', or 'ns'. Encoded "
1424+
f"values for variable {name!r} have a resolution of "
1425+
f"{resolution!r}. Attempting to decode to a resolution of 's'. "
1426+
f"Note, depending on the encoded values, this may lead to an "
1427+
f"OverflowError. Additionally, data will not be identically round "
1428+
f"tripped; xarray will choose an encoding dtype of "
1429+
f"'timedelta64[s]' when re-encoding."
1430+
)
1431+
emit_user_level_warning(message)
1432+
elif np.timedelta64(1, resolution) < np.timedelta64(1, "ns"):
1433+
time_unit = cast(PDDatetimeUnitOptions, "ns")
1434+
message = (
1435+
f"Following pandas, xarray only supports decoding to timedelta64 "
1436+
f"values with a resolution of 's', 'ms', 'us', or 'ns'. Encoded "
1437+
f"values for variable {name!r} have a resolution of "
1438+
f"{resolution!r}. Attempting to decode to a resolution of 'ns'. "
1439+
f"Note, depending on the encoded values, this may lead to loss of "
1440+
f"precision. Additionally, data will not be identically round "
1441+
f"tripped; xarray will choose an encoding dtype of "
1442+
f"'timedelta64[ns]' when re-encoding."
1443+
)
1444+
emit_user_level_warning(message)
1445+
else:
1446+
time_unit = cast(PDDatetimeUnitOptions, resolution)
1447+
return time_unit
1448+
1449+
14131450
class CFTimedeltaCoder(VariableCoder):
14141451
"""Coder for CF Timedelta coding.
14151452
@@ -1430,7 +1467,7 @@ class CFTimedeltaCoder(VariableCoder):
14301467

14311468
def __init__(
14321469
self,
1433-
time_unit: PDDatetimeUnitOptions = "ns",
1470+
time_unit: PDDatetimeUnitOptions | None = None,
14341471
decode_via_units: bool = True,
14351472
decode_via_dtype: bool = True,
14361473
) -> None:
@@ -1442,45 +1479,18 @@ def __init__(
14421479
def encode(self, variable: Variable, name: T_Name = None) -> Variable:
14431480
if np.issubdtype(variable.data.dtype, np.timedelta64):
14441481
dims, data, attrs, encoding = unpack_for_encoding(variable)
1445-
has_timedelta_dtype = has_timedelta64_encoding_dtype(encoding)
1446-
if ("units" in encoding or "dtype" in encoding) and not has_timedelta_dtype:
1447-
dtype = encoding.get("dtype", None)
1448-
units = encoding.pop("units", None)
1482+
dtype = encoding.get("dtype", None)
1483+
units = encoding.pop("units", None)
14491484

1450-
# in the case of packed data we need to encode into
1451-
# float first, the correct dtype will be established
1452-
# via CFScaleOffsetCoder/CFMaskCoder
1453-
if "add_offset" in encoding or "scale_factor" in encoding:
1454-
dtype = data.dtype if data.dtype.kind == "f" else "float64"
1485+
# in the case of packed data we need to encode into
1486+
# float first, the correct dtype will be established
1487+
# via CFScaleOffsetCoder/CFMaskCoder
1488+
if "add_offset" in encoding or "scale_factor" in encoding:
1489+
dtype = data.dtype if data.dtype.kind == "f" else "float64"
14551490

1456-
else:
1457-
resolution, _ = np.datetime_data(variable.dtype)
1458-
dtype = np.int64
1459-
attrs_dtype = f"timedelta64[{resolution}]"
1460-
units = _numpy_dtype_to_netcdf_timeunit(variable.dtype)
1461-
safe_setitem(attrs, "dtype", attrs_dtype, name=name)
1462-
# Remove dtype encoding if it exists to prevent it from
1463-
# interfering downstream in NonStringCoder.
1464-
encoding.pop("dtype", None)
1465-
1466-
if any(
1467-
k in encoding for k in _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS
1468-
):
1469-
raise ValueError(
1470-
f"Specifying 'add_offset' or 'scale_factor' is not "
1471-
f"supported when encoding the timedelta64 values of "
1472-
f"variable {name!r} with xarray's new default "
1473-
f"timedelta64 encoding approach. To encode {name!r} "
1474-
f"with xarray's previous timedelta64 encoding "
1475-
f"approach, which supports the 'add_offset' and "
1476-
f"'scale_factor' parameters, additionally set "
1477-
f"encoding['units'] to a unit of time, e.g. "
1478-
f"'seconds'. To proceed with encoding of {name!r} "
1479-
f"via xarray's new approach, remove any encoding "
1480-
f"entries for 'add_offset' or 'scale_factor'."
1481-
)
1482-
if "_FillValue" not in encoding and "missing_value" not in encoding:
1483-
encoding["_FillValue"] = np.iinfo(np.int64).min
1491+
resolution, _ = np.datetime_data(variable.dtype)
1492+
attrs_dtype = f"timedelta64[{resolution}]"
1493+
safe_setitem(attrs, "dtype", attrs_dtype, name=name)
14841494

14851495
data, units = encode_cf_timedelta(data, units, dtype)
14861496
safe_setitem(attrs, "units", units, name=name)
@@ -1499,57 +1509,13 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
14991509
):
15001510
dims, data, attrs, encoding = unpack_for_decoding(variable)
15011511
units = pop_to(attrs, encoding, "units")
1502-
if is_dtype_decodable and self.decode_via_dtype:
1503-
if any(
1504-
k in encoding for k in _INVALID_LITERAL_TIMEDELTA64_ENCODING_KEYS
1505-
):
1506-
raise ValueError(
1507-
f"Decoding timedelta64 values via dtype is not "
1508-
f"supported when 'add_offset', or 'scale_factor' are "
1509-
f"present in encoding. Check the encoding parameters "
1510-
f"of variable {name!r}."
1511-
)
1512-
# Overwrite the on-disk dtype encoding, which is numeric, with
1513-
# the dtype attribute stored on disk, which corresponds to
1514-
# a timedelta64 dtype.
1515-
encoding["dtype"] = attrs.pop("dtype")
1516-
dtype = np.dtype(encoding["dtype"])
1517-
resolution, _ = np.datetime_data(dtype)
1518-
resolution = cast(NPDatetimeUnitOptions, resolution)
1519-
if np.timedelta64(1, resolution) > np.timedelta64(1, "s"):
1520-
time_unit = cast(PDDatetimeUnitOptions, "s")
1521-
dtype = np.dtype("timedelta64[s]")
1522-
message = (
1523-
f"Following pandas, xarray only supports decoding to "
1524-
f"timedelta64 values with a resolution of 's', 'ms', "
1525-
f"'us', or 'ns'. Encoded values for variable {name!r} "
1526-
f"have a resolution of {resolution!r}. Attempting to "
1527-
f"decode to a resolution of 's'. Note, depending on "
1528-
f"the encoded values, this may lead to an "
1529-
f"OverflowError. Additionally, data will not be "
1530-
f"identically round tripped; xarray will choose an "
1531-
f"encoding dtype of 'timedelta64[s]' when re-encoding."
1532-
)
1533-
emit_user_level_warning(message)
1534-
elif np.timedelta64(1, resolution) < np.timedelta64(1, "ns"):
1535-
time_unit = cast(PDDatetimeUnitOptions, "ns")
1536-
dtype = np.dtype("timedelta64[ns]")
1537-
message = (
1538-
f"Following pandas, xarray only supports decoding to "
1539-
f"timedelta64 values with a resolution of 's', 'ms', "
1540-
f"'us', or 'ns'. Encoded values for variable {name!r} "
1541-
f"have a resolution of {resolution!r}. Attempting to "
1542-
f"decode to a resolution of 'ns'. Note, depending on "
1543-
f"the encoded values, this may lead to loss of "
1544-
f"precision. Additionally, data will not be "
1545-
f"identically round tripped; xarray will choose an "
1546-
f"encoding dtype of 'timedelta64[ns]' "
1547-
f"when re-encoding."
1548-
)
1549-
emit_user_level_warning(message)
1512+
if is_dtype_decodable:
1513+
attrs_dtype = attrs.pop("dtype")
1514+
if self.time_unit is None:
1515+
time_unit = resolve_time_unit_from_attrs_dtype(attrs_dtype, name)
15501516
else:
1551-
time_unit = cast(PDDatetimeUnitOptions, resolution)
1552-
elif self.decode_via_units:
1517+
time_unit = self.time_unit
1518+
else:
15531519
if self._emit_decode_timedelta_future_warning:
15541520
emit_user_level_warning(
15551521
"In a future version, xarray will not decode "
@@ -1567,8 +1533,19 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
15671533
"'CFTimedeltaCoder' instance.",
15681534
FutureWarning,
15691535
)
1570-
dtype = np.dtype(f"timedelta64[{self.time_unit}]")
1571-
time_unit = self.time_unit
1536+
if self.time_unit is None:
1537+
time_unit = cast(PDDatetimeUnitOptions, "ns")
1538+
else:
1539+
time_unit = self.time_unit
1540+
1541+
# Handle edge case that decode_via_dtype=False and
1542+
# decode_via_units=True, and timedeltas were encoded with a
1543+
# dtype attribute. We need to remove the dtype attribute
1544+
# to prevent an error during round tripping.
1545+
if has_timedelta_dtype:
1546+
attrs.pop("dtype")
1547+
1548+
dtype = np.dtype(f"timedelta64[{time_unit}]")
15721549
transform = partial(decode_cf_timedelta, units=units, time_unit=time_unit)
15731550
data = lazy_elemwise_func(data, transform, dtype=dtype)
15741551
return Variable(dims, data, attrs, encoding, fastpath=True)

xarray/tests/test_backends.py

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
from xarray.conventions import encode_dataset_coordinates
5757
from xarray.core import indexing
5858
from xarray.core.options import set_options
59+
from xarray.core.types import PDDatetimeUnitOptions
5960
from xarray.core.utils import module_available
6061
from xarray.namedarray.pycompat import array_type
6162
from xarray.tests import (
@@ -351,16 +352,6 @@ def test_dtype_coercion_error(self) -> None:
351352
with pytest.raises(ValueError, match="could not safely cast"):
352353
ds.to_netcdf(path, format=format)
353354

354-
def test_literal_timedelta_fill_value_coercion_error(self) -> None:
355-
for format in self.netcdf3_formats:
356-
timedeltas = np.array(
357-
[0, np.iinfo(np.int32).min, np.iinfo(np.int64).min]
358-
).astype("timedelta64[s]")
359-
ds = Dataset({"timedeltas": ("timedeltas", timedeltas)})
360-
with create_tmp_file(allow_cleanup_failure=False) as path:
361-
with pytest.raises(ValueError, match="_FillValue"):
362-
ds.to_netcdf(path, format=format)
363-
364355

365356
class DatasetIOBase:
366357
engine: T_NetcdfEngine | None = None
@@ -652,8 +643,10 @@ def test_roundtrip_timedelta_data(self) -> None:
652643
) as actual:
653644
assert_identical(expected, actual)
654645

655-
def test_roundtrip_literal_timedelta_data(self) -> None:
656-
time_deltas = pd.to_timedelta(["1h", "2h", "NaT"]).as_unit("s") # type: ignore[arg-type, unused-ignore]
646+
def test_roundtrip_timedelta_data_via_dtype(
647+
self, time_unit: PDDatetimeUnitOptions
648+
) -> None:
649+
time_deltas = pd.to_timedelta(["1h", "2h", "NaT"]).as_unit(time_unit) # type: ignore[arg-type, unused-ignore]
657650
expected = Dataset(
658651
{"td": ("td", time_deltas), "td0": time_deltas[0].to_numpy()}
659652
)

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy