Skip to content

Commit a260ae9

Browse files
(fix): structured arrays for v2 (#2681)
Co-authored-by: Martin Durant <martindurant@users.noreply.github.com>
1 parent e9772ac commit a260ae9

File tree

7 files changed

+56
-9
lines changed

7 files changed

+56
-9
lines changed

docs/release-notes.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ Unreleased
77
Bug fixes
88
~~~~~~~~~
99

10+
* Backwards compatibility for Zarr format 2 structured arrays (:issue:`2134`)
11+
1012
Features
1113
~~~~~~~~
1214

docs/user-guide/config.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ This is the current default configuration::
5353
'level': 0}},
5454
'v2_default_filters': {'bytes': [{'id': 'vlen-bytes'}],
5555
'numeric': None,
56+
'raw': None,
5657
'string': [{'id': 'vlen-utf8'}]},
5758
'v3_default_compressors': {'bytes': [{'configuration': {'checksum': False,
5859
'level': 0},

src/zarr/core/buffer/core.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -470,7 +470,9 @@ def all_equal(self, other: Any, equal_nan: bool = True) -> bool:
470470
# every single time we have to write data?
471471
_data, other = np.broadcast_arrays(self._data, other)
472472
return np.array_equal(
473-
self._data, other, equal_nan=equal_nan if self._data.dtype.kind not in "USTO" else False
473+
self._data,
474+
other,
475+
equal_nan=equal_nan if self._data.dtype.kind not in "USTOV" else False,
474476
)
475477

476478
def fill(self, value: Any) -> None:

src/zarr/core/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ def reset(self) -> None:
7575
"numeric": None,
7676
"string": [{"id": "vlen-utf8"}],
7777
"bytes": [{"id": "vlen-bytes"}],
78+
"raw": None,
7879
},
7980
"v3_default_filters": {"numeric": [], "string": [], "bytes": []},
8081
"v3_default_serializer": {

src/zarr/core/metadata/v2.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,14 @@ def to_dict(self) -> dict[str, JSON]:
193193
zarray_dict["fill_value"] = fill_value
194194

195195
_ = zarray_dict.pop("dtype")
196-
zarray_dict["dtype"] = self.dtype.str
196+
dtype_json: JSON
197+
# In the case of zarr v2, the simplest i.e., '|VXX' dtype is represented as a string
198+
dtype_descr = self.dtype.descr
199+
if self.dtype.kind == "V" and dtype_descr[0][0] != "" and len(dtype_descr) != 0:
200+
dtype_json = tuple(self.dtype.descr)
201+
else:
202+
dtype_json = self.dtype.str
203+
zarray_dict["dtype"] = dtype_json
197204

198205
return zarray_dict
199206

@@ -220,6 +227,8 @@ def update_attributes(self, attributes: dict[str, JSON]) -> Self:
220227

221228

222229
def parse_dtype(data: npt.DTypeLike) -> np.dtype[Any]:
230+
if isinstance(data, list): # this is a valid _VoidDTypeLike check
231+
data = [tuple(d) for d in data]
223232
return np.dtype(data)
224233

225234

@@ -376,8 +385,10 @@ def _default_filters(
376385
dtype_key = "numeric"
377386
elif dtype.kind in "U":
378387
dtype_key = "string"
379-
elif dtype.kind in "OSV":
388+
elif dtype.kind in "OS":
380389
dtype_key = "bytes"
390+
elif dtype.kind == "V":
391+
dtype_key = "raw"
381392
else:
382393
raise ValueError(f"Unsupported dtype kind {dtype.kind}")
383394

tests/test_config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ def test_config_defaults_set() -> None:
6161
"numeric": None,
6262
"string": [{"id": "vlen-utf8"}],
6363
"bytes": [{"id": "vlen-bytes"}],
64+
"raw": None,
6465
},
6566
"v3_default_filters": {"numeric": [], "string": [], "bytes": []},
6667
"v3_default_serializer": {

tests/test_v2.py

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,15 @@ def test_codec_pipeline() -> None:
8484
np.testing.assert_array_equal(result, expected)
8585

8686

87-
@pytest.mark.parametrize("dtype", ["|S", "|V"])
88-
async def test_v2_encode_decode(dtype):
87+
@pytest.mark.parametrize(
88+
("dtype", "expected_dtype", "fill_value", "fill_value_encoding"),
89+
[
90+
("|S", "|S0", b"X", "WA=="),
91+
("|V", "|V0", b"X", "WA=="),
92+
("|V10", "|V10", b"X", "WAAAAAAAAAAAAA=="),
93+
],
94+
)
95+
async def test_v2_encode_decode(dtype, expected_dtype, fill_value, fill_value_encoding) -> None:
8996
with config.set(
9097
{
9198
"array.v2_default_filters.bytes": [{"id": "vlen-bytes"}],
@@ -95,7 +102,7 @@ async def test_v2_encode_decode(dtype):
95102
store = zarr.storage.MemoryStore()
96103
g = zarr.group(store=store, zarr_format=2)
97104
g.create_array(
98-
name="foo", shape=(3,), chunks=(3,), dtype=dtype, fill_value=b"X", compressor=None
105+
name="foo", shape=(3,), chunks=(3,), dtype=dtype, fill_value=fill_value, compressor=None
99106
)
100107

101108
result = await store.get("foo/.zarray", zarr.core.buffer.default_buffer_prototype())
@@ -105,9 +112,9 @@ async def test_v2_encode_decode(dtype):
105112
expected = {
106113
"chunks": [3],
107114
"compressor": None,
108-
"dtype": f"{dtype}0",
109-
"fill_value": "WA==",
110-
"filters": [{"id": "vlen-bytes"}],
115+
"dtype": expected_dtype,
116+
"fill_value": fill_value_encoding,
117+
"filters": [{"id": "vlen-bytes"}] if dtype == "|S" else None,
111118
"order": "C",
112119
"shape": [3],
113120
"zarr_format": 2,
@@ -284,3 +291,25 @@ def test_default_filters_and_compressor(dtype_expected: Any) -> None:
284291
assert arr.metadata.compressor.codec_id == expected_compressor
285292
if expected_filter is not None:
286293
assert arr.metadata.filters[0].codec_id == expected_filter
294+
295+
296+
@pytest.mark.parametrize("fill_value", [None, (b"", 0, 0.0)], ids=["no_fill", "fill"])
297+
def test_structured_dtype_roundtrip(fill_value, tmp_path) -> None:
298+
a = np.array(
299+
[(b"aaa", 1, 4.2), (b"bbb", 2, 8.4), (b"ccc", 3, 12.6)],
300+
dtype=[("foo", "S3"), ("bar", "i4"), ("baz", "f8")],
301+
)
302+
array_path = tmp_path / "data.zarr"
303+
za = zarr.create(
304+
shape=(3,),
305+
store=array_path,
306+
chunks=(2,),
307+
fill_value=fill_value,
308+
zarr_format=2,
309+
dtype=a.dtype,
310+
)
311+
if fill_value is not None:
312+
assert (np.array([fill_value] * a.shape[0], dtype=a.dtype) == za[:]).all()
313+
za[...] = a
314+
za = zarr.open_array(store=array_path)
315+
assert (a == za[:]).all()

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

pFad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy