Skip to content

[Bug]: Using structured arrays behaves differently between Zarr and HDF5 #273

Open
@rcpeene

Description

@rcpeene

What happened?

I'm generating an NWB to encode task rewards. My code is passing a numpy structured array to be added as a timeseries. I'm not sure if this is allowed or not, but nonetheless the behavior seems to differ between Zarr and HDF5.

Steps to Reproduce

Running the following code on a Zarr NWB vs an HDF5 NWB, where rewards contains multiple arrays of equal length.

with io_class(str(result_nwb_path), "r+") as io:
    nwb_file = io.read()

    # Make a structured array with named fields
    reward_data = np.array(
        list(zip(rewards['volume'], rewards['autorewarded'].astype(bool))),
        dtype=[('volume', 'f4'), ('autorewarded', 'bool')]
    )

    timestamps = rewards['timestamps'].to_numpy()
    rewards_ts = TimeSeries(
        name='rewards_combined',
        data=reward_data,
        unit='mixed',
        timestamps=timestamps,
        description='Reward events with volume and autorewarded flag'
    )
    nwb_file.add_acquisition(rewards_ts)
    io.write(nwb_file)

Traceback

When running this on an HDF5 NWB, an error is given during file write:

  Traceback (most recent call last):
    File "/opt/conda/lib/python3.9/site-packages/hdmf/backends/hdf5/h5tools.py", line 1439, in __list_fill__
      dset = parent.create_dataset(name, shape=data_shape, dtype=dtype, **io_settings)
    File "/opt/conda/lib/python3.9/site-packages/h5py/_hl/group.py", line 186, in create_dataset
      dsid = dataset.make_new_dset(group, shape, dtype, data, name, **kwds)
    File "/opt/conda/lib/python3.9/site-packages/h5py/_hl/dataset.py", line 88, in make_new_dset
      tid = h5t.py_create(dtype, logical=1)
    File "h5py/h5t.pyx", line 1669, in h5py.h5t.py_create
    File "h5py/h5t.pyx", line 1693, in h5py.h5t.py_create
    File "h5py/h5t.pyx", line 1727, in h5py.h5t.py_create
    File "h5py/h5t.pyx", line 1522, in h5py.h5t._c_opaque
  ValueError: Size must be positive (size must be positive)

  The above exception was the direct cause of the following exception:

  Traceback (most recent call last):
    File "/tmp/nxf.23GfiOSmB4/capsule/code/run_capsule.py", line 226, in <module>
      if __name__ == "__main__": run()
    File "/tmp/nxf.23GfiOSmB4/capsule/code/run_capsule.py", line 222, in run
      io.write(nwb_file)
    File "/opt/conda/lib/python3.9/site-packages/hdmf/utils.py", line 577, in func_call
      return func(args[0], **pargs)
    File "/opt/conda/lib/python3.9/site-packages/hdmf/backends/hdf5/h5tools.py", line 395, in write
      super().write(**kwargs)
    File "/opt/conda/lib/python3.9/site-packages/hdmf/utils.py", line 577, in func_call
      return func(args[0], **pargs)
    File "/opt/conda/lib/python3.9/site-packages/hdmf/backends/io.py", line 99, in write
      self.write_builder(f_builder, **kwargs)
    File "/opt/conda/lib/python3.9/site-packages/hdmf/utils.py", line 577, in func_call
      return func(args[0], **pargs)
    File "/opt/conda/lib/python3.9/site-packages/hdmf/backends/hdf5/h5tools.py", line 834, in write_builder
      self.write_group(self.__file, gbldr, **kwargs)
    File "/opt/conda/lib/python3.9/site-packages/hdmf/utils.py", line 577, in func_call
      return func(args[0], **pargs)
    File "/opt/conda/lib/python3.9/site-packages/hdmf/backends/hdf5/h5tools.py", line 1013, in write_group
      self.write_group(group, sub_builder, **kwargs)
    File "/opt/conda/lib/python3.9/site-packages/hdmf/utils.py", line 577, in func_call
      return func(args[0], **pargs)
    File "/opt/conda/lib/python3.9/site-packages/hdmf/backends/hdf5/h5tools.py", line 1018, in write_group
      self.write_dataset(group, sub_builder, **kwargs)
    File "/opt/conda/lib/python3.9/site-packages/hdmf/utils.py", line 577, in func_call
      return func(args[0], **pargs)
    File "/opt/conda/lib/python3.9/site-packages/hdmf/backends/hdf5/h5tools.py", line 1286, in write_dataset
      dset = self.__list_fill__(parent, name, data, options)
    File "/opt/conda/lib/python3.9/site-packages/hdmf/backends/hdf5/h5tools.py", line 1443, in __list_fill__
      raise Exception(msg) from exc
  Exception: Could not create dataset data in /acquisition/rewards_combined with shape (210,), dtype <class 'numpy.void'>, and iosettings {}. Size must be positive (size must be positive)

When running on a Zarr NWB, writing completes, only showing

/opt/conda/lib/python3.9/site-packages/zarr/util.py:118: RuntimeWarning: divide by zero encountered in log10
  target_size = CHUNK_BASE * (2 ** np.log10(dset_size / (1024.0 * 1024)))

Then when reading the file later, like so:

nwb.acquisition['rewards_combined']

We get the error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
File /opt/conda/lib/python3.12/site-packages/IPython/core/formatters.py:344, in BaseFormatter.__call__(self, obj)
    342     method = get_real_method(obj, self.print_method)
    343     if method is not None:
--> 344         return method()
    345     return None
    346 else:

File /opt/conda/lib/python3.12/site-packages/hdmf/container.py:692, in Container._repr_html_(self)
    690 html_repr += "<div class='container-wrap'>"
    691 html_repr += f"<div class='container-header'><div class='xr-obj-type'><h3>{header_text}</h3></div></div>"
--> 692 html_repr += self._generate_html_repr(self.fields, is_field=True)
    693 html_repr += "</div>"
    694 return html_repr

File /opt/conda/lib/python3.12/site-packages/hdmf/container.py:706, in Container._generate_html_repr(self, fields, level, access_code, is_field)
    704             html_repr += value._generate_field_html(key, value, level, current_access_code)
    705         else:
--> 706             html_repr += self._generate_field_html(key, value, level, current_access_code)
    707 elif isinstance(fields, list):
    708     for index, item in enumerate(fields):

File /opt/conda/lib/python3.12/site-packages/pynwb/base.py:342, in TimeSeries._generate_field_html(self, key, value, level, access_code)
    339     linked_key = 'timestamps' if key == 'timestamp_link' else 'data'
    340     value = [find_location_in_memory_nwbfile(linked_key, v) for v in value]
--> 342 return super()._generate_field_html(key, value, level, access_code)

File /opt/conda/lib/python3.12/site-packages/hdmf/container.py:733, in Container._generate_field_html(self, key, value, level, access_code)
    730 is_array_data = hasattr(value, "shape") and hasattr(value, "dtype")
    732 if is_array_data:
--> 733     html_content = self._generate_array_html(value, level + 1)
    734 elif hasattr(value, "generate_html_repr"):
    735     html_content = value.generate_html_repr(level + 1, access_code)

File /opt/conda/lib/python3.12/site-packages/hdmf/container.py:775, in Container._generate_array_html(self, array, level)
    769     repr_html = generate_array_html_repr(array_info_dict, array.data, "DataIO")
    770 elif it_was_read_with_io:
    771     # The backend handles the representation here. Two special cases worth noting:
    772     # 1. Array-type attributes (e.g., start_frame in ImageSeries) remain NumPy arrays
    773     #    even when their parent container has an IO
    774     # 2. Data may have been modified after being read from storage
--> 775     repr_html = read_io.generate_dataset_html(array)
    776 else:  # Not sure which object could get here
    777     object_class = array.__class__.__name__

File /opt/conda/lib/python3.12/site-packages/hdmf/backends/io.py:195, in HDMFIO.generate_dataset_html(dataset)
    193 """Generates an html representation for a dataset"""
    194 array_info_dict = get_basic_array_info(dataset)
--> 195 repr_html = generate_array_html_repr(array_info_dict, dataset)
    197 return repr_html

File /opt/conda/lib/python3.12/site-packages/hdmf/utils.py:1023, in generate_array_html_repr(array_info_dict, array, dataset_type)
   1021 array_is_small = array_size_bytes < 1024 * 0.1 # 10 % a kilobyte to display the array
   1022 if array_is_small:
-> 1023     repr_html += "<br>" + str(np.asarray(array))
   1025 return repr_html

File /opt/conda/lib/python3.12/site-packages/zarr/core.py:589, in Array.__array__(self, *args)
    588 def __array__(self, *args):
--> 589     a = self[...]
    590     if args:
    591         a = a.astype(args[0])

File /opt/conda/lib/python3.12/site-packages/zarr/core.py:807, in Array.__getitem__(self, selection)
    805     result = self.vindex[selection]
    806 else:
--> 807     result = self.get_basic_selection(pure_selection, fields=fields)
    808 return result

File /opt/conda/lib/python3.12/site-packages/zarr/core.py:933, in Array.get_basic_selection(self, selection, out, fields)
    930     return self._get_basic_selection_zd(selection=selection, out=out,
    931                                         fields=fields)
    932 else:
--> 933     return self._get_basic_selection_nd(selection=selection, out=out,
    934                                         fields=fields)

File /opt/conda/lib/python3.12/site-packages/zarr/core.py:976, in Array._get_basic_selection_nd(self, selection, out, fields)
    970 def _get_basic_selection_nd(self, selection, out=None, fields=None):
    971     # implementation of basic selection for array with at least one dimension
    972 
    973     # setup indexer
    974     indexer = BasicIndexer(selection, self)
--> 976     return self._get_selection(indexer=indexer, out=out, fields=fields)

File /opt/conda/lib/python3.12/site-packages/zarr/core.py:1267, in Array._get_selection(self, indexer, out, fields)
   1261 if not hasattr(self.chunk_store, "getitems") or \
   1262    any(map(lambda x: x == 0, self.shape)):
   1263     # sequentially get one key at a time from storage
   1264     for chunk_coords, chunk_selection, out_selection in indexer:
   1265 
   1266         # load chunk selection into output array
-> 1267         self._chunk_getitem(chunk_coords, chunk_selection, out, out_selection,
   1268                             drop_axes=indexer.drop_axes, fields=fields)
   1269 else:
   1270     # allow storage to get multiple items at once
   1271     lchunk_coords, lchunk_selection, lout_selection = zip(*indexer)

File /opt/conda/lib/python3.12/site-packages/zarr/core.py:1978, in Array._chunk_getitem(self, chunk_coords, chunk_selection, out, out_selection, drop_axes, fields)
   1975         out[out_selection] = fill_value
   1977 else:
-> 1978     self._process_chunk(out, cdata, chunk_selection, drop_axes,
   1979                         out_is_ndarray, fields, out_selection)

File /opt/conda/lib/python3.12/site-packages/zarr/core.py:1886, in Array._process_chunk(self, out, cdata, chunk_selection, drop_axes, out_is_ndarray, fields, out_selection, partial_read_decode)
   1884     if isinstance(cdata, PartialReadBuffer):
   1885         cdata = cdata.read_full()
-> 1886     self._compressor.decode(cdata, dest)
   1887 else:
   1888     chunk = ensure_ndarray_like(cdata).view(self._dtype)

File numcodecs/blosc.pyx:585, in numcodecs.blosc.Blosc.decode()

File numcodecs/blosc.pyx:393, in numcodecs.blosc.decompress()

ValueError: destination buffer too small; expected at least 1050, got 0

Operating System

Windows

Python Version

3.13

Package Versions

Writing Environment:
h5py==3.13.0
hdmf-zarr==0.11.1
numpy==1.26.4
pynwb==3.0.0

Reading environment:
hdmf==3.14.6
hdmf_zarr==0.9.0

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions