From 5de5214a09197cea4b2093844c07f7007b39d4c9 Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Thu, 7 Mar 2024 11:32:38 -0600 Subject: [PATCH 01/13] io: Table-only version of Orange on-disk format (HDF5) --- Orange/data/io.py | 95 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 94 insertions(+), 1 deletion(-) diff --git a/Orange/data/io.py b/Orange/data/io.py index 0959bb725c2..4c3de870043 100644 --- a/Orange/data/io.py +++ b/Orange/data/io.py @@ -1,5 +1,6 @@ import contextlib import csv +import json import locale import pickle import re @@ -18,13 +19,15 @@ from urllib.request import urlopen, Request from pathlib import Path +import h5py import numpy as np import xlrd import xlsxwriter import openpyxl -from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin +from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin, DiscreteVariable, TimeVariable, \ + StringVariable from Orange.data import Compression, open_compressed, detect_encoding, \ isnastr, guess_data_type, sanitize_variable from Orange.data.io_base import FileFormatBase, Flags, DataTableMixin, PICKLE_PROTOCOL @@ -511,3 +514,93 @@ def _suggest_filename(self, content_disposition): matches = re.findall(r"filename\*?=(?:\"|.{0,10}?'[^']*')([^\"]+)", content_disposition or '') return urlunquote(matches[-1]) if matches else default_name + +class HDF5Reader(FileFormat): + """Reader for Orange HDF5 files""" + EXTENSIONS = ('.hdf5',) + DESCRIPTION = 'Orange on-disk data' + SUPPORT_COMPRESSED = False + SUPPORT_SPARSE_DATA = False + + def read(self): + h5file = f = h5py.File(self.filename, "r") + + def read_domain(sub): + d = f['domain'] + subdomain = d[sub].asstr() if sub in d else [] + subdomain_args = d[f'{sub}_args'].asstr() \ + if f'{sub}_args' in d else ['{}'] * len(subdomain) + for attr, args in zip(subdomain, subdomain_args): + yield attr[0], attr[1], json.loads(args) + + def make_var(name, header, args): + var_cls = [var for var in (ContinuousVariable, + DiscreteVariable, + StringVariable, + TimeVariable) if header in var.TYPE_HEADERS][0] + new_var = var_cls(name, **{key: val for key, val in args.items() + if key != "attributes"}) + new_var.attributes = args.get("attributes", {}) + return new_var + + def read_hdf5(name, as_str=False): + if name in f: + if as_str: + return f[name].asstr()[:] + return f[name] + return None + + assert 'domain' in f + + domain = Domain(*[[make_var(*args) for args in read_domain(subdomain)] + for subdomain in ['attributes', 'class_vars', 'metas']]) + + X = read_hdf5("X") + Y = read_hdf5("Y") + + + if len(domain.metas) > 1: + metas = np.hstack([read_hdf5(f'metas/{i}', + isinstance(attr, StringVariable)) + for i, attr in enumerate(domain.metas)]) + elif len(domain.metas) == 1: + metas = read_hdf5('metas/0', + isinstance(domain.metas[0], StringVariable) + ) + else: + metas = None + + table = Table.from_numpy(domain, X, Y, metas) + if isinstance(self.filename, str): + table.name = path.splitext(path.split(self.filename)[-1])[0] + + return table + + @classmethod + def write_file(cls, filename, data): + def parse(attr): + params = (attr.name, attr.TYPE_HEADERS[1], {"attributes": attr.attributes}) + if isinstance(attr, DiscreteVariable): + params[2].update(values=attr.values) + elif isinstance(attr, TimeVariable): + params[2].update(have_date=attr.have_date, + have_time=attr.have_time) + elif isinstance(attr, ContinuousVariable): + params[2].update(number_of_decimals=attr.number_of_decimals) + return params + + with h5py.File(filename, 'w') as f: + for subdomain in ['attributes', 'class_vars', 'metas']: + parsed = [parse(feature) for feature in getattr(data.domain, subdomain)] + domain = np.array([[name, header] for name, header, _ in parsed], 'S') + domain_args = np.array([json.dumps(args) for *_, args in parsed], 'S') + f.create_dataset(f'domain/{subdomain}', data=domain) + f.create_dataset(f'domain/{subdomain}_args', data=domain_args) + f.create_dataset("X", data=data.X) + if data.Y.size: + f.create_dataset("Y", data=data.Y) + if data.metas.size: + for i, attr in enumerate(data.domain.metas): + col_type = 'S' if isinstance(attr, StringVariable) else 'f' + col_data = data.metas[:, [i]].astype(col_type) + f.create_dataset(f'metas/{i}', data=col_data) From 562704a3b79b6d2a6b34fca25f18afd10e54051f Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Thu, 7 Mar 2024 11:36:26 -0600 Subject: [PATCH 02/13] Add h5py dependency --- requirements-core.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements-core.txt b/requirements-core.txt index eb52c628c69..81ae80b515d 100644 --- a/requirements-core.txt +++ b/requirements-core.txt @@ -25,3 +25,5 @@ xgboost>=1.7.4; sys_platform!="darwin" xlrd>=1.2.0 # Writing Excel Files xlsxwriter +# HDF5 binary data format +h5py \ No newline at end of file From 0e77deee09ac43ed264e3fbcf12b6cc61b0694ba Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Thu, 7 Mar 2024 11:55:16 -0600 Subject: [PATCH 03/13] io.test: roundtrip test for Orange HDF5 --- Orange/data/tests/test_io.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/Orange/data/tests/test_io.py b/Orange/data/tests/test_io.py index 01187f26b30..21dec385dfa 100644 --- a/Orange/data/tests/test_io.py +++ b/Orange/data/tests/test_io.py @@ -6,9 +6,10 @@ from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable, \ TimeVariable, Domain, Table -from Orange.data.io import TabReader, ExcelReader +from Orange.data.io import TabReader, ExcelReader, HDF5Reader from Orange.data.io_util import guess_data_type from Orange.misc.collections import natural_sorted +from Orange.tests import named_file class TestTableFilters(unittest.TestCase): @@ -155,6 +156,16 @@ def test_roundtrip_xlsx(self): finally: os.remove(fname) + def test_roundtrip_hdf5(self): + with named_file('', suffix='.hdf5') as fn: + HDF5Reader.write(fn, self.data) + data = HDF5Reader(fn).read() + np.testing.assert_equal(data.X, self.data.X) + np.testing.assert_equal(data.Y, self.data.Y) + np.testing.assert_equal(data.metas[:2], self.data.metas[:2]) + self.assertEqual(data.metas[2, 0], "") + np.testing.assert_equal(data.domain, self.data.domain) + if __name__ == "__main__": unittest.main() From c58913c7797967c89f6e9102d918cc64fb86ec41 Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Thu, 7 Mar 2024 14:15:24 -0600 Subject: [PATCH 04/13] Fix NaN handling for strings (ref #6670) In fixing this, switched string handling from fixed-length to variable length https://docs.h5py.org/en/stable/special.html#variable-length-strings --- Orange/data/io.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Orange/data/io.py b/Orange/data/io.py index 4c3de870043..8de8a08031d 100644 --- a/Orange/data/io.py +++ b/Orange/data/io.py @@ -21,6 +21,7 @@ import h5py import numpy as np +import pandas as pd import xlrd import xlsxwriter @@ -601,6 +602,8 @@ def parse(attr): f.create_dataset("Y", data=data.Y) if data.metas.size: for i, attr in enumerate(data.domain.metas): - col_type = 'S' if isinstance(attr, StringVariable) else 'f' + col_type = h5py.string_dtype() if isinstance(attr, StringVariable) else 'f' col_data = data.metas[:, [i]].astype(col_type) - f.create_dataset(f'metas/{i}', data=col_data) + if col_type is not 'f': + col_data[pd.isnull(col_data)] = "" + f.create_dataset(f'metas/{i}', data=col_data, dtype=col_type) From 0cc213ccd1d44d8cae8db03ebcab5a02e55a2e03 Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Fri, 26 Apr 2024 11:00:07 -0600 Subject: [PATCH 05/13] Add Orange identifier and versions as attributes Fix small 'is not' bug --- Orange/data/io.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/Orange/data/io.py b/Orange/data/io.py index 8de8a08031d..d1f7bfa3a79 100644 --- a/Orange/data/io.py +++ b/Orange/data/io.py @@ -34,7 +34,7 @@ from Orange.data.io_base import FileFormatBase, Flags, DataTableMixin, PICKLE_PROTOCOL from Orange.util import flatten - +from Orange.version import short_version as ORANGE_VERSION # Support values longer than 128K (i.e. text contents features) csv.field_size_limit(100*1024*1024) @@ -551,7 +551,10 @@ def read_hdf5(name, as_str=False): return f[name] return None - assert 'domain' in f + try: + assert f.attrs['creator'] == "Orange" + except KeyError: + assert 'domain' in f domain = Domain(*[[make_var(*args) for args in read_domain(subdomain)] for subdomain in ['attributes', 'class_vars', 'metas']]) @@ -591,6 +594,10 @@ def parse(attr): return params with h5py.File(filename, 'w') as f: + f.attrs['creator'] = "Orange" + f.attrs['Orange_version'] = ORANGE_VERSION + f.attrs['HDF5_Version'] = h5py.version.hdf5_version + f.attrs['h5py_version'] = h5py.version.version for subdomain in ['attributes', 'class_vars', 'metas']: parsed = [parse(feature) for feature in getattr(data.domain, subdomain)] domain = np.array([[name, header] for name, header, _ in parsed], 'S') @@ -604,6 +611,6 @@ def parse(attr): for i, attr in enumerate(data.domain.metas): col_type = h5py.string_dtype() if isinstance(attr, StringVariable) else 'f' col_data = data.metas[:, [i]].astype(col_type) - if col_type is not 'f': + if col_type != 'f': col_data[pd.isnull(col_data)] = "" f.create_dataset(f'metas/{i}', data=col_data, dtype=col_type) From c1128a9eeef26280bc5da5ec60e87d73a444e087 Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Thu, 2 May 2024 13:55:34 -0600 Subject: [PATCH 06/13] io: Use existing .metadata sidecar to hold Table.attributes --- Orange/data/io.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Orange/data/io.py b/Orange/data/io.py index d1f7bfa3a79..298b4e1ba36 100644 --- a/Orange/data/io.py +++ b/Orange/data/io.py @@ -577,7 +577,7 @@ def read_hdf5(name, as_str=False): table = Table.from_numpy(domain, X, Y, metas) if isinstance(self.filename, str): table.name = path.splitext(path.split(self.filename)[-1])[0] - + self.set_table_metadata(self.filename, table) return table @classmethod @@ -614,3 +614,4 @@ def parse(attr): if col_type != 'f': col_data[pd.isnull(col_data)] = "" f.create_dataset(f'metas/{i}', data=col_data, dtype=col_type) + cls.write_table_metadata(filename, data) From 9f5e620c82a657b467a0f4e6bdb0a48ee4c430d2 Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Thu, 2 May 2024 14:01:55 -0600 Subject: [PATCH 07/13] io: use context manager for HDF5 reading --- Orange/data/io.py | 52 +++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/Orange/data/io.py b/Orange/data/io.py index 298b4e1ba36..534fb8b0170 100644 --- a/Orange/data/io.py +++ b/Orange/data/io.py @@ -524,8 +524,6 @@ class HDF5Reader(FileFormat): SUPPORT_SPARSE_DATA = False def read(self): - h5file = f = h5py.File(self.filename, "r") - def read_domain(sub): d = f['domain'] subdomain = d[sub].asstr() if sub in d else [] @@ -551,32 +549,32 @@ def read_hdf5(name, as_str=False): return f[name] return None - try: - assert f.attrs['creator'] == "Orange" - except KeyError: - assert 'domain' in f - - domain = Domain(*[[make_var(*args) for args in read_domain(subdomain)] - for subdomain in ['attributes', 'class_vars', 'metas']]) - - X = read_hdf5("X") - Y = read_hdf5("Y") - - - if len(domain.metas) > 1: - metas = np.hstack([read_hdf5(f'metas/{i}', - isinstance(attr, StringVariable)) - for i, attr in enumerate(domain.metas)]) - elif len(domain.metas) == 1: - metas = read_hdf5('metas/0', - isinstance(domain.metas[0], StringVariable) - ) - else: - metas = None + with h5py.File(self.filename, "r") as f: + try: + assert f.attrs['creator'] == "Orange" + except KeyError: + assert 'domain' in f + + domain = Domain(*[[make_var(*args) for args in read_domain(subdomain)] + for subdomain in ['attributes', 'class_vars', 'metas']]) + + X = read_hdf5("X") + Y = read_hdf5("Y") + + if len(domain.metas) > 1: + metas = np.hstack([read_hdf5(f'metas/{i}', + isinstance(attr, StringVariable)) + for i, attr in enumerate(domain.metas)]) + elif len(domain.metas) == 1: + metas = read_hdf5('metas/0', + isinstance(domain.metas[0], StringVariable) + ) + else: + metas = None - table = Table.from_numpy(domain, X, Y, metas) - if isinstance(self.filename, str): - table.name = path.splitext(path.split(self.filename)[-1])[0] + table = Table.from_numpy(domain, X, Y, metas) + if isinstance(self.filename, str): + table.name = path.splitext(path.split(self.filename)[-1])[0] self.set_table_metadata(self.filename, table) return table From a81d91f67215c3011670a1b928aa6e2f7a424c6d Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Mon, 27 May 2024 09:21:02 -0600 Subject: [PATCH 08/13] lint --- Orange/data/io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Orange/data/io.py b/Orange/data/io.py index 534fb8b0170..4f0e7b29f05 100644 --- a/Orange/data/io.py +++ b/Orange/data/io.py @@ -27,8 +27,8 @@ import xlsxwriter import openpyxl -from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin, DiscreteVariable, TimeVariable, \ - StringVariable +from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin, DiscreteVariable, \ + TimeVariable, StringVariable from Orange.data import Compression, open_compressed, detect_encoding, \ isnastr, guess_data_type, sanitize_variable from Orange.data.io_base import FileFormatBase, Flags, DataTableMixin, PICKLE_PROTOCOL From 52678c4da74e73ae7ef76a668def7b2d7c278de7 Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Fri, 27 Sep 2024 14:36:30 -0600 Subject: [PATCH 09/13] Use h5py.string_dtype() for all string arrays --- Orange/data/io.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Orange/data/io.py b/Orange/data/io.py index 4f0e7b29f05..97251bd3d0a 100644 --- a/Orange/data/io.py +++ b/Orange/data/io.py @@ -596,10 +596,11 @@ def parse(attr): f.attrs['Orange_version'] = ORANGE_VERSION f.attrs['HDF5_Version'] = h5py.version.hdf5_version f.attrs['h5py_version'] = h5py.version.version + str_dtype = h5py.string_dtype() for subdomain in ['attributes', 'class_vars', 'metas']: parsed = [parse(feature) for feature in getattr(data.domain, subdomain)] - domain = np.array([[name, header] for name, header, _ in parsed], 'S') - domain_args = np.array([json.dumps(args) for *_, args in parsed], 'S') + domain = np.array([[name, header] for name, header, _ in parsed], dtype=str_dtype) + domain_args = np.array([json.dumps(args) for *_, args in parsed], dtype=str_dtype) f.create_dataset(f'domain/{subdomain}', data=domain) f.create_dataset(f'domain/{subdomain}_args', data=domain_args) f.create_dataset("X", data=data.X) @@ -607,7 +608,7 @@ def parse(attr): f.create_dataset("Y", data=data.Y) if data.metas.size: for i, attr in enumerate(data.domain.metas): - col_type = h5py.string_dtype() if isinstance(attr, StringVariable) else 'f' + col_type = str_dtype if isinstance(attr, StringVariable) else 'f' col_data = data.metas[:, [i]].astype(col_type) if col_type != 'f': col_data[pd.isnull(col_data)] = "" From 6f45447cb31439b591511f11b098018cf9ca4d97 Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Thu, 5 Feb 2026 15:37:18 -0600 Subject: [PATCH 10/13] io: Test metadata writer --- Orange/data/tests/test_io.py | 47 ++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/Orange/data/tests/test_io.py b/Orange/data/tests/test_io.py index 21dec385dfa..333f9f0f069 100644 --- a/Orange/data/tests/test_io.py +++ b/Orange/data/tests/test_io.py @@ -7,6 +7,7 @@ from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable, \ TimeVariable, Domain, Table from Orange.data.io import TabReader, ExcelReader, HDF5Reader +from Orange.data.io_base import PICKLE_PROTOCOL from Orange.data.io_util import guess_data_type from Orange.misc.collections import natural_sorted from Orange.tests import named_file @@ -167,5 +168,51 @@ def test_roundtrip_hdf5(self): np.testing.assert_equal(data.domain, self.data.domain) +class TestWriterMetadata(unittest.TestCase): + def setUp(self): + TestWriters.setUp(self) + self.data.attributes.update({ + "Name": "Test dataset", + "Description": "This is a test dataset.", + "Author": "Unit Tester", + "Year": "2024", + "Reference": "None" + }) + + def test_metadata_string(self): + with NamedTemporaryFile(suffix=".tab", delete=False) as f: + fname = f.name + try: + TabReader.write(fname, self.data) + with open(fname + ".metadata", encoding="utf-8") as f: + content = f.read() + self.assertIn("Name: Test dataset", content) + self.assertIn("Description: This is a test dataset.", content) + self.assertIn("Author: Unit Tester", content) + self.assertIn("Year: 2024", content) + self.assertIn("Reference: None", content) + table = TabReader(fname).read() + self.assertEqual(table.attributes, self.data.attributes) + finally: + os.remove(fname) + os.remove(fname + ".metadata") + + def test_metadata_pickle(self): + data = self.data.copy() + data.attributes["CustomAttr"] = {"key1": "value1", "key2": 2} + with NamedTemporaryFile(suffix=".tab", delete=False) as f: + fname = f.name + try: + TabReader.write(fname, data) + with open(fname + ".metadata", 'rb') as f: + pickle = f.read(2) + self.assertEqual(pickle, b'\x80' + bytes([PICKLE_PROTOCOL])) + table = TabReader(fname).read() + self.assertEqual(table.attributes, data.attributes) + finally: + os.remove(fname) + os.remove(fname + ".metadata") + + if __name__ == "__main__": unittest.main() From 7c9103d87948d9925dda866808ea9edb6fa155f0 Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Thu, 5 Feb 2026 16:12:47 -0600 Subject: [PATCH 11/13] io: HDF5Reader handle metadata without external file --- Orange/data/io.py | 26 ++++++++++++++++++++++++++ Orange/data/tests/test_io.py | 12 ++++++++++++ 2 files changed, 38 insertions(+) diff --git a/Orange/data/io.py b/Orange/data/io.py index 97251bd3d0a..e63a714507b 100644 --- a/Orange/data/io.py +++ b/Orange/data/io.py @@ -614,3 +614,29 @@ def parse(attr): col_data[pd.isnull(col_data)] = "" f.create_dataset(f'metas/{i}', data=col_data, dtype=col_type) cls.write_table_metadata(filename, data) + + @classmethod + def write_table_metadata(cls, filename, data): + with h5py.File(filename, 'r+') as f: + metadata_group = f.require_group('metadata') + str_dtype = h5py.string_dtype() + for key, value in data.attributes.items(): + if not isinstance(value, str): + value = json.dumps(value) + metadata_group.create_dataset(key, data=value, dtype=str_dtype) + + @classmethod + def set_table_metadata(cls, filename, data): + with h5py.File(filename, 'r') as f: + if 'metadata' in f: + metadata_group = f['metadata'] + for key in metadata_group: + value = metadata_group[key][()] + if isinstance(value, bytes): + value = value.decode('utf-8') + if value.startswith('{') or value.startswith('['): + try: + value = json.loads(value) + except json.JSONDecodeError: + pass + data.attributes[key] = value diff --git a/Orange/data/tests/test_io.py b/Orange/data/tests/test_io.py index 333f9f0f069..271e69c7634 100644 --- a/Orange/data/tests/test_io.py +++ b/Orange/data/tests/test_io.py @@ -213,6 +213,18 @@ def test_metadata_pickle(self): os.remove(fname) os.remove(fname + ".metadata") + def test_metadata_hdf5(self): + data = self.data.copy() + data.attributes["CustomAttr"] = {"key1": "value1", "key2": 2} + with NamedTemporaryFile(suffix=".hdf5", delete=False) as f: + fname = f.name + try: + HDF5Reader.write(fname, data) + table = HDF5Reader(fname).read() + self.assertEqual(table.attributes, data.attributes) + finally: + os.remove(fname) + if __name__ == "__main__": unittest.main() From e873662388d8b16332fb5c72b114030527f56272 Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Fri, 6 Feb 2026 11:20:13 -0600 Subject: [PATCH 12/13] io: HDF5Reader add pickle .metadata fallback --- Orange/data/io.py | 21 ++++++++++++++------- Orange/data/tests/test_io.py | 22 ++++++++++++++++++++++ 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/Orange/data/io.py b/Orange/data/io.py index e63a714507b..ab4b5458851 100644 --- a/Orange/data/io.py +++ b/Orange/data/io.py @@ -617,13 +617,18 @@ def parse(attr): @classmethod def write_table_metadata(cls, filename, data): - with h5py.File(filename, 'r+') as f: - metadata_group = f.require_group('metadata') - str_dtype = h5py.string_dtype() - for key, value in data.attributes.items(): - if not isinstance(value, str): - value = json.dumps(value) - metadata_group.create_dataset(key, data=value, dtype=str_dtype) + try: + dump_dict = {key: value if isinstance(value, str) else json.dumps(value) + for key, value in data.attributes.items()} + except TypeError: + # some attribute is not JSON serializable, fall back to pickle .metadata file + super().write_table_metadata(filename, data) + else: + with h5py.File(filename, 'r+') as f: + metadata_group = f.require_group('metadata') + str_dtype = h5py.string_dtype() + for key, value in dump_dict.items(): + metadata_group.create_dataset(key, data=value, dtype=str_dtype) @classmethod def set_table_metadata(cls, filename, data): @@ -640,3 +645,5 @@ def set_table_metadata(cls, filename, data): except json.JSONDecodeError: pass data.attributes[key] = value + else: + super().set_table_metadata(filename, data) diff --git a/Orange/data/tests/test_io.py b/Orange/data/tests/test_io.py index 271e69c7634..accffab223b 100644 --- a/Orange/data/tests/test_io.py +++ b/Orange/data/tests/test_io.py @@ -168,6 +168,11 @@ def test_roundtrip_hdf5(self): np.testing.assert_equal(data.domain, self.data.domain) +class Unserializable: + def __init__(self, name): + self.name = name + + class TestWriterMetadata(unittest.TestCase): def setUp(self): TestWriters.setUp(self) @@ -225,6 +230,23 @@ def test_metadata_hdf5(self): finally: os.remove(fname) + def test_metadata_hdf5_pickle(self): + data = self.data.copy() + data.attributes["Unserializable"] = Unserializable(name="test") + with NamedTemporaryFile(suffix=".hdf5", delete=False) as f: + fname = f.name + try: + HDF5Reader.write(fname, data) + table = HDF5Reader(fname).read() + for key, value in table.attributes.items(): + if isinstance(value, Unserializable): + self.assertIsInstance(data.attributes[key], Unserializable) + else: + self.assertEqual(value, data.attributes[key]) + finally: + os.remove(fname) + os.remove(fname + ".metadata") + if __name__ == "__main__": unittest.main() From d514d66ed4f51d92ac01afeded34f7a57b1ded42 Mon Sep 17 00:00:00 2001 From: Stuart Read Date: Thu, 5 Mar 2026 15:23:15 -0600 Subject: [PATCH 13/13] io: HDF5Reader save pickle fallback in hdf5 file (no .metadata sidecar) --- Orange/data/io.py | 33 +++++++++++++++++++++------------ Orange/data/tests/test_io.py | 1 - 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/Orange/data/io.py b/Orange/data/io.py index ab4b5458851..be3dc74bca4 100644 --- a/Orange/data/io.py +++ b/Orange/data/io.py @@ -617,18 +617,22 @@ def parse(attr): @classmethod def write_table_metadata(cls, filename, data): - try: - dump_dict = {key: value if isinstance(value, str) else json.dumps(value) - for key, value in data.attributes.items()} - except TypeError: - # some attribute is not JSON serializable, fall back to pickle .metadata file - super().write_table_metadata(filename, data) - else: - with h5py.File(filename, 'r+') as f: - metadata_group = f.require_group('metadata') - str_dtype = h5py.string_dtype() - for key, value in dump_dict.items(): - metadata_group.create_dataset(key, data=value, dtype=str_dtype) + dump_dict = {} + for key, value in data.attributes.items(): + if isinstance(value, str): + dump_dict[key] = value + else: + try: + dump_dict[key] = json.dumps(value) + except TypeError: + # value is not JSON serializable, fall back to pickle + dump_dict[key] = pickle.dumps(value, protocol=PICKLE_PROTOCOL).hex() + + with h5py.File(filename, 'r+') as f: + metadata_group = f.require_group('metadata') + str_dtype = h5py.string_dtype() + for key, value in dump_dict.items(): + metadata_group.create_dataset(key, data=value, dtype=str_dtype) @classmethod def set_table_metadata(cls, filename, data): @@ -644,6 +648,11 @@ def set_table_metadata(cls, filename, data): value = json.loads(value) except json.JSONDecodeError: pass + elif value.startswith(f"80{PICKLE_PROTOCOL:02x}"): + try: + value = pickle.loads(bytes.fromhex(value)) + except (pickle.UnpicklingError, ValueError): + pass data.attributes[key] = value else: super().set_table_metadata(filename, data) diff --git a/Orange/data/tests/test_io.py b/Orange/data/tests/test_io.py index accffab223b..5f0c904e177 100644 --- a/Orange/data/tests/test_io.py +++ b/Orange/data/tests/test_io.py @@ -245,7 +245,6 @@ def test_metadata_hdf5_pickle(self): self.assertEqual(value, data.attributes[key]) finally: os.remove(fname) - os.remove(fname + ".metadata") if __name__ == "__main__":