# Orange/data/io.py: HDF5 support.
#
# Module-level names used by HDF5Reader. ``FileFormat`` and ``path`` are
# provided by a part of io.py that lies outside this excerpt.
import json
import pickle

import h5py
import numpy as np
import pandas as pd

from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin, \
    DiscreteVariable, TimeVariable, StringVariable
from Orange.data.io_base import FileFormatBase, Flags, DataTableMixin, PICKLE_PROTOCOL
from Orange.version import short_version as ORANGE_VERSION


class HDF5Reader(FileFormat):
    """Reader and writer for tables stored in Orange's HDF5 layout.

    Layout produced by `write_file`:

    - file attributes ``creator``, ``Orange_version``, ``HDF5_Version`` and
      ``h5py_version``;
    - ``domain/<part>`` and ``domain/<part>_args`` for each of ``attributes``,
      ``class_vars`` and ``metas``: (name, type code) pairs and one JSON
      document of constructor arguments per variable;
    - ``X``, optionally ``Y``, and one ``metas/<i>`` dataset per meta column;
    - ``metadata/<key>``: one string dataset per entry of ``Table.attributes``.
    """
    EXTENSIONS = ('.hdf5',)
    DESCRIPTION = 'Orange on-disk data'
    SUPPORT_COMPRESSED = False
    SUPPORT_SPARSE_DATA = False

    # Name of the HDF5 attribute, attached to every ``metadata/<key>``
    # dataset, that records how the stored text was produced.
    _ENCODING_ATTR = "encoding"

    # Unpickling can fail in more ways than UnpicklingError (for instance when
    # the pickled class can no longer be imported); ValueError also covers
    # text that is not valid hex.
    _UNPICKLE_ERRORS = (pickle.UnpicklingError, ValueError, AttributeError,
                        EOFError, ImportError, IndexError)

    def read(self):
        """Read ``self.filename`` and return its content as a `Table`.

        Raises:
            AssertionError: the file carries a ``creator`` attribute other
                than "Orange", or has neither that attribute nor a ``domain``
                group.
            ValueError: a stored type code matches none of the supported
                variable classes.
        """
        variable_classes = (ContinuousVariable, DiscreteVariable,
                            StringVariable, TimeVariable)

        def read_domain(sub):
            # Yield (name, type code, constructor arguments) for one part
            # of the domain; a missing ``<sub>_args`` dataset means "no
            # extra arguments" for every variable.
            d = f['domain']
            subdomain = d[sub].asstr() if sub in d else []
            subdomain_args = d[f'{sub}_args'].asstr() \
                if f'{sub}_args' in d else ['{}'] * len(subdomain)
            for attr, args in zip(subdomain, subdomain_args):
                yield attr[0], attr[1], json.loads(args)

        def make_var(name, header, args):
            var_cls = next((candidate for candidate in variable_classes
                            if header in candidate.TYPE_HEADERS), None)
            if var_cls is None:
                raise ValueError(
                    f"Unknown variable type {header!r} for variable {name!r}")
            # "attributes" is not a constructor argument; it is assigned
            # to the variable after construction.
            kwargs = {key: val for key, val in args.items()
                      if key != "attributes"}
            new_var = var_cls(name, **kwargs)
            new_var.attributes = args.get("attributes", {})
            return new_var

        def read_hdf5(name, as_str=False):
            # Return the dataset as an in-memory array, or None if absent.
            if name not in f:
                return None
            dataset = f[name]
            return dataset.asstr()[:] if as_str else dataset[()]

        with h5py.File(self.filename, "r") as f:
            # Explicit raises instead of ``assert`` so that the check is not
            # stripped under ``python -O``. The exception type is kept so
            # callers that catch AssertionError keep working.
            if 'creator' in f.attrs:
                if f.attrs['creator'] != "Orange":
                    raise AssertionError(
                        f"{self.filename}: not an Orange HDF5 file")
            elif 'domain' not in f:
                raise AssertionError(
                    f"{self.filename}: HDF5 file contains no 'domain' group")

            domain = Domain(*[[make_var(*args) for args in read_domain(subdomain)]
                              for subdomain in ('attributes', 'class_vars', 'metas')])

            X = read_hdf5("X")
            Y = read_hdf5("Y")

            meta_columns = [read_hdf5(f'metas/{i}', isinstance(attr, StringVariable))
                            for i, attr in enumerate(domain.metas)]
            # `write_file` writes no meta datasets when the metas array is
            # empty, so columns can legitimately be missing.
            if meta_columns and all(column is not None for column in meta_columns):
                metas = np.hstack(meta_columns)
            else:
                metas = None

            table = Table.from_numpy(domain, X, Y, metas)

        if isinstance(self.filename, str):
            table.name = path.splitext(path.split(self.filename)[-1])[0]
        self.set_table_metadata(self.filename, table)
        return table

    @classmethod
    def write_file(cls, filename, data):
        """Write the table `data` to `filename`, replacing any existing file.

        Numeric meta columns are stored as 64-bit floats; string meta
        columns are stored as variable-length strings with missing values
        written as empty strings.
        """
        def parse(attr):
            # (name, short type code, constructor arguments) for a variable.
            args = {"attributes": attr.attributes}
            # TimeVariable is tested before ContinuousVariable, as in the
            # order of these branches, so it gets its own arguments.
            if isinstance(attr, DiscreteVariable):
                args.update(values=attr.values)
            elif isinstance(attr, TimeVariable):
                args.update(have_date=attr.have_date,
                            have_time=attr.have_time)
            elif isinstance(attr, ContinuousVariable):
                args.update(number_of_decimals=attr.number_of_decimals)
            return attr.name, attr.TYPE_HEADERS[1], args

        with h5py.File(filename, 'w') as f:
            f.attrs['creator'] = "Orange"
            f.attrs['Orange_version'] = ORANGE_VERSION
            f.attrs['HDF5_Version'] = h5py.version.hdf5_version
            f.attrs['h5py_version'] = h5py.version.version
            str_dtype = h5py.string_dtype()
            for subdomain in ('attributes', 'class_vars', 'metas'):
                parsed = [parse(feature) for feature in getattr(data.domain, subdomain)]
                domain = np.array([[name, header] for name, header, _ in parsed],
                                  dtype=str_dtype)
                domain_args = np.array([json.dumps(args) for *_, args in parsed],
                                       dtype=str_dtype)
                f.create_dataset(f'domain/{subdomain}', data=domain)
                f.create_dataset(f'domain/{subdomain}_args', data=domain_args)
            f.create_dataset("X", data=data.X)
            if data.Y.size:
                f.create_dataset("Y", data=data.Y)
            if data.metas.size:
                for i, attr in enumerate(data.domain.metas):
                    column = data.metas[:, [i]]
                    if isinstance(attr, StringVariable):
                        # astype() copies, so the table itself is untouched.
                        column = column.astype(str_dtype)
                        column[pd.isnull(column)] = ""
                        f.create_dataset(f'metas/{i}', data=column, dtype=str_dtype)
                    else:
                        # float64, not float32: single precision cannot hold
                        # e.g. epoch timestamps without rounding.
                        f.create_dataset(f'metas/{i}',
                                         data=column.astype(np.float64),
                                         dtype=np.float64)
        cls.write_table_metadata(filename, data)

    @classmethod
    def write_table_metadata(cls, filename, data):
        """Store ``data.attributes`` in the ``metadata`` group of `filename`.

        Strings are stored verbatim, other values as JSON, and values that
        JSON cannot represent as hex-encoded pickles. The encoding used is
        recorded in an attribute of each dataset so that reading does not
        have to guess it from the text.
        """
        encoded = {}
        for key, value in data.attributes.items():
            if isinstance(value, str):
                encoded[key] = ("str", value)
                continue
            try:
                encoded[key] = ("json", json.dumps(value))
            except (TypeError, ValueError):
                # Not JSON serializable (TypeError) or self-referencing
                # (ValueError): fall back to pickle.
                pickled = pickle.dumps(value, protocol=PICKLE_PROTOCOL)
                encoded[key] = ("pickle", pickled.hex())

        with h5py.File(filename, 'r+') as f:
            metadata_group = f.require_group('metadata')
            str_dtype = h5py.string_dtype()
            for key, (encoding, text) in encoded.items():
                if key in metadata_group:
                    # create_dataset refuses to overwrite an existing entry.
                    del metadata_group[key]
                dataset = metadata_group.create_dataset(key, data=text, dtype=str_dtype)
                dataset.attrs[cls._ENCODING_ATTR] = encoding

    @classmethod
    def set_table_metadata(cls, filename, data):
        """Fill ``data.attributes`` from the ``metadata`` group of `filename`.

        If the file has no ``metadata`` group, the inherited implementation
        is used instead.
        """
        with h5py.File(filename, 'r') as f:
            if 'metadata' not in f:
                super().set_table_metadata(filename, data)
                return
            metadata_group = f['metadata']
            for key in metadata_group:
                dataset = metadata_group[key]
                value = dataset[()]
                if isinstance(value, bytes):
                    value = value.decode('utf-8')
                if isinstance(value, str):
                    encoding = dataset.attrs.get(cls._ENCODING_ATTR)
                    if isinstance(encoding, bytes):
                        encoding = encoding.decode('utf-8')
                    value = cls._decode_metadata(value, encoding)
                data.attributes[key] = value

    @classmethod
    def _decode_metadata(cls, text, encoding=None):
        """Turn stored metadata text back into a value.

        `encoding` is the value of the dataset's encoding attribute ("str",
        "json" or "pickle"). For datasets without that attribute the
        encoding is inferred from the text: a leading ``{`` or ``[`` is tried
        as JSON and a leading pickle header (in hex) as a pickle. Text that
        cannot be decoded is returned unchanged.
        """
        if encoding == "str":
            return text
        if encoding is None or encoding not in ("json", "pickle"):
            if text.startswith(('{', '[')):
                encoding = "json"
            elif text.startswith(f"80{PICKLE_PROTOCOL:02x}"):
                encoding = "pickle"
            else:
                return text

        if encoding == "json":
            try:
                return json.loads(text)
            except json.JSONDecodeError:
                return text

        # NOTE(security): pickle.loads can execute arbitrary code, so opening
        # an HDF5 file from an untrusted source is unsafe. Kept because
        # pickled metadata is part of the format; review before exposing this
        # reader to untrusted input.
        try:
            return pickle.loads(bytes.fromhex(text))
        except cls._UNPICKLE_ERRORS:
            return text
# Orange/data/tests/test_io.py: HDF5 round-trip and table-metadata tests.
#
# ``unittest``, ``os``, ``np``, ``NamedTemporaryFile`` and ``TestWriters`` are
# provided by a part of test_io.py that lies outside this excerpt.
# The same patch also adds ``h5py`` ("HDF5 binary data format") to
# requirements-core.txt.
from Orange.data import ContinuousVariable, DiscreteVariable, StringVariable, \
    TimeVariable, Domain, Table
from Orange.data.io import TabReader, ExcelReader, HDF5Reader
from Orange.data.io_base import PICKLE_PROTOCOL
from Orange.data.io_util import guess_data_type
from Orange.misc.collections import natural_sorted
from Orange.tests import named_file


def _remove_if_exists(filename):
    """Delete `filename`; a file that was never created is not an error."""
    try:
        os.remove(filename)
    except FileNotFoundError:
        pass


class TestHDF5Roundtrip(unittest.TestCase):
    """Write a table to HDF5 and read it back.

    NOTE(review): in the patch this test is added next to
    ``test_roundtrip_xlsx``, presumably inside ``TestWriters``. That class
    statement is not visible here, so the test borrows its fixture the same
    way ``TestWriterMetadata`` does. Move it back into that class when merging.
    """

    def setUp(self):
        TestWriters.setUp(self)

    def test_roundtrip_hdf5(self):
        with named_file('', suffix='.hdf5') as fn:
            HDF5Reader.write(fn, self.data)
            data = HDF5Reader(fn).read()
            np.testing.assert_equal(data.X, self.data.X)
            np.testing.assert_equal(data.Y, self.data.Y)
            np.testing.assert_equal(data.metas[:2], self.data.metas[:2])
            # A missing string meta is written as an empty string.
            self.assertEqual(data.metas[2, 0], "")
            np.testing.assert_equal(data.domain, self.data.domain)


class Unserializable:
    """A value that ``json.dumps`` rejects, forcing the pickle fallback."""

    def __init__(self, name):
        self.name = name


class TestWriterMetadata(unittest.TestCase):
    """Table attributes must survive a write/read cycle."""

    def setUp(self):
        TestWriters.setUp(self)
        self.data.attributes.update({
            "Name": "Test dataset",
            "Description": "This is a test dataset.",
            "Author": "Unit Tester",
            "Year": "2024",
            "Reference": "None"
        })

    def _temp_file(self, suffix, sidecars=()):
        """Return the name of a fresh temporary file.

        The file and any `sidecars` (suffixes appended to its name) are
        removed after the test. Cleanup tolerates files that were never
        written, so a failing write is reported as itself and not as a
        FileNotFoundError raised during cleanup.
        """
        with NamedTemporaryFile(suffix=suffix, delete=False) as f:
            fname = f.name
        self.addCleanup(_remove_if_exists, fname)
        for extension in sidecars:
            self.addCleanup(_remove_if_exists, fname + extension)
        return fname

    def test_metadata_string(self):
        fname = self._temp_file(".tab", sidecars=(".metadata",))
        TabReader.write(fname, self.data)
        with open(fname + ".metadata", encoding="utf-8") as f:
            content = f.read()
        self.assertIn("Name: Test dataset", content)
        self.assertIn("Description: This is a test dataset.", content)
        self.assertIn("Author: Unit Tester", content)
        self.assertIn("Year: 2024", content)
        self.assertIn("Reference: None", content)
        table = TabReader(fname).read()
        self.assertEqual(table.attributes, self.data.attributes)

    def test_metadata_pickle(self):
        data = self.data.copy()
        data.attributes["CustomAttr"] = {"key1": "value1", "key2": 2}
        fname = self._temp_file(".tab", sidecars=(".metadata",))
        TabReader.write(fname, data)
        with open(fname + ".metadata", 'rb') as f:
            header = f.read(2)
        # A pickle stream starts with the PROTO opcode and the protocol number.
        self.assertEqual(header, b'\x80' + bytes([PICKLE_PROTOCOL]))
        table = TabReader(fname).read()
        self.assertEqual(table.attributes, data.attributes)

    def test_metadata_hdf5(self):
        data = self.data.copy()
        data.attributes["CustomAttr"] = {"key1": "value1", "key2": 2}
        fname = self._temp_file(".hdf5")
        HDF5Reader.write(fname, data)
        table = HDF5Reader(fname).read()
        self.assertEqual(table.attributes, data.attributes)

    def test_metadata_hdf5_pickle(self):
        data = self.data.copy()
        data.attributes["Unserializable"] = Unserializable(name="test")
        fname = self._temp_file(".hdf5")
        HDF5Reader.write(fname, data)
        table = HDF5Reader(fname).read()
        self.assertEqual(set(table.attributes), set(data.attributes))
        for key, value in table.attributes.items():
            if isinstance(value, Unserializable):
                self.assertIsInstance(data.attributes[key], Unserializable)
                # The object's state must be restored, not only its type.
                self.assertEqual(value.name, data.attributes[key].name)
            else:
                self.assertEqual(value, data.attributes[key])


if __name__ == "__main__":
    unittest.main()