From 7c39a761def86db0bbffb02206786a6505d4f9bd Mon Sep 17 00:00:00 2001 From: veenstrajelmer Date: Wed, 22 Apr 2026 12:25:57 +0200 Subject: [PATCH 1/7] enforce conversion of strings to char arrays in ddlpy.dataframe_to_xarray() --- ddlpy/utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ddlpy/utils.py b/ddlpy/utils.py index 3166186..a8b9b36 100644 --- a/ddlpy/utils.py +++ b/ddlpy/utils.py @@ -127,4 +127,10 @@ def dataframe_to_xarray(df: pd.DataFrame, always_preserve=[]): omschrijving_vars.append(varn) ds = ds.drop_vars(omschrijving_vars) + # enforce char arrays to reduce filesize for strings with engine netcdf4/h5netcdf + # char arrays are used per default with engine scipy/netcdf4_classic + for var in ds.data_vars: + if ds[var].dtype.kind == "O": + ds[var].encoding = {"dtype": "S1"} + return ds From 6c835dead10eedc74e7c83b7fbfc56796b216c4b Mon Sep 17 00:00:00 2001 From: veenstrajelmer Date: Wed, 22 Apr 2026 12:26:51 +0200 Subject: [PATCH 2/7] added scipy engine to tests --- tests/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 73c3af6..1b93df5 100755 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -169,7 +169,7 @@ def test_dataframe_to_xarray_drop_omschrijving(measurements): assert ds["MeetApparaat.Code"].attrs == expected_attrs -@pytest.mark.parametrize("engine", [None, "h5netcdf", "netcdf4", "netcdf4_classic"]) +@pytest.mark.parametrize("engine", [None, "scipy", "h5netcdf", "netcdf4", "netcdf4_classic"]) def test_dataframe_to_xarray_to_netcdf(measurements, tmp_path, engine): ds_clean = ddlpy.dataframe_to_xarray( df=measurements, From 6b5e66c6bbc629c7769eda8ce4565354dc896fe0 Mon Sep 17 00:00:00 2001 From: veenstrajelmer Date: Wed, 22 Apr 2026 12:28:24 +0200 Subject: [PATCH 3/7] updated changelog --- HISTORY.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/HISTORY.rst b/HISTORY.rst index 87e1cb8..0304bf1 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -5,6 +5,7 @@ History UNRELEASED ---------- * removed caching again in https://github.com/Deltares/ddlpy/pull/189 +* enforce conversion of strings to char arrays in `ddlpy.dataframe_to_xarray()` in https://github.com/Deltares/ddlpy/pull/194 0.10.0 (2025-12-23) ------------------- From 4441ebdd4855904756bcdf59708f46ed962f44bd Mon Sep 17 00:00:00 2001 From: veenstrajelmer Date: Wed, 22 Apr 2026 12:31:02 +0200 Subject: [PATCH 4/7] remove outdated mentionings of netcdf4_classic --- ddlpy/utils.py | 8 ++++---- docs/examples/retrieve_parallel_to_netcdf.py | 4 +--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/ddlpy/utils.py b/ddlpy/utils.py index a8b9b36..02617a3 100644 --- a/ddlpy/utils.py +++ b/ddlpy/utils.py @@ -96,10 +96,10 @@ def dataframe_to_xarray(df: pd.DataFrame, always_preserve=[]): Furthermore, all ".Omschrijving" variables are dropped and the information is added as attributes to the Code variables. - - When writing the dataset to disk with ds.to_netcdf() it is recommended to use - `format="NETCDF3_CLASSIC"` or `format="NETCDF4_CLASSIC"` since this automatically - converts variables of dtype Date: Wed, 22 Apr 2026 12:38:37 +0200 Subject: [PATCH 5/7] added maxlen to make sure the strings are completely included --- ddlpy/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ddlpy/utils.py b/ddlpy/utils.py index 02617a3..4532c06 100644 --- a/ddlpy/utils.py +++ b/ddlpy/utils.py @@ -96,7 +96,7 @@ def dataframe_to_xarray(df: pd.DataFrame, always_preserve=[]): Furthermore, all ".Omschrijving" variables are dropped and the information is added as attributes to the Code variables. - + Lastly, all string variables are converted to char arrays to save space when writing the netcdf with engines netcdf4/h5netcdf. Char arrays are used per default with engine scipy or engine netcdf4 with format="NETCDF4_CLASSIC". @@ -131,6 +131,7 @@ def dataframe_to_xarray(df: pd.DataFrame, always_preserve=[]): # char arrays are used per default with engine scipy/netcdf4_classic for var in ds.data_vars: if ds[var].dtype.kind == "O": - ds[var].encoding = {"dtype": "S1"} + maxlen = int(ds[var].str.len().max()) + ds[var].encoding = {"dtype": f"S{maxlen}"} return ds From c2f9d3dfdcb64c7efa64d55fa7e715de2897f31b Mon Sep 17 00:00:00 2001 From: veenstrajelmer Date: Wed, 22 Apr 2026 12:41:35 +0200 Subject: [PATCH 6/7] added scipy --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index b7ee1be..02b750c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,6 +72,7 @@ examples = [ ] netcdf = [ "xarray", + "scipy", "h5netcdf[h5py]", "netcdf4", ] From 8105e8dfc58a7cfc6030188e4a01a9c8495b0e8b Mon Sep 17 00:00:00 2001 From: veenstrajelmer Date: Wed, 22 Apr 2026 13:21:24 +0200 Subject: [PATCH 7/7] update docstring --- ddlpy/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ddlpy/utils.py b/ddlpy/utils.py index 4532c06..d22e2cd 100644 --- a/ddlpy/utils.py +++ b/ddlpy/utils.py @@ -97,9 +97,9 @@ def dataframe_to_xarray(df: pd.DataFrame, always_preserve=[]): Furthermore, all ".Omschrijving" variables are dropped and the information is added as attributes to the Code variables. - Lastly, all string variables are converted to char arrays to save space when writing - the netcdf with engines netcdf4/h5netcdf. Char arrays are used per default with - engine scipy or engine netcdf4 with format="NETCDF4_CLASSIC". + Lastly, all string variables are converted to char arrays to minimize filesizes when + writing the netcdf with engine="netcdf4" or engine="h5netcdf". Char arrays are + always used with engine="scipy" or engine="netcdf4" with format="NETCDF4_CLASSIC". """ df_simple = simplify_dataframe(df, always_preserve=always_preserve)