Deltares · veenstrajelmer · Apr 22, 2026 · Apr 22, 2026 · Apr 22, 2026 · Apr 22, 2026
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -5,6 +5,7 @@ History
 UNRELEASED
 ----------
 * removed caching again in https://github.com/Deltares/ddlpy/pull/189
+* enforce conversion of strings to char arrays in `ddlpy.dataframe_to_xarray()` in https://github.com/Deltares/ddlpy/pull/194
 
 0.10.0 (2025-12-23)
 -------------------

diff --git a/ddlpy/utils.py b/ddlpy/utils.py
@@ -97,9 +97,9 @@ def dataframe_to_xarray(df: pd.DataFrame, always_preserve=[]):
     Furthermore, all ".Omschrijving" variables are dropped and the information is added
     as attributes to the Code variables.
 
-    When writing the dataset to disk with ds.to_netcdf() it is recommended to use
-    `format="NETCDF3_CLASSIC"` or `format="NETCDF4_CLASSIC"` since this automatically
-    converts variables of dtype <U to |S which saves a lot of disk space for DDL data.
+    Lastly, the encoding of all string variables is set to fixed-width char arrays to
+    minimize file sizes when writing to netCDF with engine="netcdf4" or engine="h5netcdf".
+    Char arrays are always used with engine="scipy", and with engine="netcdf4" when
+    combined with format="NETCDF4_CLASSIC".
     """
 
     df_simple = simplify_dataframe(df, always_preserve=always_preserve)
@@ -127,4 +127,11 @@ def dataframe_to_xarray(df: pd.DataFrame, always_preserve=[]):
             omschrijving_vars.append(varn)
     ds = ds.drop_vars(omschrijving_vars)
 
+    # enforce char arrays to reduce the file size for strings with engine netcdf4/h5netcdf
+    # char arrays are used by default with engine scipy/netcdf4_classic
+    for var in ds.data_vars:
+        if ds[var].dtype.kind == "O":
+            maxlen = int(ds[var].str.len().max())
+            ds[var].encoding = {"dtype": f"S{maxlen}"}
+
     return ds
diff --git a/docs/examples/retrieve_parallel_to_netcdf.py b/docs/examples/retrieve_parallel_to_netcdf.py
@@ -45,9 +45,7 @@ def get_data(location, start_date, end_date, dir_output, overwrite=True):
     ]
     ds = ddlpy.dataframe_to_xarray(measurements, always_preserve=always_preserve)
 
-    # write to netcdf file. NETCDF3_CLASSIC or NETCDF4_CLASSIC automatically converts
-    # variables of dtype <U to |S which saves a lot of disk space
-    ds.to_netcdf(filename, format="NETCDF4_CLASSIC")
+    ds.to_netcdf(filename)
 
 
 if __name__ == "__main__":

diff --git a/pyproject.toml b/pyproject.toml
@@ -72,6 +72,7 @@ examples = [
 ]
 netcdf = [
 	"xarray",
+	"scipy",
 	"h5netcdf[h5py]",
 	"netcdf4",
 ]

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -169,7 +169,7 @@ def test_dataframe_to_xarray_drop_omschrijving(measurements):
     assert ds["MeetApparaat.Code"].attrs == expected_attrs
 
 
-@pytest.mark.parametrize("engine", [None, "h5netcdf", "netcdf4", "netcdf4_classic"])
+@pytest.mark.parametrize("engine", [None, "scipy", "h5netcdf", "netcdf4", "netcdf4_classic"])
 def test_dataframe_to_xarray_to_netcdf(measurements, tmp_path, engine):
     ds_clean = ddlpy.dataframe_to_xarray(
         df=measurements,