From 7c39a761def86db0bbffb02206786a6505d4f9bd Mon Sep 17 00:00:00 2001
From: veenstrajelmer <veenstrajelmer@gmail.com>
Date: Wed, 22 Apr 2026 12:25:57 +0200
Subject: [PATCH 1/7] enforce conversion of strings to char arrays in
 ddlpy.dataframe_to_xarray()

---
 ddlpy/utils.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/ddlpy/utils.py b/ddlpy/utils.py
index 3166186..a8b9b36 100644
--- a/ddlpy/utils.py
+++ b/ddlpy/utils.py
@@ -127,4 +127,10 @@ def dataframe_to_xarray(df: pd.DataFrame, always_preserve=[]):
             omschrijving_vars.append(varn)
     ds = ds.drop_vars(omschrijving_vars)
 
+    # enforce char arrays to reduce filesize for strings with engine netcdf4/h5netcdf
+    # char arrays are used per default with engine scipy/netcdf4_classic
+    for var in ds.data_vars:
+        if ds[var].dtype.kind == "O":
+            ds[var].encoding = {"dtype": "S1"}
+
     return ds

From 6c835dead10eedc74e7c83b7fbfc56796b216c4b Mon Sep 17 00:00:00 2001
From: veenstrajelmer <veenstrajelmer@gmail.com>
Date: Wed, 22 Apr 2026 12:26:51 +0200
Subject: [PATCH 2/7] added scipy engine to tests

---
 tests/test_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index 73c3af6..1b93df5 100755
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -169,7 +169,7 @@ def test_dataframe_to_xarray_drop_omschrijving(measurements):
     assert ds["MeetApparaat.Code"].attrs == expected_attrs
 
 
-@pytest.mark.parametrize("engine", [None, "h5netcdf", "netcdf4", "netcdf4_classic"])
+@pytest.mark.parametrize("engine", [None, "scipy", "h5netcdf", "netcdf4", "netcdf4_classic"])
 def test_dataframe_to_xarray_to_netcdf(measurements, tmp_path, engine):
     ds_clean = ddlpy.dataframe_to_xarray(
         df=measurements,

From 6b5e66c6bbc629c7769eda8ce4565354dc896fe0 Mon Sep 17 00:00:00 2001
From: veenstrajelmer <veenstrajelmer@gmail.com>
Date: Wed, 22 Apr 2026 12:28:24 +0200
Subject: [PATCH 3/7] updated changelog

---
 HISTORY.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/HISTORY.rst b/HISTORY.rst
index 87e1cb8..0304bf1 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -5,6 +5,7 @@ History
 UNRELEASED
 ----------
 * removed caching again in https://github.com/Deltares/ddlpy/pull/189
+* enforce conversion of strings to char arrays in `ddlpy.dataframe_to_xarray()` in https://github.com/Deltares/ddlpy/pull/194
 
 0.10.0 (2025-12-23)
 -------------------

From 4441ebdd4855904756bcdf59708f46ed962f44bd Mon Sep 17 00:00:00 2001
From: veenstrajelmer <veenstrajelmer@gmail.com>
Date: Wed, 22 Apr 2026 12:31:02 +0200
Subject: [PATCH 4/7] remove outdated mentions of netcdf4_classic

---
 ddlpy/utils.py                               | 8 ++++----
 docs/examples/retrieve_parallel_to_netcdf.py | 4 +---
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/ddlpy/utils.py b/ddlpy/utils.py
index a8b9b36..02617a3 100644
--- a/ddlpy/utils.py
+++ b/ddlpy/utils.py
@@ -96,10 +96,10 @@ def dataframe_to_xarray(df: pd.DataFrame, always_preserve=[]):
 
     Furthermore, all ".Omschrijving" variables are dropped and the information is added
     as attributes to the Code variables.
-
-    When writing the dataset to disk with ds.to_netcdf() it is recommended to use
-    `format="NETCDF3_CLASSIC"` or `format="NETCDF4_CLASSIC"` since this automatically
-    converts variables of dtype <U to |S which saves a lot of disk space for DDL data.
+    
+    Lastly, all string variables are converted to char arrays to save space when writing
+    the netcdf with engines netcdf4/h5netcdf. Char arrays are used per default with
+    engine scipy or engine netcdf4 with format="NETCDF4_CLASSIC".
     """
 
     df_simple = simplify_dataframe(df, always_preserve=always_preserve)
diff --git a/docs/examples/retrieve_parallel_to_netcdf.py b/docs/examples/retrieve_parallel_to_netcdf.py
index 34cde0c..a4ce448 100644
--- a/docs/examples/retrieve_parallel_to_netcdf.py
+++ b/docs/examples/retrieve_parallel_to_netcdf.py
@@ -45,9 +45,7 @@ def get_data(location, start_date, end_date, dir_output, overwrite=True):
     ]
     ds = ddlpy.dataframe_to_xarray(measurements, always_preserve=always_preserve)
 
-    # write to netcdf file. NETCDF3_CLASSIC or NETCDF4_CLASSIC automatically converts
-    # variables of dtype <U to |S which saves a lot of disk space
-    ds.to_netcdf(filename, format="NETCDF4_CLASSIC")
+    ds.to_netcdf(filename)
 
 
 if __name__ == "__main__":

From ca23f8fafd57698ea01756fb93dafe882a5706c2 Mon Sep 17 00:00:00 2001
From: veenstrajelmer <veenstrajelmer@gmail.com>
Date: Wed, 22 Apr 2026 12:38:37 +0200
Subject: [PATCH 5/7] added maxlen to make sure the strings are completely
 included

---
 ddlpy/utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ddlpy/utils.py b/ddlpy/utils.py
index 02617a3..4532c06 100644
--- a/ddlpy/utils.py
+++ b/ddlpy/utils.py
@@ -96,7 +96,7 @@ def dataframe_to_xarray(df: pd.DataFrame, always_preserve=[]):
 
     Furthermore, all ".Omschrijving" variables are dropped and the information is added
     as attributes to the Code variables.
-    
+
     Lastly, all string variables are converted to char arrays to save space when writing
     the netcdf with engines netcdf4/h5netcdf. Char arrays are used per default with
     engine scipy or engine netcdf4 with format="NETCDF4_CLASSIC".
@@ -131,6 +131,7 @@ def dataframe_to_xarray(df: pd.DataFrame, always_preserve=[]):
     # char arrays are used per default with engine scipy/netcdf4_classic
     for var in ds.data_vars:
         if ds[var].dtype.kind == "O":
-            ds[var].encoding = {"dtype": "S1"}
+            maxlen = int(ds[var].str.len().max())
+            ds[var].encoding = {"dtype": f"S{maxlen}"}
 
     return ds

From c2f9d3dfdcb64c7efa64d55fa7e715de2897f31b Mon Sep 17 00:00:00 2001
From: veenstrajelmer <veenstrajelmer@gmail.com>
Date: Wed, 22 Apr 2026 12:41:35 +0200
Subject: [PATCH 6/7] added scipy to the netcdf dependency group

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index b7ee1be..02b750c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -72,6 +72,7 @@ examples = [
 ]
 netcdf = [
 	"xarray",
+	"scipy",
 	"h5netcdf[h5py]",
 	"netcdf4",
 ]

From 8105e8dfc58a7cfc6030188e4a01a9c8495b0e8b Mon Sep 17 00:00:00 2001
From: veenstrajelmer <veenstrajelmer@gmail.com>
Date: Wed, 22 Apr 2026 13:21:24 +0200
Subject: [PATCH 7/7] update docstring

---
 ddlpy/utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ddlpy/utils.py b/ddlpy/utils.py
index 4532c06..d22e2cd 100644
--- a/ddlpy/utils.py
+++ b/ddlpy/utils.py
@@ -97,9 +97,9 @@ def dataframe_to_xarray(df: pd.DataFrame, always_preserve=[]):
     Furthermore, all ".Omschrijving" variables are dropped and the information is added
     as attributes to the Code variables.
 
-    Lastly, all string variables are converted to char arrays to save space when writing
-    the netcdf with engines netcdf4/h5netcdf. Char arrays are used per default with
-    engine scipy or engine netcdf4 with format="NETCDF4_CLASSIC".
+    Lastly, all string variables are encoded as char arrays to minimize file size when
+    writing the netcdf with engine="netcdf4" or engine="h5netcdf". Char arrays are
+    always used with engine="scipy" or engine="netcdf4" with format="NETCDF4_CLASSIC".
     """
 
     df_simple = simplify_dataframe(df, always_preserve=always_preserve)