Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ History
UNRELEASED
----------
* removed caching again in https://github.com/Deltares/ddlpy/pull/189
* enforce conversion of strings to char arrays in `ddlpy.dataframe_to_xarray()` in https://github.com/Deltares/ddlpy/pull/194

0.10.0 (2025-12-23)
-------------------
Expand Down
13 changes: 10 additions & 3 deletions ddlpy/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,9 @@ def dataframe_to_xarray(df: pd.DataFrame, always_preserve=[]):
Furthermore, all ".Omschrijving" variables are dropped and the information is added
as attributes to the Code variables.

When writing the dataset to disk with ds.to_netcdf() it is recommended to use
`format="NETCDF3_CLASSIC"` or `format="NETCDF4_CLASSIC"` since this automatically
converts variables of dtype <U to |S which saves a lot of disk space for DDL data.
Lastly, all string variables are encoded as fixed-width char arrays (via their
`encoding`) to minimize the file size when writing the dataset to netCDF with
engine="netcdf4" or engine="h5netcdf". With engine="scipy", or with
engine="netcdf4" combined with format="NETCDF4_CLASSIC", char arrays are always used.
"""

df_simple = simplify_dataframe(df, always_preserve=always_preserve)
Expand Down Expand Up @@ -127,4 +127,11 @@ def dataframe_to_xarray(df: pd.DataFrame, always_preserve=[]):
omschrijving_vars.append(varn)
ds = ds.drop_vars(omschrijving_vars)

# enforce char arrays to reduce the file size for strings with engine netcdf4/h5netcdf
# char arrays are used by default with engine scipy/netcdf4_classic
for var in ds.data_vars:
if ds[var].dtype.kind == "O":
maxlen = int(ds[var].str.len().max())
ds[var].encoding = {"dtype": f"S{maxlen}"}

return ds
4 changes: 1 addition & 3 deletions docs/examples/retrieve_parallel_to_netcdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,7 @@ def get_data(location, start_date, end_date, dir_output, overwrite=True):
]
ds = ddlpy.dataframe_to_xarray(measurements, always_preserve=always_preserve)

# write to netcdf file. NETCDF3_CLASSIC or NETCDF4_CLASSIC automatically converts
# variables of dtype <U to |S which saves a lot of disk space
ds.to_netcdf(filename, format="NETCDF4_CLASSIC")
ds.to_netcdf(filename)


if __name__ == "__main__":
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ examples = [
]
netcdf = [
"xarray",
"scipy",
"h5netcdf[h5py]",
"netcdf4",
]
Expand Down
2 changes: 1 addition & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def test_dataframe_to_xarray_drop_omschrijving(measurements):
assert ds["MeetApparaat.Code"].attrs == expected_attrs


@pytest.mark.parametrize("engine", [None, "h5netcdf", "netcdf4", "netcdf4_classic"])
@pytest.mark.parametrize("engine", [None, "scipy", "h5netcdf", "netcdf4", "netcdf4_classic"])
def test_dataframe_to_xarray_to_netcdf(measurements, tmp_path, engine):
ds_clean = ddlpy.dataframe_to_xarray(
df=measurements,
Expand Down
Loading