From 56d61adc81be1627888ef81f74531dfee8c594f7 Mon Sep 17 00:00:00 2001 From: Silas Date: Thu, 19 Mar 2026 08:53:32 +0100 Subject: [PATCH 1/5] Test that we can handle target csv with nonstandard delimiters --- tests/mockup.py | 16 ++++++++++++ tests/test-data/targets/comma-separated.csv | 25 +++++++++++++++++++ .../targets/semi-colon-separated.csv | 25 +++++++++++++++++++ tests/test-data/targets/tab-seperated.csv | 25 +++++++++++++++++++ tests/test_scalar_objective.py | 18 +++++++++++++ 5 files changed, 109 insertions(+) create mode 100644 tests/test-data/targets/comma-separated.csv create mode 100644 tests/test-data/targets/semi-colon-separated.csv create mode 100644 tests/test-data/targets/tab-seperated.csv create mode 100644 tests/test_scalar_objective.py diff --git a/tests/mockup.py b/tests/mockup.py index 56e78cf..71d3fc8 100644 --- a/tests/mockup.py +++ b/tests/mockup.py @@ -39,3 +39,19 @@ def __init__(self, parameters, objective_fn): def __call__(self, parameter_values): named_parameters = { p.name : value for p, value in zip(self.parameters, parameter_values) } return { 'mock' : self.objective_fn(**named_parameters) } + +class MockDataExtractor: + '''Mock data extractor returning data it was constructed with''' + def __init__(self, data): + self.data = data + + def __call__(self, daisy_output_directory): + return self.data + +class MockLoss: + '''Mock loss that always returns a specified value''' + def __init__(self, value): + self.value = value + + def __call__(self, actial, target): + return self.value diff --git a/tests/test-data/targets/comma-separated.csv b/tests/test-data/targets/comma-separated.csv new file mode 100644 index 0000000..ab7cd0c --- /dev/null +++ b/tests/test-data/targets/comma-separated.csv @@ -0,0 +1,25 @@ +time,NO3 +1000-01-01 01:00:00,0.00003816 +1000-01-01 02:00:00,0.0000343443 +1000-01-01 03:00:00,0.0000309103 +1000-01-01 04:00:00,0.0000278197 +1000-01-01 05:00:00,0.0000250382 +1000-01-01 06:00:00,0.0000225348 +1000-01-01 
07:00:00,0.0000202819 +1000-01-01 08:00:00,0.0000182542 +1000-01-01 09:00:00,0.0000164294 +1000-01-01 10:00:00,0.0000147871 +1000-01-01 11:00:00,0.0000133091 +1000-01-01 12:00:00,0.0000119789 +1000-01-01 13:00:00,0.0000107818 +1000-01-01 14:00:00,0.00000970454 +1000-01-01 15:00:00,0.00000873506 +1000-01-01 16:00:00,0.00000786261 +1000-01-01 17:00:00,0.0000070775 +1000-01-01 18:00:00,0.00000637099 +1000-01-01 19:00:00,0.00000573523 +1000-01-01 20:00:00,0.00000516315 +1000-01-01 21:00:00,0.00000464837 +1000-01-01 22:00:00,0.00000418517 +1000-01-01 23:00:00,0.00000376839 +1000-01-02 00:00:00,0.0000033934 \ No newline at end of file diff --git a/tests/test-data/targets/semi-colon-separated.csv b/tests/test-data/targets/semi-colon-separated.csv new file mode 100644 index 0000000..75294dd --- /dev/null +++ b/tests/test-data/targets/semi-colon-separated.csv @@ -0,0 +1,25 @@ +time;NO3 +1000-01-01 01:00:00;0.00003816 +1000-01-01 02:00:00;0.0000343443 +1000-01-01 03:00:00;0.0000309103 +1000-01-01 04:00:00;0.0000278197 +1000-01-01 05:00:00;0.0000250382 +1000-01-01 06:00:00;0.0000225348 +1000-01-01 07:00:00;0.0000202819 +1000-01-01 08:00:00;0.0000182542 +1000-01-01 09:00:00;0.0000164294 +1000-01-01 10:00:00;0.0000147871 +1000-01-01 11:00:00;0.0000133091 +1000-01-01 12:00:00;0.0000119789 +1000-01-01 13:00:00;0.0000107818 +1000-01-01 14:00:00;0.00000970454 +1000-01-01 15:00:00;0.00000873506 +1000-01-01 16:00:00;0.00000786261 +1000-01-01 17:00:00;0.0000070775 +1000-01-01 18:00:00;0.00000637099 +1000-01-01 19:00:00;0.00000573523 +1000-01-01 20:00:00;0.00000516315 +1000-01-01 21:00:00;0.00000464837 +1000-01-01 22:00:00;0.00000418517 +1000-01-01 23:00:00;0.00000376839 +1000-01-02 00:00:00;0.0000033934 \ No newline at end of file diff --git a/tests/test-data/targets/tab-seperated.csv b/tests/test-data/targets/tab-seperated.csv new file mode 100644 index 0000000..08655f9 --- /dev/null +++ b/tests/test-data/targets/tab-seperated.csv @@ -0,0 +1,25 @@ +time NO3 +1000-01-01 01:00:00 
0.00003816 +1000-01-01 02:00:00 0.0000343443 +1000-01-01 03:00:00 0.0000309103 +1000-01-01 04:00:00 0.0000278197 +1000-01-01 05:00:00 0.0000250382 +1000-01-01 06:00:00 0.0000225348 +1000-01-01 07:00:00 0.0000202819 +1000-01-01 08:00:00 0.0000182542 +1000-01-01 09:00:00 0.0000164294 +1000-01-01 10:00:00 0.0000147871 +1000-01-01 11:00:00 0.0000133091 +1000-01-01 12:00:00 0.0000119789 +1000-01-01 13:00:00 0.0000107818 +1000-01-01 14:00:00 0.00000970454 +1000-01-01 15:00:00 0.00000873506 +1000-01-01 16:00:00 0.00000786261 +1000-01-01 17:00:00 0.0000070775 +1000-01-01 18:00:00 0.00000637099 +1000-01-01 19:00:00 0.00000573523 +1000-01-01 20:00:00 0.00000516315 +1000-01-01 21:00:00 0.00000464837 +1000-01-01 22:00:00 0.00000418517 +1000-01-01 23:00:00 0.00000376839 +1000-01-02 00:00:00 0.0000033934 \ No newline at end of file diff --git a/tests/test_scalar_objective.py b/tests/test_scalar_objective.py new file mode 100644 index 0000000..239515d --- /dev/null +++ b/tests/test_scalar_objective.py @@ -0,0 +1,18 @@ +# pylint: disable=missing-function-docstring +from pathlib import Path +import pandas as pd +from daisypy.optim import ScalarObjective, DlfDataExtractor +from daisypy.optim.loss_fns import mse +from .mockup import MockDataExtractor, MockLoss + +def test_csv_delimiter(): + in_dir = Path(__file__).parent / 'test-data' / 'targets' + expected = pd.read_csv(in_dir / 'comma-separated.csv').rename(columns={"NO3" : "value"}) + expected["time"] = pd.to_datetime(expected["time"]) + data_extractor = MockDataExtractor(expected) + for target_file in in_dir.iterdir(): + if target_file.name.endswith('separated.csv'): + # We use the same extracted data, but change the target each time. 
+ f = ScalarObjective(target_file.name, data_extractor, target_file, "NO3", mse) + result = f(in_dir).pop(target_file.name) + assert result == 0, target_file From ae7923f842f53c73f8c7bfbf363f26f638797cc2 Mon Sep 17 00:00:00 2001 From: Silas Date: Thu, 19 Mar 2026 09:54:02 +0100 Subject: [PATCH 2/5] Benchmarking pandas.read_csv --- benchmarks/benchmark_read_csv.py | 63 ++++++++++++++++++++++++++++++ benchmarks/generate_time_series.py | 25 ++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 benchmarks/benchmark_read_csv.py create mode 100644 benchmarks/generate_time_series.py diff --git a/benchmarks/benchmark_read_csv.py b/benchmarks/benchmark_read_csv.py new file mode 100644 index 0000000..4245263 --- /dev/null +++ b/benchmarks/benchmark_read_csv.py @@ -0,0 +1,63 @@ +import os +import time +from pathlib import Path +import pandas as pd +from generate_time_series import generate_time_series # pylint: disable=import-error + +def char_range(n): + '''Generate a range of characters''' + return [chr(ord('a') + i) for i in range(n)] + +def generate_benchmark_data(): + '''Generate varying sized csv files for benchmarking purposes''' + data_dir = Path(__file__).parent / 'benchmark-data' + os.makedirs(data_dir, exist_ok=True) + print('Generating small', flush=True) + small = generate_time_series('2000-01-01', '2020-01-01', 200, char_range(2)) + small.to_csv(data_dir / 'small.csv') + + print('Generating medium', flush=True) + medium = generate_time_series('2000-01-01', '2020-01-01', 2000, char_range(5)) + medium.to_csv(data_dir / 'medium.csv') + + print('Generating large', flush=True) + large = generate_time_series('2000-01-01', '2020-01-01', 20000, char_range(20)) + large.to_csv(data_dir / 'large.csv') + + print('Generating very large', flush=True) + very_large = generate_time_series('2000-01-01', '2020-01-01', 200000, char_range(50)) + very_large.to_csv(data_dir / 'very-large.csv') + +def benchmark_read_csv(): + '''Compare pandas.read_csv using C engine with 
sep=',' and python engine with sep=None''' + in_dir = Path(__file__).parent / 'benchmark-data' + names = ['small', 'medium', 'large', 'very-large'] + data_files = [ in_dir / f'{name}.csv' for name in names ] + + c_timings = [] + for data_file in data_files: + print(f'Start C {data_file}', flush=True) + start = time.perf_counter() + df = pd.read_csv(data_file, sep=',', engine='c') # pylint: disable=unused-variable + elapsed = time.perf_counter() - start + c_timings.append(elapsed) + + py_timings = [] + for data_file in data_files: + print(f'Start python {data_file}', flush=True) + start = time.perf_counter() + df = pd.read_csv(data_file, sep=None, engine='python') # pylint: disable=unused-variable + elapsed = time.perf_counter() - start + py_timings.append(elapsed) + + + results = pd.DataFrame({'name' : names, 'C' : c_timings, 'python' : py_timings}) + results['python/C'] = results['python'] / results['C'] + print(results) + + +if __name__ == '__main__': + #print('Generating data') + #generate_benchmark_data() + print('Running benchmarks') + benchmark_read_csv() diff --git a/benchmarks/generate_time_series.py b/benchmarks/generate_time_series.py new file mode 100644 index 0000000..dacce10 --- /dev/null +++ b/benchmarks/generate_time_series.py @@ -0,0 +1,25 @@ +import pandas as pd +import numpy as np + +def generate_time_series(start_date, end_date, steps, columns): + ''' + Generate a time series. 
Data will be either stationary, cyclic or increasing with standard + normal noise + ''' + rng = np.random.default_rng() + data = { + 'time' : pd.date_range(start_date, end_date, steps) + } + + for col in columns: + t = rng.random() + if t < 1/3: + # Stationary + data[col] = rng.normal(size=steps) + elif t < 2/3: + # cyclic + data[col] = 2 * np.sin(np.linspace(0, 4*np.pi, steps)) + rng.normal(size=steps) + else: + # Increasing + data[col] = np.linspace(0, 4, steps) + rng.normal(size=steps) + return pd.DataFrame(data) From cf1631ad69691a574882cbd4bf235fbfc466b402 Mon Sep 17 00:00:00 2001 From: Silas Date: Thu, 19 Mar 2026 09:54:32 +0100 Subject: [PATCH 3/5] Lint tests --- tests/mockup.py | 2 +- tests/test_scalar_objective.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/mockup.py b/tests/mockup.py index 71d3fc8..ff7248e 100644 --- a/tests/mockup.py +++ b/tests/mockup.py @@ -44,7 +44,7 @@ class MockDataExtractor: '''Mock data extractor returning data it was constructed with''' def __init__(self, data): self.data = data - + def __call__(self, daisy_output_directory): return self.data diff --git a/tests/test_scalar_objective.py b/tests/test_scalar_objective.py index 239515d..b82a885 100644 --- a/tests/test_scalar_objective.py +++ b/tests/test_scalar_objective.py @@ -1,9 +1,9 @@ # pylint: disable=missing-function-docstring from pathlib import Path import pandas as pd -from daisypy.optim import ScalarObjective, DlfDataExtractor +from daisypy.optim import ScalarObjective from daisypy.optim.loss_fns import mse -from .mockup import MockDataExtractor, MockLoss +from .mockup import MockDataExtractor def test_csv_delimiter(): in_dir = Path(__file__).parent / 'test-data' / 'targets' From 6ecb457eae8edcda3c8ab40bd8bac42abc6ace92 Mon Sep 17 00:00:00 2001 From: Silas Date: Thu, 19 Mar 2026 10:06:34 +0100 Subject: [PATCH 4/5] Make it possible to generate benchmark data --- benchmarks/benchmark_read_csv.py | 20 ++++++++++++++++++-- 1 file changed, 18 
insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_read_csv.py b/benchmarks/benchmark_read_csv.py index 4245263..c97d3f6 100644 --- a/benchmarks/benchmark_read_csv.py +++ b/benchmarks/benchmark_read_csv.py @@ -1,3 +1,4 @@ +import argparse import os import time from pathlib import Path @@ -55,9 +56,24 @@ def benchmark_read_csv(): results['python/C'] = results['python'] / results['C'] print(results) +def check_data_exists(): + '''Check that all benchmarking data exists''' + in_dir = Path(__file__).parent / 'benchmark-data' + names = ['small', 'medium', 'large', 'very-large'] + data_files = [ in_dir / f'{name}.csv' for name in names ] + for data_file in data_files: + if not data_file.exists(): + return False + return True if __name__ == '__main__': - #print('Generating data') - #generate_benchmark_data() + parser = argparse.ArgumentParser() + parser.add_argument( + '--regenerate', action='store_true', help='If set regenerate benchmark data' + ) + args = parser.parse_args() + if args.regenerate or not check_data_exists(): + print('Generating data') + generate_benchmark_data() print('Running benchmarks') benchmark_read_csv() From 3b76c3ed440aeb4f1faad6ad8067e78b3562af3a Mon Sep 17 00:00:00 2001 From: Silas Date: Thu, 19 Mar 2026 10:43:32 +0100 Subject: [PATCH 5/5] Handle nonstandard delimiters in target csv --- daisypy/optim/scalar_objective.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/daisypy/optim/scalar_objective.py b/daisypy/optim/scalar_objective.py index fcb2d1b..0b4e93e 100644 --- a/daisypy/optim/scalar_objective.py +++ b/daisypy/optim/scalar_objective.py @@ -28,7 +28,15 @@ def __init__(self, name, data_extractor, target, target_name, loss_fn): self.name = name self.data_extractor = data_extractor if not isinstance(target, pd.DataFrame): - target = pd.read_csv(target) + target = pd.read_csv(target, sep=None, engine='python') + if not "time" in target.columns: + raise ValueError( + f'target must contain "time" column. 
Got columns {list(target.columns)}' + ) + if not target_name in target.columns: + raise ValueError( + f'target must contain "{target_name}" column. Got columns {list(target.columns)}' + ) self.target = target[["time", target_name]].rename(columns={target_name : 'value'}) self.target["time"] = pd.to_datetime(self.target["time"]) self.loss_fn = LossWrapper(loss_fn) # Wrap it so target and actual are processed correctly