Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions benchmarks/benchmark_read_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import argparse
import os
import time
from pathlib import Path
import pandas as pd
from generate_time_series import generate_time_series # pylint: disable=import-error

def char_range(n):
    '''
    Generate n distinct lowercase names usable as column labels.

    The first 26 names are the single letters 'a'..'z'. Larger requests continue
    spreadsheet-style ('aa', 'ab', ..., 'az', 'ba', ...) instead of running past 'z'
    into punctuation and control characters, which are not safe CSV header names.

    Returns an empty list when n <= 0.
    '''
    names = []
    for index in range(n):
        # Bijective base-26 conversion: 0 -> 'a', 25 -> 'z', 26 -> 'aa', 27 -> 'ab', ...
        label = ''
        remaining = index
        while True:
            remaining, digit = divmod(remaining, 26)
            label = chr(ord('a') + digit) + label
            if remaining == 0:
                break
            # There is no "zero" letter, so shift by one before the next digit.
            remaining -= 1
        names.append(label)
    return names

def generate_benchmark_data():
    '''
    Write the four benchmark csv files into the 'benchmark-data' directory next to this file.

    Each file covers 2000-01-01 to 2020-01-01; the files differ in the number of time steps
    and the number of value columns. The directory is created if it does not exist.
    '''
    out_dir = Path(__file__).parent / 'benchmark-data'
    os.makedirs(out_dir, exist_ok=True)

    # (label printed while generating, file stem, number of steps, number of columns)
    datasets = (
        ('small', 'small', 200, 2),
        ('medium', 'medium', 2000, 5),
        ('large', 'large', 20000, 20),
        ('very large', 'very-large', 200000, 50),
    )
    for label, stem, steps, n_columns in datasets:
        print(f'Generating {label}', flush=True)
        series = generate_time_series('2000-01-01', '2020-01-01', steps, char_range(n_columns))
        series.to_csv(out_dir / f'{stem}.csv')

def benchmark_read_csv():
    '''
    Time pandas.read_csv on every benchmark file with two configurations and print a summary.

    The C engine is run with an explicit sep=',' and the python engine with sep=None
    (delimiter sniffing). All files are read with the C engine first, then with the python
    engine. The printed table has one row per file with both timings and their ratio.
    '''
    names = ['small', 'medium', 'large', 'very-large']
    in_dir = Path(__file__).parent / 'benchmark-data'
    data_files = [in_dir / f'{name}.csv' for name in names]

    # Engine label -> keyword arguments for read_csv, in the order they are benchmarked.
    configurations = (
        ('C', {'sep': ',', 'engine': 'c'}),
        ('python', {'sep': None, 'engine': 'python'}),
    )

    timings = {}
    for label, read_kwargs in configurations:
        engine_timings = []
        for csv_path in data_files:
            print(f'Start {label} {csv_path}', flush=True)
            began = time.perf_counter()
            frame = pd.read_csv(csv_path, **read_kwargs) # pylint: disable=unused-variable
            engine_timings.append(time.perf_counter() - began)
        timings[label] = engine_timings

    results = pd.DataFrame({'name' : names, 'C' : timings['C'], 'python' : timings['python']})
    results['python/C'] = results['python'] / results['C']
    print(results)

def check_data_exists():
    '''
    Report whether every benchmark csv file is present.

    Looks for small.csv, medium.csv, large.csv and very-large.csv in the 'benchmark-data'
    directory next to this file. Returns True only if all four exist.
    '''
    data_dir = Path(__file__).parent / 'benchmark-data'
    required = ('small', 'medium', 'large', 'very-large')
    return all((data_dir / f'{stem}.csv').exists() for stem in required)

if __name__ == '__main__':
    # Command line entry point: make sure the benchmark data exists, then run the benchmarks.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--regenerate',
        help='If set regenerate benchmark data',
        action='store_true',
    )
    options = cli.parse_args()

    # Existing data is only reused when --regenerate was not given and no file is missing.
    data_is_ready = not options.regenerate and check_data_exists()
    if not data_is_ready:
        print('Generating data')
        generate_benchmark_data()

    print('Running benchmarks')
    benchmark_read_csv()
25 changes: 25 additions & 0 deletions benchmarks/generate_time_series.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import pandas as pd
import numpy as np

def generate_time_series(start_date, end_date, steps, columns, seed=None):
    '''
    Generate a time series as a DataFrame with a 'time' column plus one column per name.

    Each column is randomly chosen to be stationary, cyclic or increasing, and standard
    normal noise is added to it.

    Parameters
    ----------
    start_date, end_date
        First and last timestamp; forwarded to pandas.date_range.
    steps
        Number of evenly spaced time steps (rows) to generate.
    columns
        Iterable of column names to generate data for.
    seed
        Optional seed forwarded to numpy.random.default_rng. The default (None) draws
        fresh entropy on every call; pass a value to get reproducible data.

    Returns
    -------
    pandas.DataFrame with steps rows and columns ['time', *columns].
    '''
    rng = np.random.default_rng(seed)
    data = {
        'time' : pd.date_range(start_date, end_date, periods=steps)
    }

    for col in columns:
        # One uniform draw picks the kind of series, with equal probability for each kind.
        kind = rng.random()
        noise = rng.normal(size=steps)
        if kind < 1/3:
            # Stationary: noise only
            data[col] = noise
        elif kind < 2/3:
            # Cyclic: two full sine periods with amplitude 2
            data[col] = 2 * np.sin(np.linspace(0, 4*np.pi, steps)) + noise
        else:
            # Increasing: linear ramp from 0 to 4
            data[col] = np.linspace(0, 4, steps) + noise
    return pd.DataFrame(data)
10 changes: 9 additions & 1 deletion daisypy/optim/scalar_objective.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,15 @@ def __init__(self, name, data_extractor, target, target_name, loss_fn):
self.name = name
self.data_extractor = data_extractor
if not isinstance(target, pd.DataFrame):
target = pd.read_csv(target)
target = pd.read_csv(target, sep=None, engine='python')
if not "time" in target.columns:
raise ValueError(
f'target must contain "time" column. Got columns {list(target.columns)}'
)
if not target_name in target.columns:
raise ValueError(
f'target must contain "{target_name}" column. Got columns {list(target.columns)}'
)
self.target = target[["time", target_name]].rename(columns={target_name : 'value'})
self.target["time"] = pd.to_datetime(self.target["time"])
self.loss_fn = LossWrapper(loss_fn) # Wrap it so target and actual are processed correctly
Expand Down
16 changes: 16 additions & 0 deletions tests/mockup.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,19 @@ def __init__(self, parameters, objective_fn):
def __call__(self, parameter_values):
    '''Evaluate the objective function with the values bound to the parameter names.

    Values are paired with self.parameters by position and passed to self.objective_fn
    as keyword arguments. The result is returned in a dict under the key 'mock'.
    '''
    keyword_values = {}
    for parameter, value in zip(self.parameters, parameter_values):
        keyword_values[parameter.name] = value
    outcome = self.objective_fn(**keyword_values)
    return { 'mock' : outcome }

class MockDataExtractor:
    '''Stand-in data extractor that hands back whatever it was constructed with'''

    def __init__(self, data):
        # Kept by reference (no copy); every call returns this same object.
        self.data = data

    def __call__(self, daisy_output_directory):
        '''Return the stored data; daisy_output_directory is accepted but not used'''
        return self.data

class MockLoss:
    '''Mock loss that always returns a specified value'''

    def __init__(self, value):
        # The constant handed back by every call.
        self.value = value

    # NOTE: 'actial' looks like a typo for 'actual'; the name is kept so that any caller
    # passing it by keyword keeps working.
    def __call__(self, actial, target):
        '''Return the configured value; both arguments are ignored'''
        return self.value
25 changes: 25 additions & 0 deletions tests/test-data/targets/comma-separated.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
time,NO3
1000-01-01 01:00:00,0.00003816
1000-01-01 02:00:00,0.0000343443
1000-01-01 03:00:00,0.0000309103
1000-01-01 04:00:00,0.0000278197
1000-01-01 05:00:00,0.0000250382
1000-01-01 06:00:00,0.0000225348
1000-01-01 07:00:00,0.0000202819
1000-01-01 08:00:00,0.0000182542
1000-01-01 09:00:00,0.0000164294
1000-01-01 10:00:00,0.0000147871
1000-01-01 11:00:00,0.0000133091
1000-01-01 12:00:00,0.0000119789
1000-01-01 13:00:00,0.0000107818
1000-01-01 14:00:00,0.00000970454
1000-01-01 15:00:00,0.00000873506
1000-01-01 16:00:00,0.00000786261
1000-01-01 17:00:00,0.0000070775
1000-01-01 18:00:00,0.00000637099
1000-01-01 19:00:00,0.00000573523
1000-01-01 20:00:00,0.00000516315
1000-01-01 21:00:00,0.00000464837
1000-01-01 22:00:00,0.00000418517
1000-01-01 23:00:00,0.00000376839
1000-01-02 00:00:00,0.0000033934
25 changes: 25 additions & 0 deletions tests/test-data/targets/semi-colon-separated.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
time;NO3
1000-01-01 01:00:00;0.00003816
1000-01-01 02:00:00;0.0000343443
1000-01-01 03:00:00;0.0000309103
1000-01-01 04:00:00;0.0000278197
1000-01-01 05:00:00;0.0000250382
1000-01-01 06:00:00;0.0000225348
1000-01-01 07:00:00;0.0000202819
1000-01-01 08:00:00;0.0000182542
1000-01-01 09:00:00;0.0000164294
1000-01-01 10:00:00;0.0000147871
1000-01-01 11:00:00;0.0000133091
1000-01-01 12:00:00;0.0000119789
1000-01-01 13:00:00;0.0000107818
1000-01-01 14:00:00;0.00000970454
1000-01-01 15:00:00;0.00000873506
1000-01-01 16:00:00;0.00000786261
1000-01-01 17:00:00;0.0000070775
1000-01-01 18:00:00;0.00000637099
1000-01-01 19:00:00;0.00000573523
1000-01-01 20:00:00;0.00000516315
1000-01-01 21:00:00;0.00000464837
1000-01-01 22:00:00;0.00000418517
1000-01-01 23:00:00;0.00000376839
1000-01-02 00:00:00;0.0000033934
25 changes: 25 additions & 0 deletions tests/test-data/targets/tab-seperated.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
time NO3
1000-01-01 01:00:00 0.00003816
1000-01-01 02:00:00 0.0000343443
1000-01-01 03:00:00 0.0000309103
1000-01-01 04:00:00 0.0000278197
1000-01-01 05:00:00 0.0000250382
1000-01-01 06:00:00 0.0000225348
1000-01-01 07:00:00 0.0000202819
1000-01-01 08:00:00 0.0000182542
1000-01-01 09:00:00 0.0000164294
1000-01-01 10:00:00 0.0000147871
1000-01-01 11:00:00 0.0000133091
1000-01-01 12:00:00 0.0000119789
1000-01-01 13:00:00 0.0000107818
1000-01-01 14:00:00 0.00000970454
1000-01-01 15:00:00 0.00000873506
1000-01-01 16:00:00 0.00000786261
1000-01-01 17:00:00 0.0000070775
1000-01-01 18:00:00 0.00000637099
1000-01-01 19:00:00 0.00000573523
1000-01-01 20:00:00 0.00000516315
1000-01-01 21:00:00 0.00000464837
1000-01-01 22:00:00 0.00000418517
1000-01-01 23:00:00 0.00000376839
1000-01-02 00:00:00 0.0000033934
18 changes: 18 additions & 0 deletions tests/test_scalar_objective.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# pylint: disable=missing-function-docstring
from pathlib import Path
import pandas as pd
from daisypy.optim import ScalarObjective
from daisypy.optim.loss_fns import mse
from .mockup import MockDataExtractor

def test_csv_delimiter():
    '''ScalarObjective must read target files regardless of the delimiter they use.

    The comma separated file is read directly with pandas and served as the extracted data.
    Every target file holds the same values, so the mse against each of them must be 0.
    '''
    in_dir = Path(__file__).parent / 'test-data' / 'targets'
    expected = pd.read_csv(in_dir / 'comma-separated.csv').rename(columns={"NO3" : "value"})
    expected["time"] = pd.to_datetime(expected["time"])
    data_extractor = MockDataExtractor(expected)

    # Select by extension instead of by a 'separated.csv' suffix: the tab fixture is spelled
    # 'tab-seperated.csv', so a suffix match silently leaves the tab delimiter untested.
    target_files = sorted(in_dir.glob('*.csv'))
    # Guard against a vacuous pass if the fixture directory is empty or missing.
    assert target_files, f'no target csv files found in {in_dir}'

    for target_file in target_files:
        # We use the same extracted data, but change the target each time.
        f = ScalarObjective(target_file.name, data_extractor, target_file, "NO3", mse)
        result = f(in_dir).pop(target_file.name)
        assert result == 0, target_file
Loading