-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcython_impl.pyx
More file actions
89 lines (69 loc) · 2.49 KB
/
Copy pathcython_impl.pyx
File metadata and controls
89 lines (69 loc) · 2.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# cython: language_level=3
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
"""Cython implementation for DataFrame generation and parquet writing."""
import pandas as pd
import numpy as np
cimport numpy as cnp
from libc.math cimport sqrt
def generate_dataframe_cython(int num_rows=1_000_000):
    """
    Generate a pandas DataFrame with deterministic data using typed Cython loops.

    Despite the "random data" framing of the benchmark, every column is a
    pure function of the row index, so repeated calls with the same
    ``num_rows`` produce identical frames.

    Args:
        num_rows: Number of rows to generate (default: 1 million). Must be
            non-negative.

    Returns:
        pandas DataFrame with the columns, in order:
            id       -- int64, the row index 0..num_rows-1
            value1   -- float64, row index * 2.5
            value2   -- float64, square root of the row index
            category -- str, 'cat_0'..'cat_9' cycling with the row index
            flag     -- bool, True on odd row indices

    Raises:
        ValueError: If ``num_rows`` is negative.
    """
    cdef int i
    cdef cnp.ndarray[cnp.int64_t, ndim=1] ids
    cdef cnp.ndarray[cnp.float64_t, ndim=1] values1
    cdef cnp.ndarray[cnp.float64_t, ndim=1] values2
    cdef cnp.ndarray[cnp.uint8_t, ndim=1, cast=True] flags

    # Fail early with a clear message; otherwise np.empty would raise a
    # ValueError about "negative dimensions" further down.
    if num_rows < 0:
        raise ValueError(f'num_rows must be non-negative, got {num_rows}')

    # Pre-allocate arrays
    ids = np.arange(num_rows, dtype=np.int64)
    values1 = np.empty(num_rows, dtype=np.float64)
    values2 = np.empty(num_rows, dtype=np.float64)
    # Allocate the flag column directly as bool. The buffer is declared with
    # cast=True, so it can be filled through a uint8 view and handed to
    # pandas without an extra astype(bool) copy.
    flags = np.empty(num_rows, dtype=np.bool_)

    # Fill arrays with computed values
    for i in range(num_rows):
        values1[i] = <double>i * 2.5
        values2[i] = sqrt(<double>i)
        flags[i] = <unsigned char>(i % 2)

    # Create category list: only ten distinct labels exist, so format them
    # once and tile the list instead of formatting one string per row.
    labels = [f'cat_{k}' for k in range(10)]
    categories = labels * (num_rows // 10) + labels[:num_rows % 10]

    data = {
        'id': ids,
        'value1': values1,
        'value2': values2,
        'category': categories,
        'flag': flags,
    }
    return pd.DataFrame(data)
def write_parquet_cython(df, filename: str):
    """
    Persist a DataFrame as a snappy-compressed parquet file.

    The write goes through the pyarrow engine and omits the DataFrame index.

    Args:
        df: DataFrame to write
        filename: Output parquet file path
    """
    # Collect the writer settings in one place so they read as configuration.
    parquet_options = {
        'compression': 'snappy',
        'engine': 'pyarrow',
        'index': False,
    }
    df.to_parquet(filename, **parquet_options)
def run_benchmark_cython(int num_rows=1_000_000, str output_file='output_cython.parquet'):
    """
    Time the two benchmark phases: DataFrame generation, then parquet output.

    Each phase is measured separately with time.perf_counter.

    Args:
        num_rows: Number of rows to generate
        output_file: Output parquet file path

    Returns:
        Tuple of (generation_time, write_time) in seconds
    """
    from time import perf_counter

    # Phase 1: build the DataFrame
    gen_started = perf_counter()
    frame = generate_dataframe_cython(num_rows)
    gen_elapsed = perf_counter() - gen_started

    # Phase 2: write it out as parquet
    write_started = perf_counter()
    write_parquet_cython(frame, output_file)
    write_elapsed = perf_counter() - write_started

    return gen_elapsed, write_elapsed