# Source: bench_python_c/cython_impl.pyx (nasirus/bench_python_c, branch main)
# cython: language_level=3
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True

"""Cython implementation for DataFrame generation and parquet writing."""
import pandas as pd
import numpy as np
cimport numpy as cnp
from libc.math cimport sqrt


def generate_dataframe_cython(int num_rows=1_000_000):
    """
    Generate a pandas DataFrame of synthetic data using Cython optimizations.

    The data is deterministic, not random: every value is computed from the
    row index, so two calls with the same ``num_rows`` produce equal frames.

    Args:
        num_rows: Number of rows to generate (default: 1 million)

    Returns:
        pandas DataFrame with the columns
            id:       int64 row index, 0 .. num_rows - 1
            value1:   float64, row index * 2.5
            value2:   float64, square root of the row index
            category: string label 'cat_0' .. 'cat_9', cycling with the index
            flag:     bool, True on odd row indices
    """
    cdef int i
    cdef cnp.ndarray[cnp.int64_t, ndim=1] ids
    cdef cnp.ndarray[cnp.float64_t, ndim=1] values1
    cdef cnp.ndarray[cnp.float64_t, ndim=1] values2
    # cast=True lets the typed loop write 0/1 bytes directly into a bool
    # array, so no uint8 -> bool conversion copy is needed afterwards.
    cdef cnp.ndarray[cnp.uint8_t, ndim=1, cast=True] flags

    # Pre-allocate arrays
    ids = np.arange(num_rows, dtype=np.int64)
    values1 = np.empty(num_rows, dtype=np.float64)
    values2 = np.empty(num_rows, dtype=np.float64)
    flags = np.empty(num_rows, dtype=np.bool_)

    # Fill arrays with computed values; every slot of the np.empty buffers
    # is overwritten here, so no uninitialised memory reaches the caller.
    for i in range(num_rows):
        values1[i] = <double>i * 2.5
        values2[i] = sqrt(<double>i)
        flags[i] = <unsigned char>(i % 2)

    # Category labels are Python strings, so they are built at Python level.
    categories = [f'cat_{i % 10}' for i in range(num_rows)]

    data = {
        'id': ids,
        'value1': values1,
        'value2': values2,
        'category': categories,
        'flag': flags,
    }

    return pd.DataFrame(data)


def write_parquet_cython(df, filename: str):
    """
    Persist a DataFrame as a snappy-compressed parquet file.

    The write goes through the pyarrow engine and leaves the DataFrame
    index out of the file.

    Args:
        df: DataFrame to write
        filename: Output parquet file path
    """
    df.to_parquet(
        filename,
        engine='pyarrow',
        compression='snappy',
        index=False,
    )


def run_benchmark_cython(int num_rows=1_000_000, str output_file='output_cython.parquet'):
    """
    Time both benchmark phases: DataFrame generation, then the parquet write.

    Each phase is measured separately with time.perf_counter.

    Args:
        num_rows: Number of rows to generate
        output_file: Output parquet file path

    Returns:
        Tuple of (generation_time, write_time) in seconds
    """
    import time

    # Phase 1: build the DataFrame
    gen_started = time.perf_counter()
    frame = generate_dataframe_cython(num_rows)
    gen_elapsed = time.perf_counter() - gen_started

    # Phase 2: write it out as parquet
    write_started = time.perf_counter()
    write_parquet_cython(frame, output_file)
    write_elapsed = time.perf_counter() - write_started

    return gen_elapsed, write_elapsed