[](https://www.rust-lang.org/)
[](https://opensource.org/licenses/MIT)

> ⚠️ **Project Status:** Phaeton is currently in **Experimental Beta (v0.2.1)**.
> The core streaming engine is functional, but the library is under limited maintenance due to the author's personal schedule.


**Phaeton** is a specialized, Rust-powered preprocessing engine designed to sanitize raw data streams before they reach your analytical environment.

It acts as the strictly typed **"Gatekeeper"** of your data pipeline. Unlike traditional DataFrame libraries that attempt to load entire datasets into RAM, Phaeton employs a **zero-copy streaming architecture**. It processes data chunk by chunk, filtering noise, fixing encodings, and standardizing formats, ensuring **O(1) memory complexity** relative to file size.

This allows you to process massive datasets on standard hardware without memory spikes, delivering clean, high-quality data to downstream tools like Pandas, Polars, or ML models.

> **The Philosophy:** Don't waste memory loading garbage. Clean the stream first, then analyze the gold.

---

## Key Features

* **Streaming Architecture:** Processes files chunk-by-chunk. Memory usage remains stable regardless of whether the file is 100MB or 100GB.
* **Parallel Execution:** Utilizes all CPU cores via **Rust Rayon** to handle heavy lifting (Regex, Fuzzy Matching) without blocking Python.
* **Strict Quarantine:** Bad data isn't just dropped silently; it's quarantined into a separate file with a generated `_phaeton_reason` column for auditing.
* **Smart Casting:** Automatically handles messy formats (e.g., `"Rp 5.250.000,00"` → `5250000` int) without complex manual parsing.
* **Configurable Engine:** Full control over `batch_size` and worker threads to tune performance for low-memory devices or high-end servers.
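To make the smart-casting bullet concrete, here is a rough pure-Python equivalent of the currency scrub (a sketch of the idea, not Phaeton's actual implementation, and scoped to the European `1.234,56` convention shown above):

```python
import re

def scrub_currency(raw):
    """Turn a messy currency string like "Rp 5.250.000,00" into an int.
    Keeps digits and the decimal comma; drops symbols and grouping dots."""
    kept = re.sub(r"[^\d,]", "", raw)   # "Rp 5.250.000,00" -> "5250000,00"
    whole = kept.split(",", 1)[0]       # drop the ",00" decimal tail
    if not whole:
        raise ValueError(f"no digits found in {raw!r}")
    return int(whole)
```

A value that cannot be parsed (e.g. `"Free"`) raises instead of silently producing garbage, which is the behavior the quarantine system builds on.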

---

## Performance Benchmark

Phaeton is optimized for "Dirty Data" scenarios involving heavy string parsing, regex filtering, and fuzzy matching.

**Test Scenario:**
We generated a **Chaos Dataset** containing **1 Million Rows** of mixed dirty data:
* **Operations:** Trim whitespace, currency scrubbing (`$ 50.000,00` → `50000`), type casting, fuzzy alignment (typo correction for city names), and regex filtering.
* **Hardware:** Entry-level laptop (Intel Core i3-1220P, 16GB RAM).

**Results:**

| OS Environment | Speed (Rows/sec) | Duration (1M Rows) | Throughput |
| :--- | :--- | :--- | :--- |
| **Windows 11** | **~820,000 rows/s** | **1.21s** | **~70 MB/s** |
| **Linux (Arch)** | ~575,000 rows/s | 1.73s | ~49 MB/s |

> *Note: Phaeton maintains a low and predictable memory footprint (~10-20MB overhead) regardless of the input file size due to its streaming nature.*

---

## Usage Example

```python
import phaeton

# 1. Initialize Engine (auto-detects cores)
engine = phaeton.Engine()

# 2. Define the pipeline
pipeline = (
    engine.ingest("dirty_data.csv")
    .prune(col="email")                         # Drop rows if email is empty
    .discard("status", "BANNED", mode="exact")  # Filter specific values
    .scrub("username", "trim")                  # Clean whitespace
    .scrub("salary", "currency")                # Parse "Rp 5.000" to a number
    .cast("salary", "int", clean=True)          # Safely cast to integer
    .fuzzyalign("city", ref=["Jakarta", "Bandung"], threshold=0.85)  # Fix typos
    .quarantine("quarantine.csv")               # Save bad data here
    .dump("clean_data.csv")                     # Save good data here
)

# 3. Execute
stats = engine.exec(pipeline)
print(f"Processed: {stats.processed}, Saved: {stats.saved}")
```
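For intuition about the `.fuzzyalign` step above, the same idea can be approximated with the standard library's `difflib` (a pure-Python stand-in for illustration only; Phaeton's own matcher is Jaro-Winkler in Rust, so scores will differ):

```python
import difflib

def fuzzy_align(value, ref, threshold=0.85):
    """Snap a possibly misspelled value to the closest reference entry,
    or return it unchanged if nothing clears the threshold."""
    matches = difflib.get_close_matches(value, ref, n=1, cutoff=threshold)
    return matches[0] if matches else value

print(fuzzy_align("Jakrta", ["Jakarta", "Bandung"]))  # -> Jakarta
```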

---

## Installation

Phaeton provides **Universal Wheels (ABI3)**. No Rust compiler needed.

```bash
pip install phaeton
```

> **Supported:** Python 3.8+ on Windows, Linux, and macOS (Intel & Apple Silicon).


---

## API Reference

### Root Module

| Method | Description |
| :--- | :--- |

Methods to save the final results or handle rejected data.

---

## Roadmap

Phaeton is currently in **Beta (v0.2.1)**. Here is the status of our development pipeline:

| Feature | Status | Implementation Notes |
| :--- | :---: | :--- |
| **Parallel Streaming Engine** | ✅ Ready | Powered by Rust Rayon (multi-core) |
| **Regex & Filter Logic** | ✅ Ready | `keep`, `discard`, `prune` implemented |
| **Smart Type Casting** | ✅ Ready | Auto-clean numeric strings (`"Rp 5,000"` → `5000`) |
| **Fuzzy Alignment** | ✅ Ready | Jaro-Winkler for typo correction |
| **Quarantine System** | ✅ Ready | Full audit trail for rejected rows |
| **Basic Text Scrubbing** | ✅ Ready | Trim, HTML strip, case conversion |
| **Header Normalization** | 🚧 In Progress | `snake_case`, `camelCase` conversions |
| **Date Normalization** | 🚧 In Progress | Auto-detect & reformat dates |
| **Deduplication** | 📝 Planned | Row-level & column-level dedupe |
| **Hashing & Anonymization** | 📝 Planned | SHA-256 for PII data |
| **Parquet/Arrow Support** | 📝 Planned | Native output integration |
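The header-normalization entry above amounts to conversions like the following pure-Python sketch (illustrative only; the feature itself is still in progress and its actual rules may differ):

```python
import re

def to_snake_case(header):
    """Normalize a column header, e.g. "Total Sales (USD)" -> "total_sales_usd"."""
    s = re.sub(r"[^0-9a-zA-Z]+", "_", header)      # non-alphanumerics -> _
    s = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", "_", s)  # split camelCase boundaries
    return s.strip("_").lower()
```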

---

## Contributing

This project is built with **Maturin** (PyO3 + Rust). Interested in contributing?