diff --git a/benches/bigdecimal/aggregate.py b/benches/bigdecimal/aggregate.py index db0a4e9e..c1427788 100644 --- a/benches/bigdecimal/aggregate.py +++ b/benches/bigdecimal/aggregate.py @@ -328,7 +328,9 @@ def main() -> int: offset = now_local.strftime("%z") offset_str = f"UTC{offset[:3]}:{offset[3:]}" if len(offset) == 5 else "UTC" header_ts = f"{now_local.strftime('%Y-%m-%d %H:%M:%S')} ({offset_str})" - out_path = args.out or os.path.join(args.reports_dir, f"bigdecimal_report_{ts}.md") + out_path = args.out or os.path.join( + args.reports_dir, f"bigdecimal_report_utc_{ts}.md" + ) log_index = discover_logs(args.logs_dir) records: dict[str, dict[int, dict[str, dict[str, dict[str, str]]]]] = {} diff --git a/benches/bigint/.gitignore b/benches/bigint/.gitignore new file mode 100644 index 00000000..6abc98fe --- /dev/null +++ b/benches/bigint/.gitignore @@ -0,0 +1,11 @@ +# local +logs/ +reports/ +cases.cache/ + +# build artifacts +mojo/bench + +# Rust build output +rust/target/ +rust/Cargo.lock \ No newline at end of file diff --git a/benches/bigint/README.md b/benches/bigint/README.md new file mode 100644 index 00000000..201a38d2 --- /dev/null +++ b/benches/bigint/README.md @@ -0,0 +1,95 @@ +# BigInt cross-language benchmarks + +This harness benchmarks `decimo.BigInt` against two arbitrary-precision +integer implementations: + +| Lang | Library | Role | +| -------- | -------------------- | --------------------------------------------------------- | +| `mojo` | `decimo.BigInt` | System under test. | +| `python` | `int` (stdlib) | Correctness oracle (drives `match` flag) + timing column. | +| `rust` | `num-bigint::BigInt` | Static-compiled peer (timing column). | + +BigInt arithmetic is exact, so — unlike the BigDecimal harness — there is +**no precision parameter**. The report shows one timings table per op. 
+ +## Layout + +```txt + cases/ # source-of-truth TOML test cases (one file per op) + mojo/ bench.mojo + bench (release-built binary) + python/ bench.py + rust/ Cargo.toml + src/main.rs (num-bigint harness) + aggregate.py # logs/*.csv -> reports/bigint_report_utc_<ts>.md + run_all.sh # build all available, run all ops, aggregate + logs/ # per-language CSV bench logs (generated) + reports/ # generated markdown reports +``` + +Log filenames embed the op so multiple runs can coexist: + +```txt + logs/<lang>_<op>_<ts>.csv +``` + +## Quick start + +```sh +./run_all.sh # all default ops +./run_all.sh --ops multiply power # subset of ops +``` + +Direct invocation of any single harness: + +```sh +# Mojo +cd mojo +pixi run --manifest-path ../../../pixi.toml ./bench \ + --op multiply --cases-dir ../cases --logs-dir ../logs + +# Python +cd python +pixi run --manifest-path ../../../pixi.toml python3 bench.py \ + --op multiply --cases-dir ../cases --logs-dir ../logs + +# Rust +cd rust +cargo run --release --quiet -- \ + --op multiply --cases-dir ../cases --logs-dir ../logs +``` + +## Ops + +`add`, `multiply`, `floor_divide` (`//`), `power`, `shift` (`<<`), `sqrt` +(integer square root), `from_string`, `to_string`. + +For `power` and `shift` the `b` field encodes a small integer (the exponent +or shift count); all other binary ops take `b` as the second operand. + +## Adding a new test case + +Edit the appropriate `cases/<op>.toml` file: + +```toml +[config] +iterations = 500 # auto-tuner cap; actual count targets ~50ms + +[[cases]] +name = "Large integer multiply" +a = "{9,100}" # {C,N} repeats C, N times → 100 nines +b = "{9,100}" +``` + +## Report + +The aggregator emits a markdown report with three sections: + +1. **Cross-op overview** — one row per op showing median ns/iter per + language plus `dm/py` and `dm/rs` ratios. +2. **Per-op detail** — for each op, one timings table. Each row carries a + `match py` flag (`OK` / `DIFF`) comparing `decimo` against Python. 
+ Every `DIFF` is expanded inline as a collapsible `<details>` block + listing every language's full result string. +3. **Agreement summary** — `decimo`-vs-Python match rate per op. + +Ratios are `decimo ÷ peer`, so **a value below `1.00x` means `decimo` is +faster** than that peer. diff --git a/benches/bigint/aggregate.py b/benches/bigint/aggregate.py new file mode 100644 index 00000000..9d4c3812 --- /dev/null +++ b/benches/bigint/aggregate.py @@ -0,0 +1,482 @@ +#!/usr/bin/env python3 +"""Aggregate per-language CSV bench logs into a side-by-side markdown report. + +Languages: mojo (decimo.BigInt, system under test), python (int, oracle), +rust (num-bigint, compiled peer). + +BigInt is exact, so there is NO precision dimension. Log filenames look +like `<lang>_<op>_<ts>.csv`; the report shows one timings table per op. + +The `match py` column is **OK** iff `decimo` and Python agree on the result +value. Integers have a single canonical decimal form, so this is exact +string equality (with a numeric fallback). DIFF cases are expanded inside +collapsible `<details>
` blocks listing every language's full result. +""" + +from __future__ import annotations + +import argparse +import csv +import glob +import os +import platform +import re +import shutil +import statistics +import subprocess +import sys +from datetime import datetime, timezone + +# Result strings can run into tens of thousands of characters (e.g. +# 50000-digit from_string cases). Raise the CSV field limit accordingly. +csv.field_size_limit(10_000_000) + + +LANGS_DEFAULT = ["mojo", "python", "rust"] +LANG_LABEL = {"mojo": "decimo", "python": "python", "rust": "rust"} + +# Maximum width for case names rendered in markdown tables. Long names are +# truncated with an ellipsis to keep tables readable; the full name is still +# shown verbatim inside `DIFF` blocks. +DISPLAY_NAME_MAX = 48 + + +def _short_name(name: str) -> str: + if len(name) <= DISPLAY_NAME_MAX: + return name + return name[: DISPLAY_NAME_MAX - 1] + "…" # ellipsis + + +def _values_equal(a: str, b: str) -> bool: + """Compare two integer result strings as values. + + Falls back to string equality if either side cannot be parsed as an + integer (e.g. an ``ERR: ...`` marker). 
+ """ + if a == b: + return True + try: + return int(a) == int(b) + except (TypeError, ValueError): + return False + + +LOG_RE = re.compile(r"^(?P<lang>[a-z]+)_(?P<op>[a-z_]+)_(?P<ts>\d{8}_\d{6})\.csv$") + + +def discover_logs(logs_dir: str) -> dict[tuple[str, str], str]: + latest: dict[tuple[str, str], tuple[str, str]] = {} + for path in glob.glob(os.path.join(logs_dir, "*.csv")): + m = LOG_RE.match(os.path.basename(path)) + if not m: + continue + key = (m.group("lang"), m.group("op")) + ts = m.group("ts") + if key not in latest or ts > latest[key][1]: + latest[key] = (path, ts) + return {k: v[0] for k, v in latest.items()} + + +def load(path: str) -> list[dict[str, str]]: + with open(path, newline="") as f: + return list(csv.DictReader(f)) + + +def fmt_num(s) -> str: + if s is None or s == "": + return "-" + try: + return f"{float(s):,.2f}" + except (TypeError, ValueError): + return str(s) + + +def fmt_ratio(num, den) -> str: + try: + n, d = float(num), float(den) + if d == 0: + return "-" + return f"{n / d:,.2f}x" + except (TypeError, ValueError): + return "-" + + +def median_ns(rows: list[dict[str, str]]) -> float | None: + vals: list[float] = [] + for r in rows: + try: + vals.append(float(r["ns_per_iter"])) + except (KeyError, ValueError): + pass + return statistics.median(vals) if vals else None + + +def _run(cmd: list[str]) -> str: + try: + out = subprocess.run( + cmd, capture_output=True, text=True, timeout=5, check=False + ).stdout.strip() + return out.splitlines()[0].strip() if out else "" + except (FileNotFoundError, subprocess.TimeoutExpired, OSError): + return "" + + +def collect_env_info() -> list[tuple[str, str]]: + info: list[tuple[str, str]] = [ + ("OS", f"{platform.system()} {platform.release()}"), + ("Arch", platform.machine()), + ] + cpu = "" + if platform.system() == "Darwin": + cpu = _run(["sysctl", "-n", "machdep.cpu.brand_string"]) + elif platform.system() == "Linux": + try: + with open("/proc/cpuinfo") as f: + for line in f: + if line.startswith("model 
name"): + cpu = line.split(":", 1)[1].strip() + break + except OSError: + pass + if cpu: + info.append(("CPU", cpu)) + if shutil.which("pixi"): + v = _run(["pixi", "run", "mojo", "--version"]) + if v: + info.append(("Mojo", v)) + info.append(("Python", platform.python_version())) + rustc = _run(["rustc", "--version"]) + if rustc: + info.append(("Rust", rustc)) + return info + + +def _is_numeric_col(values: list[str]) -> bool: + seen = False + for v in values: + if v in ("", "-"): + continue + seen = True + s = v.rstrip("x").replace(",", "") + try: + float(s) + except ValueError: + return False + return seen + + +# Threshold above which a DIFF result is considered "long" and worth +# folding the shared head / tail. 200 chars is roughly two lines on a +# typical viewer, so anything past that benefits from folding. +_FOLD_LONG_THRESHOLD = 200 +_FOLD_MIN_RUN = 40 +_FOLD_KEEP_EDGE = 8 + + +def _fold_diff_results(results: list[str | None]) -> list[str | None]: + """Fold long DIFF result strings around the diverging region. + + Given N result strings (some may be ``None`` for missing rows), find + the longest common prefix and longest common suffix shared by ALL + non-None results. If both are long enough, replace those runs with + ``(K same chars)`` markers, keeping a few boundary chars verbatim. 
+ """ + real = [r for r in results if r is not None] + if not real: + return results + if max(len(r) for r in real) < _FOLD_LONG_THRESHOLD: + return results + prefix_len = 0 + min_len = min(len(r) for r in real) + while prefix_len < min_len and all( + r[prefix_len] == real[0][prefix_len] for r in real + ): + prefix_len += 1 + suffix_len = 0 + while suffix_len < min_len - prefix_len and all( + r[-1 - suffix_len] == real[0][-1 - suffix_len] for r in real + ): + suffix_len += 1 + + fold_prefix = prefix_len >= _FOLD_MIN_RUN + fold_suffix = suffix_len >= _FOLD_MIN_RUN + if not fold_prefix and not fold_suffix: + return results + + out: list[str | None] = [] + for r in results: + if r is None: + out.append(None) + continue + head_keep = _FOLD_KEEP_EDGE if fold_prefix else 0 + tail_keep = _FOLD_KEEP_EDGE if fold_suffix else 0 + head_keep = min(head_keep, prefix_len) + tail_keep = min(tail_keep, suffix_len) + head_fold = prefix_len - head_keep + tail_fold = suffix_len - tail_keep + middle_start = prefix_len + middle_end = len(r) - suffix_len + parts: list[str] = [] + if fold_prefix: + parts.append(f"({head_fold} same chars)...") + parts.append(r[prefix_len - head_keep : prefix_len]) + else: + parts.append(r[:prefix_len]) + parts.append(r[middle_start:middle_end]) + if fold_suffix: + parts.append(r[len(r) - suffix_len : len(r) - suffix_len + tail_keep]) + parts.append(f"...({tail_fold} same chars)") + else: + parts.append(r[len(r) - suffix_len :]) + out.append("".join(parts)) + return out + + +def render_aligned_table(header: list[str], rows: list[list[str]]) -> list[str]: + cols = len(header) + widths = [len(h) for h in header] + for r in rows: + for i in range(cols): + if i < len(r) and len(r[i]) > widths[i]: + widths[i] = len(r[i]) + aligns = [ + "right" if _is_numeric_col([r[i] for r in rows if i < len(r)]) else "left" + for i in range(cols) + ] + + def pad(cell: str, w: int, align: str) -> str: + return cell.rjust(w) if align == "right" else cell.ljust(w) + + out: 
list[str] = [] + out.append( + "| " + + " | ".join(pad(header[i], widths[i], aligns[i]) for i in range(cols)) + + " |" + ) + sep = [] + for i in range(cols): + sep.append( + "-" * (widths[i] - 1) + ":" if aligns[i] == "right" else "-" * widths[i] + ) + out.append("| " + " | ".join(sep) + " |") + for r in rows: + cells = [] + for i in range(cols): + v = r[i] if i < len(r) else "" + cells.append(pad(v, widths[i], aligns[i])) + out.append("| " + " | ".join(cells) + " |") + return out + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument("--logs-dir", default="logs") + ap.add_argument("--reports-dir", default="reports") + ap.add_argument("--ops", nargs="+", required=True) + ap.add_argument("--langs", nargs="+", default=LANGS_DEFAULT) + ap.add_argument("--out", default=None) + args = ap.parse_args() + + os.makedirs(args.reports_dir, exist_ok=True) + ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") + now_local = datetime.now().astimezone() + offset = now_local.strftime("%z") + offset_str = f"UTC{offset[:3]}:{offset[3:]}" if len(offset) == 5 else "UTC" + header_ts = f"{now_local.strftime('%Y-%m-%d %H:%M:%S')} ({offset_str})" + out_path = args.out or os.path.join(args.reports_dir, f"bigint_report_utc_{ts}.md") + + log_index = discover_logs(args.logs_dir) + # records[op][lang][case_name] -> row dict + records: dict[str, dict[str, dict[str, dict[str, str]]]] = {} + case_orders: dict[str, list[str]] = {} + for op in args.ops: + records[op] = {} + case_orders[op] = [] + for lang in args.langs: + path = log_index.get((lang, op)) + if not path: + continue + d: dict[str, dict[str, str]] = {} + for r in load(path): + if r["case_name"] not in case_orders[op]: + case_orders[op].append(r["case_name"]) + d[r["case_name"]] = r + records[op][lang] = d + + ratio_pairs = [l for l in args.langs if l != "mojo"] + ratio_short = {"python": "py", "rust": "rs"} + + lines: list[str] = [ + "# BigInt cross-language benchmark report", + "", + f"- Generated: 
{header_ts}", + f"- Languages: {', '.join(LANG_LABEL.get(l, l) for l in args.langs)}", + f"- Ops: {', '.join(args.ops)}", + "- **Time unit: nanoseconds per iteration (ns/iter)** — lower is faster.", + "", + "All timing columns (`decimo`, `python`, `rust`) are **ns / iter**.", + "Each per-op timings table has a single correctness column,", + "`match py` (vs Python `int`), comparing integer values: `OK` when", + "`decimo` equals the Python oracle, `DIFF` when it disagrees, and", + "`N/A` when either the `decimo` or `python` row is missing (so", + "correctness was not checked). Only real `DIFF` cases are listed in", + "the DIFF block.", + "", + f"Ratio columns: `dm/{ratio_short.get('python')}` = decimo ÷ python, " + f"`dm/{ratio_short.get('rust')}` = decimo ÷ rust " + "(**< 1.00 means decimo is faster**).", + "", + "## 0. System & toolchain", + "", + "```txt", + ] + for k, v in collect_env_info(): + lines.append(f"{(k + ':').ljust(16)}{v}") + lines.append("```") + lines.append("") + + # ----- Section 1: cross-op overview ----- + lines.append("## 1. Cross-op overview") + lines.append("") + overview_header = ["op", "cases"] + for lang in args.langs: + overview_header.append(LANG_LABEL.get(lang, lang)) + for lang in ratio_pairs: + overview_header.append(f"dm/{ratio_short.get(lang, lang)}") + overview_rows: list[list[str]] = [] + for op in args.ops: + row = [op, str(len(case_orders[op]))] + meds: dict[str, float | None] = {} + for lang in args.langs: + rows = list(records[op].get(lang, {}).values()) + m = median_ns(rows) + meds[lang] = m + row.append(fmt_num(m)) + for lang in ratio_pairs: + row.append(fmt_ratio(meds.get("mojo"), meds.get(lang))) + overview_rows.append(row) + lines.extend(render_aligned_table(overview_header, overview_rows)) + lines.append("") + + # ----- Section 2: per-op detail ----- + lines.append("## 2. 
Per-op detail") + lines.append("") + for op in args.ops: + lines.append(f"### {op}") + lines.append("") + per_lang = records[op] + if not per_lang: + lines.append("_no logs found_\n") + continue + present_langs = [lang for lang in args.langs if lang in per_lang] + present_ratio_pairs = [ + lang for lang in ratio_pairs if lang in per_lang and "mojo" in per_lang + ] + + # `py_match` is tri-state: True (OK), False (DIFF), or None + # (N/A — not checked because a `mojo` or `python` row is missing). + case_records: list[tuple[str, bool | None, dict[str, dict[str, str]]]] = [] + for case in case_orders[op]: + recs = { + lang: per_lang.get(lang, {}).get(case, {}) for lang in present_langs + } + mojo_val = recs.get("mojo", {}).get("result") + py_val = recs.get("python", {}).get("result") + if mojo_val is None or py_val is None: + py_match = None + else: + py_match = _values_equal(mojo_val, py_val) + case_records.append((case, py_match, recs)) + + time_header = ["case", "match py"] + [ + LANG_LABEL.get(l, l) for l in present_langs + ] + for lang in present_ratio_pairs: + time_header.append(f"dm/{ratio_short.get(lang, lang)}") + time_body: list[list[str]] = [] + for case, py_match, recs in case_records: + if py_match is None: + py_cell = "N/A" + elif py_match: + py_cell = "OK" + else: + py_cell = "DIFF" + row = [_short_name(case), py_cell] + for lang in present_langs: + row.append( + fmt_num(recs[lang].get("ns_per_iter") if recs[lang] else None) + ) + for lang in present_ratio_pairs: + m = recs.get("mojo", {}).get("ns_per_iter") + r = recs.get(lang, {}).get("ns_per_iter") + row.append(fmt_ratio(m, r)) + time_body.append(row) + lines.extend(render_aligned_table(time_header, time_body)) + lines.append("") + + # A case is shown in the DIFF block iff decimo genuinely disagrees + # with the Python oracle (`py_match is False`). Cases that were not + # checked (`py_match is None`, e.g. a harness was skipped) are + # excluded — they are not mismatches. 
+ diffs = [t for t in case_records if t[1] is False] + if diffs: + lines.append( + f"<details><summary>{len(diffs)} DIFF case(s) at " + f"{op} — click to expand</summary>" + ) + lines.append("") + for case, _py_match, recs in diffs: + lines.append(f"**{case}**") + lines.append("") + pairs: list[tuple[str, str | None]] = [] + for lang in args.langs: + rec = recs.get(lang, {}) if lang in present_langs else {} + r = rec.get("result") if rec else None + pairs.append((LANG_LABEL.get(lang, lang), r)) + folded = _fold_diff_results([p[1] for p in pairs]) + lines.append("```") + for (label, _), shown in zip(pairs, folded): + if shown is None: + lines.append(f"{label}: (no row)") + else: + lines.append(f"{label}: {shown}") + lines.append("```") + lines.append("") + lines.append("</details>
") + lines.append("") + + # ----- Section 3: agreement summary ----- + lines.append("## 3. decimo-vs-python agreement summary") + lines.append("") + eq_header = ["op", "checked", "matched", "mismatched", "match %"] + eq_rows: list[list[str]] = [] + for op in args.ops: + # Only count cases where BOTH `mojo` and `python` results are + # present; cases missing either side were not checked and must not + # inflate the mismatch count. `match %` is `-` when nothing was + # checked (e.g. the Python harness was skipped). + checked = 0 + matched = 0 + for case in case_orders[op]: + per_lang = records[op] + mojo_val = per_lang.get("mojo", {}).get(case, {}).get("result") + py_val = per_lang.get("python", {}).get(case, {}).get("result") + if mojo_val is None or py_val is None: + continue + checked += 1 + if _values_equal(mojo_val, py_val): + matched += 1 + pct = f"{(100.0 * matched / checked):.1f}%" if checked else "-" + eq_rows.append([op, str(checked), str(matched), str(checked - matched), pct]) + lines.extend(render_aligned_table(eq_header, eq_rows)) + + report = "\n".join(lines) + "\n" + sys.stdout.write(report) + with open(out_path, "w") as f: + f.write(report) + print(f"\n>>> Wrote {out_path}", file=sys.stderr) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benches/bigint/bench.mojo b/benches/bigint/bench.mojo deleted file mode 100644 index 3542f809..00000000 --- a/benches/bigint/bench.mojo +++ /dev/null @@ -1,67 +0,0 @@ -"""Unified BigInt benchmark runner. 
Compares BigInt10 vs BigInt vs Python int.""" - -from bench_add import main as bench_add -from bench_multiply import main as bench_multiply -from bench_floor_divide import main as bench_floor_divide -from bench_truncate_divide import main as bench_truncate_divide -from bench_sqrt import main as bench_sqrt -from bench_power import main as bench_power -from bench_from_string import main as bench_from_string -from bench_to_string import main as bench_to_string -from bench_shift import main as bench_shift - - -def main() raises: - while True: - print( - """ -========================================= - BigInt Benchmarks (BigInt10 vs BigInt) -========================================= -add: Addition -mul: Multiplication -fdiv: Floor Division -tdiv: Truncate Division -sqrt: Integer Square Root (BigUInt vs BigInt) -power: Power / Exponentiation -fromstr: String → BigInt construction -tostr: BigInt → String conversion -shift: Left Shift (BigInt only) -all: Run all benchmarks -q: Exit -========================================= -""" - ) - var command = input("Type name of bench you want to run: ") - if command == "add": - bench_add() - elif command == "mul": - bench_multiply() - elif command == "fdiv": - bench_floor_divide() - elif command == "tdiv": - bench_truncate_divide() - elif command == "sqrt": - bench_sqrt() - elif command == "power": - bench_power() - elif command == "fromstr": - bench_from_string() - elif command == "tostr": - bench_to_string() - elif command == "shift": - bench_shift() - elif command == "all": - bench_add() - bench_multiply() - bench_floor_divide() - bench_truncate_divide() - bench_sqrt() - bench_power() - bench_from_string() - bench_to_string() - bench_shift() - elif command == "q": - return - else: - print("Invalid input") diff --git a/benches/bigint/bench_add.mojo b/benches/bigint/bench_add.mojo deleted file mode 100644 index d9e8f831..00000000 --- a/benches/bigint/bench_add.mojo +++ /dev/null @@ -1,124 +0,0 @@ -"""Benchmarks for BigInt addition. 
Compares BigInt10, BigInt, and Python int.""" - -from decimo.bigint10.bigint10 import BigInt10 -import decimo.bigint10.arithmetics -from decimo.bigint.bigint import BigInt -import decimo.bigint.arithmetics -from decimo.tests import ( - BenchCase, - load_bench_cases, - load_bench_iterations, - open_log_file, - log_print, - print_header, - print_summary_dual, -) -from std.python import Python, PythonObject -from std.time import perf_counter_ns -from std.collections import List - - -def run_case( - bc: BenchCase, - iterations: Int, - log_file: PythonObject, - mut sf_bigint10: List[Float64], - mut sf_bigint: List[Float64], -) raises: - log_print("\nBenchmark: " + bc.name, log_file) - log_print("a: " + bc.a[byte=:80], log_file) - log_print("b: " + bc.b[byte=:80], log_file) - - var m1a = BigInt10(bc.a) - var m1b = BigInt10(bc.b) - var m2a = BigInt(bc.a) - var m2b = BigInt(bc.b) - var py = Python.import_module("builtins") - var pa = py.int(bc.a) - var pb = py.int(bc.b) - - try: - var r1 = m1a + m1b - var r2 = m2a + m2b - var rp = pa + pb - - var r1_str = String(r1) - var r2_str = String(r2) - var rp_str = String(rp) - - # Correctness check: string match - if r1_str != rp_str or r2_str != rp_str: - log_print("*** WARNING: String mismatch detected! 
***", log_file) - log_print("BigInt10 result: " + r1_str[byte=:120], log_file) - log_print("BigInt result: " + r2_str[byte=:120], log_file) - log_print("Python result: " + rp_str[byte=:120], log_file) - - var t0 = perf_counter_ns() - for _ in range(iterations): - _ = m1a + m1b - var t1 = (perf_counter_ns() - t0) / UInt(iterations) - if t1 == 0: - t1 = 1 - - t0 = perf_counter_ns() - for _ in range(iterations): - _ = m2a + m2b - var t2 = (perf_counter_ns() - t0) / UInt(iterations) - if t2 == 0: - t2 = 1 - - t0 = perf_counter_ns() - for _ in range(iterations): - _ = pa + pb - var tp = (perf_counter_ns() - t0) / UInt(iterations) - - var s1 = Float64(tp) / Float64(t1) - var s2 = Float64(tp) / Float64(t2) - sf_bigint10.append(s1) - sf_bigint.append(s2) - - log_print("BigInt10: " + String(t1) + " ns/iter", log_file) - log_print("BigInt: " + String(t2) + " ns/iter", log_file) - log_print("Python: " + String(tp) + " ns/iter", log_file) - log_print("BigInt10 speedup:" + String(s1) + "×", log_file) - log_print("BigInt speedup: " + String(s2) + "×", log_file) - except e: - log_print("Error: " + String(e), log_file) - log_print("Skipping this case", log_file) - - -def main() raises: - var pysys = Python.import_module("sys") - pysys.set_int_max_str_digits(10000000) - - var log_file = open_log_file("benchmark_bigint_add") - print_header("Decimo BigInt Addition Benchmark", log_file) - - var cases = load_bench_cases("bench_data/add.toml") - var iterations = load_bench_iterations("bench_data/add.toml") - var sf1 = List[Float64]() - var sf2 = List[Float64]() - - log_print( - "\nRunning " - + String(len(cases)) - + " addition benchmarks with " - + String(iterations) - + " iterations each", - log_file, - ) - - for i in range(len(cases)): - run_case(cases[i], iterations, log_file, sf1, sf2) - - print_summary_dual( - "BigInt Addition Benchmark Summary", - sf1, - "BigInt10", - sf2, - "BigInt", - iterations, - log_file, - ) - log_file.close() - print("Benchmark completed. 
Log file closed.") diff --git a/benches/bigint/bench_data/truncate_divide.toml b/benches/bigint/bench_data/truncate_divide.toml deleted file mode 100644 index 11dad594..00000000 --- a/benches/bigint/bench_data/truncate_divide.toml +++ /dev/null @@ -1,117 +0,0 @@ -[config] -iterations = 100 - -[[cases]] -a = "100" -b = "10" -name = "Simple division, no remainder" - -[[cases]] -a = "10" -b = "3" -name = "Division with remainder" - -[[cases]] -a = "7" -b = "2" -name = "Division of small numbers" - -[[cases]] -a = "5" -b = "10" -name = "Division resulting in zero" - -[[cases]] -a = "12345" -b = "1" -name = "Division by one" - -[[cases]] -a = "-10" -b = "3" -name = "Negative dividend, positive divisor" - -[[cases]] -a = "10" -b = "-3" -name = "Positive dividend, negative divisor" - -[[cases]] -a = "-10" -b = "-3" -name = "Negative dividend, negative divisor" - -[[cases]] -a = "0" -b = "5" -name = "Zero dividend" - -[[cases]] -a = "9999999999" -b = "333" -name = "Large number division" - -[[cases]] -a = "1{0,50}" -b = "7" -name = "Very large number division" - -[[cases]] -a = "1{0,30}" -b = "1{0,10}" -name = "Division of large numbers with exact result" - -[[cases]] -a = "12345" -b = "{9,20}" -name = "Division by large number" - -[[cases]] -a = "6765" -b = "4181" -name = "Fibonacci number division" - -[[cases]] -a = "2147483647" -b = "997" -name = "Prime number division" - -[[cases]] -a = "9223372036854775807" -b = "2" -name = "Division near Int64 limit" - -[[cases]] -a = "{12345,10}" -b = "{6789,12}" -name = "Division with around 50 digits divisor just below dividend" - -[[cases]] -a = "1{0,20}" -b = "1{0,5}" -name = "Division with exact powers of 10" - -[[cases]] -a = "{990132857498314692374162398217,10}" -b = "{85172390413429847239,10}" -name = "Division of repeated digits" - -[[cases]] -a = "{9,100}" -b = "3" -name = "Extreme large dividend and small divisor" - -[[cases]] -a = "{9,1000}" -b = "{3,500}" -name = "1000-digit dividend / 500-digit divisor" - -[[cases]] -a 
= "{9,5000}" -b = "{7,2500}" -name = "5000-digit dividend / 2500-digit divisor" - -[[cases]] -a = "{9,10000}" -b = "{7,5000}" -name = "10000-digit dividend / 5000-digit divisor" diff --git a/benches/bigint/bench_floor_divide.mojo b/benches/bigint/bench_floor_divide.mojo deleted file mode 100644 index e33442fb..00000000 --- a/benches/bigint/bench_floor_divide.mojo +++ /dev/null @@ -1,124 +0,0 @@ -"""Benchmarks for BigInt floor division. Compares BigInt10, BigInt, and Python int.""" - -from decimo.bigint10.bigint10 import BigInt10 -import decimo.bigint10.arithmetics -from decimo.bigint.bigint import BigInt -import decimo.bigint.arithmetics -from decimo.tests import ( - BenchCase, - load_bench_cases, - load_bench_iterations, - open_log_file, - log_print, - print_header, - print_summary_dual, -) -from std.python import Python, PythonObject -from std.time import perf_counter_ns -from std.collections import List - - -def run_case( - bc: BenchCase, - iterations: Int, - log_file: PythonObject, - mut sf_bigint10: List[Float64], - mut sf_bigint: List[Float64], -) raises: - log_print("\nBenchmark: " + bc.name, log_file) - log_print("a: " + bc.a[byte=:80], log_file) - log_print("b: " + bc.b[byte=:80], log_file) - - var m1a = BigInt10(bc.a) - var m1b = BigInt10(bc.b) - var m2a = BigInt(bc.a) - var m2b = BigInt(bc.b) - var py = Python.import_module("builtins") - var pa = py.int(bc.a) - var pb = py.int(bc.b) - - try: - var r1 = decimo.bigint10.arithmetics.floor_divide(m1a, m1b) - var r2 = decimo.bigint.arithmetics.floor_divide(m2a, m2b) - var rp = pa // pb - - var r1_str = String(r1) - var r2_str = String(r2) - var rp_str = String(rp) - - # Correctness check: string match - if r1_str != rp_str or r2_str != rp_str: - log_print("*** WARNING: String mismatch detected! 
***", log_file) - log_print("BigInt10 result: " + r1_str[byte=:80], log_file) - log_print("BigInt result: " + r2_str[byte=:80], log_file) - log_print("Python result: " + rp_str[byte=:80], log_file) - - var t0 = perf_counter_ns() - for _ in range(iterations): - _ = decimo.bigint10.arithmetics.floor_divide(m1a, m1b) - var t1 = (perf_counter_ns() - t0) / UInt(iterations) - if t1 == 0: - t1 = 1 - - t0 = perf_counter_ns() - for _ in range(iterations): - _ = decimo.bigint.arithmetics.floor_divide(m2a, m2b) - var t2 = (perf_counter_ns() - t0) / UInt(iterations) - if t2 == 0: - t2 = 1 - - t0 = perf_counter_ns() - for _ in range(iterations): - _ = pa // pb - var tp = (perf_counter_ns() - t0) / UInt(iterations) - - var s1 = Float64(tp) / Float64(t1) - var s2 = Float64(tp) / Float64(t2) - sf_bigint10.append(s1) - sf_bigint.append(s2) - - log_print("BigInt10: " + String(t1) + " ns/iter", log_file) - log_print("BigInt: " + String(t2) + " ns/iter", log_file) - log_print("Python: " + String(tp) + " ns/iter", log_file) - log_print("BigInt10 speedup:" + String(s1) + "×", log_file) - log_print("BigInt speedup: " + String(s2) + "×", log_file) - except e: - log_print("Error: " + String(e), log_file) - log_print("Skipping this case", log_file) - - -def main() raises: - var pysys = Python.import_module("sys") - pysys.set_int_max_str_digits(10000000) - - var log_file = open_log_file("benchmark_bigint_floor_divide") - print_header("Decimo BigInt Floor Division Benchmark", log_file) - - var cases = load_bench_cases("bench_data/floor_divide.toml") - var iterations = load_bench_iterations("bench_data/floor_divide.toml") - var sf1 = List[Float64]() - var sf2 = List[Float64]() - - log_print( - "\nRunning " - + String(len(cases)) - + " floor division benchmarks with " - + String(iterations) - + " iterations each", - log_file, - ) - - for i in range(len(cases)): - run_case(cases[i], iterations, log_file, sf1, sf2) - - print_summary_dual( - "BigInt Floor Division Benchmark Summary", - sf1, - 
"BigInt10", - sf2, - "BigInt", - iterations, - log_file, - ) - log_file.close() - print("Benchmark completed. Log file closed.") diff --git a/benches/bigint/bench_from_string.mojo b/benches/bigint/bench_from_string.mojo deleted file mode 100644 index a862a1a5..00000000 --- a/benches/bigint/bench_from_string.mojo +++ /dev/null @@ -1,103 +0,0 @@ -"""Benchmarks for BigInt from_string construction. Compares BigInt10, BigInt, and Python int.""" - -from decimo.bigint10.bigint10 import BigInt10 -from decimo.bigint.bigint import BigInt -from decimo.tests import ( - BenchCase, - load_bench_cases, - load_bench_iterations, - open_log_file, - log_print, - print_header, - print_summary_dual, -) -from std.python import Python, PythonObject -from std.time import perf_counter_ns -from std.collections import List - - -def run_case( - bc: BenchCase, - iterations: Int, - log_file: PythonObject, - mut sf_bigint10: List[Float64], - mut sf_bigint: List[Float64], -) raises: - log_print("\nBenchmark: " + bc.name, log_file) - log_print( - "a: " + bc.a[byte=:80] + (" ..." 
if len(bc.a) > 80 else ""), log_file - ) - log_print("digits: " + String(len(bc.a)), log_file) - - var py = Python.import_module("builtins") - - try: - var t0 = perf_counter_ns() - for _ in range(iterations): - _ = BigInt10(bc.a) - var t1 = (perf_counter_ns() - t0) / UInt(iterations) - if t1 == 0: - t1 = 1 - - t0 = perf_counter_ns() - for _ in range(iterations): - _ = BigInt(bc.a) - var t2 = (perf_counter_ns() - t0) / UInt(iterations) - if t2 == 0: - t2 = 1 - - t0 = perf_counter_ns() - for _ in range(iterations): - _ = py.int(bc.a) - var tp = (perf_counter_ns() - t0) / UInt(iterations) - - var s1 = Float64(tp) / Float64(t1) - var s2 = Float64(tp) / Float64(t2) - sf_bigint10.append(s1) - sf_bigint.append(s2) - - log_print("BigInt10: " + String(t1) + " ns/iter", log_file) - log_print("BigInt: " + String(t2) + " ns/iter", log_file) - log_print("Python: " + String(tp) + " ns/iter", log_file) - log_print("BigInt10 speedup:" + String(s1) + "×", log_file) - log_print("BigInt speedup: " + String(s2) + "×", log_file) - except e: - log_print("Error: " + String(e), log_file) - log_print("Skipping this case", log_file) - - -def main() raises: - var pysys = Python.import_module("sys") - pysys.set_int_max_str_digits(10000000) - - var log_file = open_log_file("benchmark_bigint_from_string") - print_header("Decimo BigInt from_string Benchmark", log_file) - - var cases = load_bench_cases("bench_data/from_string.toml") - var iterations = load_bench_iterations("bench_data/from_string.toml") - var sf1 = List[Float64]() - var sf2 = List[Float64]() - - log_print( - "\nRunning " - + String(len(cases)) - + " from_string benchmarks with " - + String(iterations) - + " iterations each", - log_file, - ) - - for i in range(len(cases)): - run_case(cases[i], iterations, log_file, sf1, sf2) - - print_summary_dual( - "BigInt from_string Benchmark Summary", - sf1, - "BigInt10", - sf2, - "BigInt", - iterations, - log_file, - ) - log_file.close() - print("Benchmark completed. 
Log file closed.") diff --git a/benches/bigint/bench_multiply.mojo b/benches/bigint/bench_multiply.mojo deleted file mode 100644 index 1775adcb..00000000 --- a/benches/bigint/bench_multiply.mojo +++ /dev/null @@ -1,124 +0,0 @@ -"""Benchmarks for BigInt multiplication. Compares BigInt10, BigInt, and Python int.""" - -from decimo.bigint10.bigint10 import BigInt10 -import decimo.bigint10.arithmetics -from decimo.bigint.bigint import BigInt -import decimo.bigint.arithmetics -from decimo.tests import ( - BenchCase, - load_bench_cases, - load_bench_iterations, - open_log_file, - log_print, - print_header, - print_summary_dual, -) -from std.python import Python, PythonObject -from std.time import perf_counter_ns -from std.collections import List - - -def run_case( - bc: BenchCase, - iterations: Int, - log_file: PythonObject, - mut sf_bigint10: List[Float64], - mut sf_bigint: List[Float64], -) raises: - log_print("\nBenchmark: " + bc.name, log_file) - log_print("a: " + bc.a[byte=:80], log_file) - log_print("b: " + bc.b[byte=:80], log_file) - - var m1a = BigInt10(bc.a) - var m1b = BigInt10(bc.b) - var m2a = BigInt(bc.a) - var m2b = BigInt(bc.b) - var py = Python.import_module("builtins") - var pa = py.int(bc.a) - var pb = py.int(bc.b) - - try: - var r1 = m1a * m1b - var r2 = m2a * m2b - var rp = pa * pb - - var r1_str = String(r1) - var r2_str = String(r2) - var rp_str = String(rp) - - # Correctness check: string match - if r1_str != rp_str or r2_str != rp_str: - log_print("*** WARNING: String mismatch detected! 
***", log_file) - log_print("BigInt10 result: " + r1_str[byte=:80], log_file) - log_print("BigInt result: " + r2_str[byte=:80], log_file) - log_print("Python result: " + rp_str[byte=:80], log_file) - - var t0 = perf_counter_ns() - for _ in range(iterations): - _ = m1a * m1b - var t1 = (perf_counter_ns() - t0) / UInt(iterations) - if t1 == 0: - t1 = 1 - - t0 = perf_counter_ns() - for _ in range(iterations): - _ = m2a * m2b - var t2 = (perf_counter_ns() - t0) / UInt(iterations) - if t2 == 0: - t2 = 1 - - t0 = perf_counter_ns() - for _ in range(iterations): - _ = pa * pb - var tp = (perf_counter_ns() - t0) / UInt(iterations) - - var s1 = Float64(tp) / Float64(t1) - var s2 = Float64(tp) / Float64(t2) - sf_bigint10.append(s1) - sf_bigint.append(s2) - - log_print("BigInt10: " + String(t1) + " ns/iter", log_file) - log_print("BigInt: " + String(t2) + " ns/iter", log_file) - log_print("Python: " + String(tp) + " ns/iter", log_file) - log_print("BigInt10 speedup:" + String(s1) + "×", log_file) - log_print("BigInt speedup: " + String(s2) + "×", log_file) - except e: - log_print("Error: " + String(e), log_file) - log_print("Skipping this case", log_file) - - -def main() raises: - var pysys = Python.import_module("sys") - pysys.set_int_max_str_digits(10000000) - - var log_file = open_log_file("benchmark_bigint_multiply") - print_header("Decimo BigInt Multiplication Benchmark", log_file) - - var cases = load_bench_cases("bench_data/multiply.toml") - var iterations = load_bench_iterations("bench_data/multiply.toml") - var sf1 = List[Float64]() - var sf2 = List[Float64]() - - log_print( - "\nRunning " - + String(len(cases)) - + " multiplication benchmarks with " - + String(iterations) - + " iterations each", - log_file, - ) - - for i in range(len(cases)): - run_case(cases[i], iterations, log_file, sf1, sf2) - - print_summary_dual( - "BigInt Multiplication Benchmark Summary", - sf1, - "BigInt10", - sf2, - "BigInt", - iterations, - log_file, - ) - log_file.close() - 
print("Benchmark completed. Log file closed.") diff --git a/benches/bigint/bench_power.mojo b/benches/bigint/bench_power.mojo deleted file mode 100644 index 89675b70..00000000 --- a/benches/bigint/bench_power.mojo +++ /dev/null @@ -1,122 +0,0 @@ -"""Benchmarks for BigInt exponentiation. Compares BigInt10, BigInt, and Python int.""" - -from decimo.bigint10.bigint10 import BigInt10 -from decimo.bigint.bigint import BigInt -import decimo.bigint.arithmetics -from decimo.tests import ( - BenchCase, - load_bench_cases, - load_bench_iterations, - open_log_file, - log_print, - print_header, - print_summary_dual, -) -from std.python import Python, PythonObject -from std.time import perf_counter_ns -from std.collections import List - - -def run_case( - bc: BenchCase, - iterations: Int, - log_file: PythonObject, - mut sf_bigint10: List[Float64], - mut sf_bigint: List[Float64], -) raises: - log_print("\nBenchmark: " + bc.name, log_file) - log_print("base: " + bc.a[byte=:80], log_file) - log_print("exp: " + bc.b[byte=:80], log_file) - - var m1_base = BigInt10(bc.a) - var base = BigInt(bc.a) - var exp_int = Int(BigInt(bc.b)) - var py = Python.import_module("builtins") - var pa = py.int(bc.a) - var pb = py.int(bc.b) - - try: - var r1 = m1_base.power(exp_int) - var r2 = base**exp_int - var rp = pa**pb - - var r1_str = String(r1) - var r2_str = String(r2) - var rp_str = String(rp) - - # Correctness check: string match - if r1_str != rp_str or r2_str != rp_str: - log_print("*** WARNING: String mismatch detected! 
***", log_file) - log_print("BigInt10 result: " + r1_str[byte=:80], log_file) - log_print("BigInt result: " + r2_str[byte=:80], log_file) - log_print("Python result: " + rp_str[byte=:80], log_file) - - var t0 = perf_counter_ns() - for _ in range(iterations): - _ = m1_base.power(exp_int) - var t1 = (perf_counter_ns() - t0) / UInt(iterations) - if t1 == 0: - t1 = 1 - - t0 = perf_counter_ns() - for _ in range(iterations): - _ = base**exp_int - var t2 = (perf_counter_ns() - t0) / UInt(iterations) - if t2 == 0: - t2 = 1 - - t0 = perf_counter_ns() - for _ in range(iterations): - _ = pa**pb - var tp = (perf_counter_ns() - t0) / UInt(iterations) - - var s1 = Float64(tp) / Float64(t1) - var s2 = Float64(tp) / Float64(t2) - sf_bigint10.append(s1) - sf_bigint.append(s2) - - log_print("BigInt10: " + String(t1) + " ns/iter", log_file) - log_print("BigInt: " + String(t2) + " ns/iter", log_file) - log_print("Python: " + String(tp) + " ns/iter", log_file) - log_print("BigInt10 speedup:" + String(s1) + "×", log_file) - log_print("BigInt speedup: " + String(s2) + "×", log_file) - except e: - log_print("Error: " + String(e), log_file) - log_print("Skipping this case", log_file) - - -def main() raises: - var pysys = Python.import_module("sys") - pysys.set_int_max_str_digits(10000000) - - var log_file = open_log_file("benchmark_bigint_power") - print_header("Decimo BigInt Power Benchmark", log_file) - - var cases = load_bench_cases("bench_data/power.toml") - var iterations = load_bench_iterations("bench_data/power.toml") - var sf1 = List[Float64]() - var sf2 = List[Float64]() - - log_print( - "\nRunning " - + String(len(cases)) - + " power benchmarks with " - + String(iterations) - + " iterations each", - log_file, - ) - - for i in range(len(cases)): - run_case(cases[i], iterations, log_file, sf1, sf2) - - print_summary_dual( - "BigInt Power Benchmark Summary", - sf1, - "BigInt10", - sf2, - "BigInt", - iterations, - log_file, - ) - log_file.close() - print("Benchmark completed. 
Log file closed.") diff --git a/benches/bigint/bench_shift.mojo b/benches/bigint/bench_shift.mojo deleted file mode 100644 index 01929d1a..00000000 --- a/benches/bigint/bench_shift.mojo +++ /dev/null @@ -1,104 +0,0 @@ -"""Benchmarks for BigInt left shift. Compares BigInt vs Python int.""" - -from decimo.bigint.bigint import BigInt -import decimo.bigint.arithmetics -from decimo.tests import ( - BenchCase, - load_bench_cases, - load_bench_iterations, - open_log_file, - log_print, - print_header, - print_summary, -) -from std.python import Python, PythonObject -from std.time import perf_counter_ns -from std.collections import List - - -def run_case( - bc: BenchCase, - iterations: Int, - log_file: PythonObject, - mut sf: List[Float64], -) raises: - log_print("\nBenchmark: " + bc.name, log_file) - log_print( - "a: " + bc.a[byte=:80] + (" ..." if len(bc.a) > 80 else ""), log_file - ) - log_print("shift: " + bc.b, log_file) - - var m2a = BigInt(bc.a) - var shift = Int(BigInt(bc.b)) - var py = Python.import_module("builtins") - var pa = py.int(bc.a) - var pb = py.int(bc.b) - - try: - var r2 = m2a << shift - var rp = pa << pb - - var r2_str = String(r2) - var rp_str = String(rp) - - # Correctness check: string match - if r2_str != rp_str: - log_print("*** WARNING: String mismatch detected! 
***", log_file) - log_print("BigInt result: " + r2_str[byte=:80], log_file) - log_print("Python result: " + rp_str[byte=:80], log_file) - - var t0 = perf_counter_ns() - for _ in range(iterations): - _ = m2a << shift - var t2 = (perf_counter_ns() - t0) / UInt(iterations) - if t2 == 0: - t2 = 1 - - t0 = perf_counter_ns() - for _ in range(iterations): - _ = pa << pb - var tp = (perf_counter_ns() - t0) / UInt(iterations) - - var s2 = Float64(tp) / Float64(t2) - sf.append(s2) - - log_print("BigInt: " + String(t2) + " ns/iter", log_file) - log_print("Python: " + String(tp) + " ns/iter", log_file) - log_print("Speedup: " + String(s2) + "×", log_file) - except e: - log_print("Error: " + String(e), log_file) - log_print("Skipping this case", log_file) - - -def main() raises: - var pysys = Python.import_module("sys") - pysys.set_int_max_str_digits(10000000) - - var log_file = open_log_file("benchmark_bigint_shift") - print_header("Decimo BigInt Left Shift Benchmark", log_file) - - var cases = load_bench_cases("bench_data/shift.toml") - var iterations = load_bench_iterations("bench_data/shift.toml") - var sf = List[Float64]() - - log_print( - "\nRunning " - + String(len(cases)) - + " shift benchmarks with " - + String(iterations) - + " iterations each", - log_file, - ) - - for i in range(len(cases)): - run_case(cases[i], iterations, log_file, sf) - - print_summary( - "BigInt Left Shift Benchmark Summary", - sf, - "BigInt", - iterations, - log_file, - ) - log_file.close() - print("Benchmark completed. Log file closed.") diff --git a/benches/bigint/bench_sqrt.mojo b/benches/bigint/bench_sqrt.mojo deleted file mode 100644 index d83ef224..00000000 --- a/benches/bigint/bench_sqrt.mojo +++ /dev/null @@ -1,123 +0,0 @@ -"""Benchmarks for BigInt integer square root. 
Compares BigUInt, BigInt, and Python isqrt.""" - -from decimo.biguint.biguint import BigUInt -from decimo.bigint.bigint import BigInt -import decimo.bigint.arithmetics -import decimo.bigint.exponential -from decimo.tests import ( - BenchCase, - load_bench_cases, - load_bench_iterations, - open_log_file, - log_print, - print_header, - print_summary_dual, -) -from std.python import Python, PythonObject -from std.time import perf_counter_ns -from std.collections import List - - -def run_case( - bc: BenchCase, - iterations: Int, - log_file: PythonObject, - mut sf_biguint: List[Float64], - mut sf_bigint: List[Float64], -) raises: - log_print("\nBenchmark: " + bc.name, log_file) - log_print( - "a: " + bc.a[byte=:80] + (" ..." if len(bc.a) > 80 else ""), log_file - ) - - var m1a = BigUInt(bc.a) - var m2a = BigInt(bc.a) - var py = Python.import_module("builtins") - var math_mod = Python.import_module("math") - var pa = py.int(bc.a) - - try: - var r1 = m1a.sqrt() - var r2 = m2a.sqrt() - var rp = math_mod.isqrt(pa) - - var r1_str = String(r1) - var r2_str = String(r2) - var rp_str = String(rp) - - # Correctness check: string match - if r1_str != rp_str or r2_str != rp_str: - log_print("*** WARNING: String mismatch detected! 
***", log_file) - log_print("BigUInt result: " + r1_str[byte=:80], log_file) - log_print("BigInt result: " + r2_str[byte=:80], log_file) - log_print("Python result: " + rp_str[byte=:80], log_file) - - var t0 = perf_counter_ns() - for _ in range(iterations): - _ = m1a.sqrt() - var t1 = (perf_counter_ns() - t0) / UInt(iterations) - if t1 == 0: - t1 = 1 - - t0 = perf_counter_ns() - for _ in range(iterations): - _ = m2a.sqrt() - var t2 = (perf_counter_ns() - t0) / UInt(iterations) - if t2 == 0: - t2 = 1 - - t0 = perf_counter_ns() - for _ in range(iterations): - _ = math_mod.isqrt(pa) - var tp = (perf_counter_ns() - t0) / UInt(iterations) - - var s1 = Float64(tp) / Float64(t1) - var s2 = Float64(tp) / Float64(t2) - sf_biguint.append(s1) - sf_bigint.append(s2) - - log_print("BigUInt: " + String(t1) + " ns/iter", log_file) - log_print("BigInt: " + String(t2) + " ns/iter", log_file) - log_print("Python: " + String(tp) + " ns/iter", log_file) - log_print("BigUInt speedup: " + String(s1) + "×", log_file) - log_print("BigInt speedup: " + String(s2) + "×", log_file) - except e: - log_print("Error: " + String(e), log_file) - log_print("Skipping this case", log_file) - - -def main() raises: - var pysys = Python.import_module("sys") - pysys.set_int_max_str_digits(10000000) - - var log_file = open_log_file("benchmark_bigint_sqrt") - print_header("Decimo BigInt Square Root Benchmark", log_file) - - var cases = load_bench_cases("bench_data/sqrt.toml") - var iterations = load_bench_iterations("bench_data/sqrt.toml") - var sf1 = List[Float64]() - var sf2 = List[Float64]() - - log_print( - "\nRunning " - + String(len(cases)) - + " sqrt benchmarks with " - + String(iterations) - + " iterations each", - log_file, - ) - - for i in range(len(cases)): - run_case(cases[i], iterations, log_file, sf1, sf2) - - print_summary_dual( - "BigInt Square Root Benchmark Summary", - sf1, - "BigUInt", - sf2, - "BigInt", - iterations, - log_file, - ) - log_file.close() - print("Benchmark completed. 
Log file closed.") diff --git a/benches/bigint/bench_to_string.mojo b/benches/bigint/bench_to_string.mojo deleted file mode 100644 index ecd12db2..00000000 --- a/benches/bigint/bench_to_string.mojo +++ /dev/null @@ -1,109 +0,0 @@ -"""Benchmarks for BigInt to_string conversion. Compares BigInt10, BigInt, and Python int.""" - -from decimo.bigint10.bigint10 import BigInt10 -from decimo.bigint.bigint import BigInt -from decimo.tests import ( - BenchCase, - load_bench_cases, - load_bench_iterations, - open_log_file, - log_print, - print_header, - print_summary_dual, -) -from std.python import Python, PythonObject -from std.time import perf_counter_ns -from std.collections import List - - -def run_case( - bc: BenchCase, - iterations: Int, - log_file: PythonObject, - mut sf_bigint10: List[Float64], - mut sf_bigint: List[Float64], -) raises: - log_print("\nBenchmark: " + bc.name, log_file) - - var m1 = BigInt10(bc.a) - var m2 = BigInt(bc.a) - var py = Python.import_module("builtins") - var pa = py.int(bc.a) - - log_print("digits: " + String(len(bc.a)), log_file) - - try: - # Verify results match - var _r1 = String(m1) - var _r2 = String(m2) - var _rp = String(py.str(pa)) - - var t0 = perf_counter_ns() - for _ in range(iterations): - _ = String(m1) - var t1 = (perf_counter_ns() - t0) / UInt(iterations) - if t1 == 0: - t1 = 1 - - t0 = perf_counter_ns() - for _ in range(iterations): - _ = String(m2) - var t2 = (perf_counter_ns() - t0) / UInt(iterations) - if t2 == 0: - t2 = 1 - - t0 = perf_counter_ns() - for _ in range(iterations): - _ = py.str(pa) - var tp = (perf_counter_ns() - t0) / UInt(iterations) - - var s1 = Float64(tp) / Float64(t1) - var s2 = Float64(tp) / Float64(t2) - sf_bigint10.append(s1) - sf_bigint.append(s2) - - log_print("BigInt10: " + String(t1) + " ns/iter", log_file) - log_print("BigInt: " + String(t2) + " ns/iter", log_file) - log_print("Python: " + String(tp) + " ns/iter", log_file) - log_print("BigInt10 speedup:" + String(s1) + "×", log_file) - 
log_print("BigInt speedup: " + String(s2) + "×", log_file) - except e: - log_print("Error: " + String(e), log_file) - log_print("Skipping this case", log_file) - - -def main() raises: - var pysys = Python.import_module("sys") - pysys.set_int_max_str_digits(10000000) - - var log_file = open_log_file("benchmark_bigint_to_string") - print_header("Decimo BigInt to_string Benchmark", log_file) - - var cases = load_bench_cases("bench_data/to_string.toml") - var iterations = load_bench_iterations("bench_data/to_string.toml") - var sf1 = List[Float64]() - var sf2 = List[Float64]() - - log_print( - "\nRunning " - + String(len(cases)) - + " to_string benchmarks with " - + String(iterations) - + " iterations each", - log_file, - ) - - for i in range(len(cases)): - run_case(cases[i], iterations, log_file, sf1, sf2) - - print_summary_dual( - "BigInt to_string Benchmark Summary", - sf1, - "BigInt10", - sf2, - "BigInt", - iterations, - log_file, - ) - log_file.close() - print("Benchmark completed. Log file closed.") diff --git a/benches/bigint/bench_truncate_divide.mojo b/benches/bigint/bench_truncate_divide.mojo deleted file mode 100644 index 0836f9a6..00000000 --- a/benches/bigint/bench_truncate_divide.mojo +++ /dev/null @@ -1,124 +0,0 @@ -"""Benchmarks for BigInt truncate division. 
Compares BigInt10, BigInt, and Python int.""" - -from decimo.bigint10.bigint10 import BigInt10 -import decimo.bigint10.arithmetics -from decimo.bigint.bigint import BigInt -import decimo.bigint.arithmetics -from decimo.tests import ( - BenchCase, - load_bench_cases, - load_bench_iterations, - open_log_file, - log_print, - print_header, - print_summary_dual, -) -from std.python import Python, PythonObject -from std.time import perf_counter_ns -from std.collections import List - - -def run_case( - bc: BenchCase, - iterations: Int, - log_file: PythonObject, - mut sf_bigint10: List[Float64], - mut sf_bigint: List[Float64], -) raises: - log_print("\nBenchmark: " + bc.name, log_file) - log_print("a: " + bc.a[byte=:80], log_file) - log_print("b: " + bc.b[byte=:80], log_file) - - var m1a = BigInt10(bc.a) - var m1b = BigInt10(bc.b) - var m2a = BigInt(bc.a) - var m2b = BigInt(bc.b) - var py = Python.import_module("builtins") - var pa = py.int(bc.a) - var pb = py.int(bc.b) - - try: - var r1 = m1a.truncate_divide(m1b) - var r2 = m2a.truncate_divide(m2b) - var rp = pa // pb - - var r1_str = String(r1) - var r2_str = String(r2) - var rp_str = String(rp) - - # Correctness check: string match - if r1_str != rp_str or r2_str != rp_str: - log_print("*** WARNING: String mismatch detected! 
***", log_file) - log_print("BigInt10 result: " + r1_str[byte=:120], log_file) - log_print("BigInt result: " + r2_str[byte=:120], log_file) - log_print("Python result: " + rp_str[byte=:120], log_file) - - var t0 = perf_counter_ns() - for _ in range(iterations): - _ = m1a.truncate_divide(m1b) - var t1 = (perf_counter_ns() - t0) / UInt(iterations) - if t1 == 0: - t1 = 1 - - t0 = perf_counter_ns() - for _ in range(iterations): - _ = m2a.truncate_divide(m2b) - var t2 = (perf_counter_ns() - t0) / UInt(iterations) - if t2 == 0: - t2 = 1 - - t0 = perf_counter_ns() - for _ in range(iterations): - _ = pa // pb - var tp = (perf_counter_ns() - t0) / UInt(iterations) - - var s1 = Float64(tp) / Float64(t1) - var s2 = Float64(tp) / Float64(t2) - sf_bigint10.append(s1) - sf_bigint.append(s2) - - log_print("BigInt10: " + String(t1) + " ns/iter", log_file) - log_print("BigInt: " + String(t2) + " ns/iter", log_file) - log_print("Python: " + String(tp) + " ns/iter", log_file) - log_print("BigInt10 speedup:" + String(s1) + "×", log_file) - log_print("BigInt speedup: " + String(s2) + "×", log_file) - except e: - log_print("Error: " + String(e), log_file) - log_print("Skipping this case", log_file) - - -def main() raises: - var pysys = Python.import_module("sys") - pysys.set_int_max_str_digits(10000000) - - var log_file = open_log_file("benchmark_bigint_truncate_divide") - print_header("Decimo BigInt Truncate Division Benchmark", log_file) - - var cases = load_bench_cases("bench_data/truncate_divide.toml") - var iterations = load_bench_iterations("bench_data/truncate_divide.toml") - var sf1 = List[Float64]() - var sf2 = List[Float64]() - - log_print( - "\nRunning " - + String(len(cases)) - + " truncate division benchmarks with " - + String(iterations) - + " iterations each", - log_file, - ) - - for i in range(len(cases)): - run_case(cases[i], iterations, log_file, sf1, sf2) - - print_summary_dual( - "BigInt Truncate Division Benchmark Summary", - sf1, - "BigInt10", - sf2, - "BigInt", - 
# Cross-language BigInt benchmark — Mojo (decimo.BigInt) side.
#
# Reads cases/<op>.toml (shared across languages), expands {C,N} repeat
# patterns, auto-tunes iteration count to ~50ms per case, and emits one CSV
# record per case to logs/mojo_<op>_<ts>.csv. Schema (mirrors decimal128/):
#
#   timestamp,language,op,case_name,result,ns_per_iter
#
# Unlike the BigDecimal harness there is NO precision parameter: BigInt
# arithmetic is always exact.
#
# Usage:
#   pixi run mojo run -I ../../../src --debug-level=line-tables -D ASSERT=none \
#       ./bench.mojo --op multiply --cases-dir ../cases --logs-dir ../logs
#
# Available ops: add, multiply, floor_divide, power, shift, sqrt,
# from_string, to_string.

from decimo.bigint.bigint import BigInt
import decimo.bigint.arithmetics
import decimo.bigint.exponential
from decimo.tests import (
    BenchCase,
    load_bench_cases,
    load_bench_iterations,
)
from std.benchmark import keep
from std.python import Python
from std.sys import argv as sys_argv
from std.time import perf_counter_ns


def _now_stamp() raises -> String:
    """Return the current UTC time formatted as `YYYYMMDD_HHMMSS`.

    Uses Python's `datetime` module through the interop layer; the stamp is
    used both in the log filename and in the `timestamp` CSV column.
    """
    var dt = Python.import_module("datetime")
    var now = dt.datetime.now(dt.timezone.utc)
    return String(now.strftime("%Y%m%d_%H%M%S"))


def _csv_quote(s: String) -> String:
    """Quote `s` as a CSV field when it needs quoting.

    A field containing a comma, a double quote, CR, or LF is wrapped in
    double quotes with embedded double quotes doubled; any other field is
    returned unchanged.
    """
    # First pass: decide whether quoting is needed at all.
    var needs = False
    for ch in s.codepoint_slices():
        if ch == "," or ch == '"' or ch == "\n" or ch == "\r":
            needs = True
            break
    if not needs:
        return s
    # Second pass: emit the quoted form, doubling embedded quotes.
    var out = String('"')
    for ch in s.codepoint_slices():
        if ch == '"':
            out += '""'
        else:
            out += String(ch)
    out += '"'
    return out


def _result_for(
    op: String,
    read a: BigInt,
    read b: BigInt,
    a_str: String,
    b_int: Int,
) raises -> String:
    """Display path: produce the result string ONCE per case.

    Never call this inside a timing loop — use `_time_kernel` instead.

    Args:
        op: Operation name (one of the ops listed in the file header).
        a: First operand.
        b: Second operand for `add` / `multiply` / `floor_divide`.
        a_str: Decimal text of `a`; parsed again for `from_string`.
        b_int: Exponent (`power`) or shift count (`shift`).

    Returns:
        The result rendered with `String(...)`.

    Raises:
        Error: If `op` is not a known operation.
    """
    if op == "add":
        return String(a + b)
    if op == "multiply":
        return String(a * b)
    if op == "floor_divide":
        return String(a // b)
    if op == "power":
        return String(a**b_int)
    if op == "shift":
        return String(a << b_int)
    if op == "sqrt":
        return String(a.sqrt())
    if op == "from_string":
        return String(BigInt(a_str))
    if op == "to_string":
        return String(a)
    raise Error("unknown op: " + op)


def _time_kernel(
    op: String,
    read a: BigInt,
    read b: BigInt,
    a_str: String,
    b_int: Int,
) raises:
    """Pure-numeric kernel for the timing loop.

    Performs the same work as `_result_for` but renders to a String only
    for the `to_string` / `from_string` ops (where parsing / rendering IS
    the operation under measurement). For every other op it uses
    `keep(...)` on a small derivative of the result (`len(words)`,
    `sign`) to prevent dead-code elimination while keeping the keep cost
    negligible versus the op.

    Operands `a` / `b` are taken as `read` (borrowed) so no per-iter
    deep copy of the heap-backed word list occurs.

    Raises:
        Error: If `op` is not a known operation.
    """
    if op == "add":
        var r = a + b
        keep(len(r.words))
        keep(r.sign)
        return
    if op == "multiply":
        var r = a * b
        keep(len(r.words))
        keep(r.sign)
        return
    if op == "floor_divide":
        var r = a // b
        keep(len(r.words))
        keep(r.sign)
        return
    if op == "power":
        var r = a**b_int
        keep(len(r.words))
        keep(r.sign)
        return
    if op == "shift":
        var r = a << b_int
        keep(len(r.words))
        keep(r.sign)
        return
    if op == "sqrt":
        var r = a.sqrt()
        keep(len(r.words))
        keep(r.sign)
        return
    if op == "from_string":
        var v = BigInt(a_str)
        keep(len(v.words))
        keep(v.sign)
        return
    if op == "to_string":
        var s = String(a)
        keep(s.byte_length())
        return
    raise Error("unknown op: " + op)


# Auto-tune iters: target ~50ms per timed run.
# Includes a resolution floor (≥100µs total per rep) so cheap ops don't
# collapse to <1 timer-tick and report 0 ns/iter. Returns (iters, reps):
# reps shrinks to 1 for very-slow ops to bound wall time per case at ~500ms.
#
# NOTE(review): because TARGET_NS > MIN_RES_NS, `TARGET_NS // cal` is never
# smaller than `MIN_RES_NS // cal`, so the floor branch below cannot raise
# `n`; the `hint_iters` cap that follows can still bring a rep under 100µs.
# Behavior is left unchanged here — confirm whether the floor should be
# applied after the cap.
def _tune_iters(initial_ns: UInt, hint_iters: Int) -> Tuple[Int, Int]:
    comptime TARGET_NS: UInt = 50_000_000  # 50ms per rep target
    comptime MIN_RES_NS: UInt = 100_000  # 100µs floor for resolution
    comptime MAX_WALL_NS: UInt = 500_000_000  # 500ms total per case
    # Guard against a zero calibration reading before dividing by it.
    var cal = initial_ns if initial_ns > 0 else UInt(1)
    var n = Int(TARGET_NS // cal)
    var n_min_res = Int(MIN_RES_NS // cal)
    if n < n_min_res:
        n = n_min_res
    if n < 3:
        n = 3
    # The TOML `iterations` value is an upper bound on the tuned count.
    if n > hint_iters:
        n = hint_iters
    if n < 1:
        n = 1
    var per_rep = UInt(n) * cal
    var reps = 3
    if per_rep > 0:
        # Clamp reps to [1, 3] so total wall time stays near MAX_WALL_NS.
        var r = Int(MAX_WALL_NS // per_rep)
        if r < 1:
            r = 1
        if r > 3:
            r = 3
        reps = r
    return Tuple[Int, Int](n, reps)


def _bench_case(
    op: String,
    bc: BenchCase,
    iter_hint: Int,
) raises -> Tuple[String, Float64]:
    """Compute result + best-of-N ns/iter (auto-tuned).

    Args:
        op: Operation name.
        bc: Benchmark case; `bc.a` / `bc.b` hold the operand strings.
        iter_hint: Upper bound on the tuned iteration count.

    Returns:
        `(result_string, ns_per_iter)` where `ns_per_iter` is the fastest
        repetition's elapsed time divided by the iteration count.
    """
    # Build operands once.
    var a = BigInt(bc.a)

    # `b` handling per op:
    #   - power / shift: b encodes the (small) integer exponent / shift count
    #   - sqrt / from_string / to_string: unary, no b
    #   - add / multiply / floor_divide: b is the second BigInt operand
    var b: BigInt
    var b_int: Int = 0
    if op == "power" or op == "shift":
        b = BigInt.from_int(0)
        b_int = Int(BigInt(bc.b))
    elif op == "sqrt" or op == "from_string" or op == "to_string" or bc.b == "":
        b = BigInt.from_int(0)
    else:
        b = BigInt(bc.b)

    # Compute the displayed `result` ONCE per case (outside any timing loop).
    var result = _result_for(op, a, b, bc.a, b_int)

    # Calibration: time 1 rep to estimate per-iter cost.
    var cal_iters: Int = 1
    var t0 = perf_counter_ns()
    for _ in range(cal_iters):
        _time_kernel(op, a, b, bc.a, b_int)
    var cal_ns = UInt(perf_counter_ns() - t0)
    var tuned = _tune_iters(cal_ns, iter_hint)
    var iters = tuned[0]
    var reps = tuned[1]

    # Best-of-N timing (N = reps, adaptive).
    var best: Int = 0x7FFF_FFFF_FFFF_FFFF
    for _ in range(reps):
        var t1 = perf_counter_ns()
        for _ in range(iters):
            _time_kernel(op, a, b, bc.a, b_int)
        var dt = Int(perf_counter_ns() - t1)
        if dt < best:
            best = dt
    return Tuple[String, Float64](result, Float64(best) / Float64(iters))


def _pad(s: String, w: Int) -> String:
    """Right-pad `s` with spaces to a width of `w` bytes.

    Width is measured with `byte_length()`, so a string already `w` bytes or
    longer is returned unchanged.
    """
    if s.byte_length() >= w:
        return s
    var out = s
    for _ in range(w - s.byte_length()):
        out += " "
    return out


def main() raises:
    """Parse flags, run every case of one op, and write the CSV log.

    Recognised flags are `--op`, `--cases-dir` and `--logs-dir`; any other
    argument is ignored. One row per case is printed to stdout and written
    to `<logs-dir>/mojo_<op>_<ts>.csv`.

    Raises:
        Error: If a recognised flag is the last argument (no value follows).
    """
    # Lift CPython's int<->str digit limit for the embedded interpreter.
    var pysys = Python.import_module("sys")
    pysys.set_int_max_str_digits(10000000)

    var argv = sys_argv()
    var op = String("add")
    var cases_dir = String("../cases")
    var logs_dir = String("../logs")
    var i = 1
    while i < len(argv):
        var arg = String(argv[i])
        # A value-taking flag in last position would otherwise index past
        # the end of argv (the release build disables assertions).
        var takes_value = (
            arg == "--op" or arg == "--cases-dir" or arg == "--logs-dir"
        )
        if takes_value and i + 1 >= len(argv):
            raise Error("missing value for " + arg)
        if arg == "--op":
            op = String(argv[i + 1])
            i += 2
        elif arg == "--cases-dir":
            cases_dir = String(argv[i + 1])
            i += 2
        elif arg == "--logs-dir":
            logs_dir = String(argv[i + 1])
            i += 2
        else:
            i += 1

    var toml_path = cases_dir + "/" + op + ".toml"
    var iter_hint = load_bench_iterations(toml_path)
    var cases = load_bench_cases(toml_path)

    var os_mod = Python.import_module("os")
    if not os_mod.path.exists(logs_dir):
        os_mod.makedirs(logs_dir)
    var ts = _now_stamp()
    var log_path = logs_dir + "/mojo_" + op + "_" + ts + ".csv"
    var py = Python.import_module("builtins")
    var log = py.open(log_path, "w")
    log.write("timestamp,language,op,case_name,result,ns_per_iter\n")

    print("# decimo.BigInt", op, "(hint=", iter_hint, ")")
    print(_pad("case", 44), _pad("result", 36), "ns/iter")
    for ref bc in cases:
        var pair = _bench_case(op, bc, iter_hint)
        var result = pair[0]
        var per = pair[1]
        # Console output shows at most 34 bytes of the result; the CSV row
        # below carries the full string.
        var rs = result if result.byte_length() <= 34 else String(
            result[byte=0:34]
        )
        print(_pad(bc.name, 44), _pad(rs, 36), per)
        log.write(
            ts
            + ",mojo,"
            + op
            + ","
            + _csv_quote(bc.name)
            + ","
            + _csv_quote(result)
            + ","
            + String(per)
            + "\n"
        )
        # Flush per row so completed cases are kept if a later one raises.
        log.flush()
    log.close()
    print("wrote", log_path)
#!/usr/bin/env python3
"""Cross-language BigInt benchmark — Python (int) side.

Reads cases/<op>.toml, expands `{C,N}` patterns, auto-tunes iteration count
to ~50ms per case, and emits one CSV record per case to
logs/python_<op>_<ts>.csv:

    timestamp,language,op,case_name,result,ns_per_iter

Python's arbitrary-precision `int` is the **oracle**: the aggregator marks
the `match` column as OK iff every other language's result string equals
Python's. BigInt arithmetic is exact, so there is no precision parameter.

Usage:
    python3 bench.py --op multiply --cases-dir ../cases --logs-dir ../logs
"""

from __future__ import annotations

import argparse
import csv
import math
import os
import re
import sys
import time
from collections.abc import Callable
from datetime import datetime, timezone

try:
    import tomllib  # py 3.11+
except ImportError:
    import tomli as tomllib  # type: ignore

# BigInt cases reach tens of thousands of decimal digits; lift CPython's
# int<->str conversion guard (default 4300) so from_string / to_string work.
# Interpreters that predate the guard do not have the setter (and do not need
# it), so only call it when present — this file already supports pre-3.11
# interpreters through the `tomli` fallback above.
if hasattr(sys, "set_int_max_str_digits"):
    sys.set_int_max_str_digits(10_000_000)

# NOTE: not referenced by `expand()` below, which is a hand-written scanner.
PATTERN_RE = re.compile(r"\{([^{}]*),(\d+)\}")


def expand(s: str) -> str:
    """Expand `{C,N}` repeat patterns. Last comma wins (matches Mojo).

    Each `{payload,count}` group is replaced by `payload` repeated `count`
    times, where the split is made at the LAST comma inside the braces.
    Text that does not form a valid group is copied through unchanged: an
    unclosed `{`, braces with no comma, or a count `int()` cannot parse.

    >>> expand("1{0,3}")
    '1000'
    >>> expand("{a,b,2}")
    'a,ba,b'
    >>> expand("{abc}")
    '{abc}'
    """
    out: list[str] = []
    pos = 0
    while True:
        start = s.find("{", pos)
        if start < 0:
            # No more groups: the remainder is literal text.
            out.append(s[pos:])
            break
        out.append(s[pos:start])
        close = s.find("}", start + 1)
        if close < 0:
            # No closing brace anywhere after this point, so neither this
            # `{` nor any later one can open a group.
            out.append(s[start:])
            break
        inner = s[start + 1 : close]
        payload, sep, count = inner.rpartition(",")
        try:
            if not sep:
                raise ValueError("no comma in group")
            out.append(payload * int(count))
        except ValueError:
            # Not a valid group: keep the braces and their content verbatim.
            out.append(s[start : close + 1])
        pos = close + 1
    return "".join(out)


# ----- per-op kernels ---------------------------------------------------


def make_kernel(op: str, a: str, b: str) -> tuple[str, Callable[[], object]]:
    """Return `(display_result_str, kernel_callable_no_args)` for one case.

    Args:
        op: Operation name (`add`, `multiply`, `floor_divide`, `power`,
            `shift`, `sqrt`, `from_string`, `to_string`).
        a: Decimal text of the first operand.
        b: Decimal text of the second operand (add/multiply/floor_divide) or
            of the small integer exponent / shift count (power/shift).
            Empty for unary ops.

    Returns:
        The result rendered with `str()` (computed once, for display and
        cross-language comparison) and a zero-argument callable that performs
        just the operation, for use inside the timing loop.

    Raises:
        ValueError: If `op` is unknown or an operand is not a valid integer.
    """
    da = int(a)
    # Unary ops ignore `db`; an empty `b` is treated as 0.
    db = int(b) if b not in ("", None) else 0

    kernels: dict[str, Callable[[], object]] = {
        "add": lambda: da + db,
        "multiply": lambda: da * db,
        "floor_divide": lambda: da // db,
        "power": lambda: da**db,
        "shift": lambda: da << db,
        "sqrt": lambda: math.isqrt(da),
        # Parsing / rendering IS the operation for these two.
        "from_string": lambda: int(a),
        "to_string": lambda: str(da),
    }
    try:
        kernel = kernels[op]
    except KeyError:
        raise ValueError(f"unknown op: {op}") from None
    return str(kernel()), kernel


# ----- timing -----------------------------------------------------------

TARGET_NS = 50_000_000
MIN_RES_NS = 100_000  # 100µs floor per rep for resolution
MAX_WALL_NS = 500_000_000  # 500ms total wall per case


def bench_kernel(kernel, iter_hint: int) -> float:
    """Return best-of-N ns/iter, auto-tuned.

    Mirrors the Mojo harness: target ~50ms per rep, a 100µs resolution
    floor so cheap ops do not collapse to 0 ns/iter, and an adaptive
    `reps` (3 -> 1) bounding wall time per case at ~500ms.

    Args:
        kernel: Zero-argument callable performing one operation.
        iter_hint: Upper bound on the tuned iteration count.

    Returns:
        Elapsed nanoseconds of the fastest repetition divided by the
        iteration count.
    """
    # Calibration: a single call estimates the per-iteration cost.
    t0 = time.perf_counter_ns()
    r = kernel()
    cal = max(time.perf_counter_ns() - t0, 1)

    # NOTE(review): TARGET_NS > MIN_RES_NS, so the resolution-floor term can
    # never exceed the target term here, and the `iter_hint` cap applied
    # afterwards can still bring a rep under 100µs. Left as-is to stay in
    # step with the other harnesses — confirm the intended order.
    n = max(TARGET_NS // cal, MIN_RES_NS // cal, 3)
    iters = int(max(min(n, iter_hint), 1))
    reps = int(max(1, min(3, MAX_WALL_NS // (iters * cal))))

    best = 1 << 62
    for _ in range(reps):
        t0 = time.perf_counter_ns()
        for _ in range(iters):
            r = kernel()
        dt = time.perf_counter_ns() - t0
        if dt < best:
            best = dt
    _ = r  # keep the last result referenced until timing is done
    return best / iters


# ----- main -------------------------------------------------------------


def main() -> int:
    """Run every case of one op and write the per-case CSV log.

    A case whose kernel raises is not fatal: it is recorded with an
    `ERR: ...` result and `0.0` ns/iter so the remaining cases still run.

    Returns:
        Process exit status (always 0 when the run completes).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--op", required=True)
    ap.add_argument("--cases-dir", default="../cases")
    ap.add_argument("--logs-dir", default="../logs")
    args = ap.parse_args()

    toml_path = os.path.join(args.cases_dir, f"{args.op}.toml")
    with open(toml_path, "rb") as f:
        doc = tomllib.load(f)
    cfg = doc.get("config", {})
    iter_hint = int(cfg.get("iterations", 1000))
    cases = doc.get("cases", [])

    os.makedirs(args.logs_dir, exist_ok=True)
    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    log_path = os.path.join(args.logs_dir, f"python_{args.op}_{ts}.csv")
    print(f"# python int {args.op} (hint={iter_hint})")
    print(f"{'case':<44}{'result':<36}ns/iter")
    # Explicit UTF-8: case names and error text may be non-ASCII, and the
    # locale default encoding is platform-dependent.
    with open(log_path, "w", newline="", encoding="utf-8") as fout:
        w = csv.writer(fout, lineterminator="\n")
        w.writerow(
            [
                "timestamp",
                "language",
                "op",
                "case_name",
                "result",
                "ns_per_iter",
            ]
        )
        for c in cases:
            name = c["name"]
            a = expand(c["a"])
            raw_b = c.get("b")
            b = expand(raw_b) if raw_b not in (None, "") else ""
            try:
                result, kernel = make_kernel(args.op, a, b)
                per_ns = bench_kernel(kernel, iter_hint)
            except Exception as exc:
                # Deliberate best-effort: record the failure and continue.
                result = f"ERR: {exc.__class__.__name__}: {exc}"
                per_ns = 0.0
            # Console shows at most 34 characters; the CSV keeps it all.
            short = result if len(result) <= 34 else result[:34]
            print(f"{name:<44}{short:<36}{per_ns:.2f}")
            w.writerow([ts, "python", args.op, name, result, f"{per_ns:.4f}"])
    print(f"wrote {log_path}", file=sys.stderr)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
Mojo build failed; skipping mojo harness." +fi + +# ---- Build Rust harness ---- +HAVE_RUST=0 +if command -v cargo >/dev/null 2>&1; then + echo ">>> Building Rust harness (release)..." + if (cd rust && cargo build --release --quiet); then + HAVE_RUST=1 + RUST_BIN="$(pwd)/rust/target/release/bench" + else + echo "!!! Rust build failed; skipping rust harness." + fi +else + echo ">>> cargo not found; skipping rust harness." +fi + +# ---- Check Python (needs tomllib OR tomli) ---- +HAVE_PY=0 +if $PIXI_RUN_TOP python3 -c 'import tomllib' 2>/dev/null \ + || $PIXI_RUN_TOP python3 -c 'import tomli' 2>/dev/null; then + HAVE_PY=1 +else + echo ">>> python (with tomllib >=3.11 or tomli) not available; skipping python harness." +fi + +run_step() { + local label="$1"; shift + echo "--- $label ---" + "$@" || echo "!!! $label failed (continuing)" +} + +for op in "${OPS[@]}"; do + echo + echo "===== $op =====" + + if [ "$HAVE_MOJO" = "1" ]; then + run_step "decimo (mojo)" \ + bash -c "cd mojo && $PIXI_RUN ./bench --op '$op' --cases-dir ../cases --logs-dir ../logs" + fi + if [ "$HAVE_PY" = "1" ]; then + run_step "python int" \ + bash -c "cd python && $PIXI_RUN python3 bench.py --op '$op' --cases-dir ../cases --logs-dir ../logs" + fi + if [ "$HAVE_RUST" = "1" ]; then + run_step "rust num-bigint" \ + "$RUST_BIN" --op "$op" --cases-dir "$(pwd)/cases" --logs-dir "$(pwd)/logs" + fi +done + +echo +echo ">>> Aggregating into report..." 
+$PIXI_RUN_TOP python3 ./aggregate.py \ + --logs-dir logs \ + --reports-dir reports \ + --langs mojo python rust \ + --ops "${OPS[@]}" diff --git a/benches/bigint/rust/.gitignore b/benches/bigint/rust/.gitignore new file mode 100644 index 00000000..ea8c4bf7 --- /dev/null +++ b/benches/bigint/rust/.gitignore @@ -0,0 +1 @@ +/target diff --git a/benches/bigint/rust/Cargo.toml b/benches/bigint/rust/Cargo.toml new file mode 100644 index 00000000..3d08b74f --- /dev/null +++ b/benches/bigint/rust/Cargo.toml @@ -0,0 +1,17 @@ +[package] +edition = "2021" +name = "bench" +version = "0.1.0" + +[dependencies] +num-bigint = "0.4" +num-integer = "0.1" +num-traits = "0.2" +serde = {version = "1", features = ["derive"]} +toml = "0.8" + +[profile.release] +codegen-units = 1 +debug = 1 +lto = "fat" +opt-level = 3 diff --git a/benches/bigint/rust/src/main.rs b/benches/bigint/rust/src/main.rs new file mode 100644 index 00000000..4ebf0646 --- /dev/null +++ b/benches/bigint/rust/src/main.rs @@ -0,0 +1,312 @@ +// Cross-language BigInt benchmark — Rust side (num-bigint). +// +// Reads cases/<op>.toml (shared with the Mojo / Python sides), expands +// `{C,N}` repeat patterns, auto-tunes the iteration count to ~50ms per +// case, and emits one CSV record per case to stdout AND to +// `logs/rust_<op>_<ts>.csv`. Schema: +// +// timestamp,language,op,case_name,result,ns_per_iter +// +// BigInt arithmetic is exact, so there is no precision parameter. +// +// Usage: cargo run --release --quiet -- --op add [--cases-dir DIR] +// [--logs-dir DIR] +// +// Available ops: add, multiply, floor_divide, power, shift, sqrt, +// from_string, to_string.
+ +use num_bigint::BigInt; +use num_integer::Integer; +use num_traits::Pow; +use serde::Deserialize; +use std::env; +use std::fs; +use std::hint::black_box; +use std::io::Write; +use std::path::PathBuf; +use std::str::FromStr; +use std::time::Instant; + +#[derive(Debug, Deserialize)] +struct Doc { + config: Option<Config>, + cases: Vec<Case>, +} + +#[derive(Debug, Deserialize)] +struct Config { + iterations: Option<u64>, +} + +#[derive(Debug, Deserialize, Clone)] +struct Case { + name: String, + a: String, + #[serde(default)] + b: String, +} + +/// Expand `{C,N}` repeat patterns. `{9,3}` → "999", `1{0,4}2` → "100002". +fn expand(s: &str) -> String { + let mut out = String::new(); + let bytes = s.as_bytes(); + let mut i = 0; + while i < bytes.len() { + if bytes[i] == b'{' { + if let Some(close_rel) = s[i + 1..].find('}') { + let close = i + 1 + close_rel; + let inner = &s[i + 1..close]; + if let Some(comma) = inner.rfind(',') { + let payload = &inner[..comma]; + if let Ok(n) = inner[comma + 1..].parse::<usize>() { + for _ in 0..n { + out.push_str(payload); + } + i = close + 1; + continue; + } + } + } + } + out.push(bytes[i] as char); + i += 1; + } + out +} + +// ----- timing config (mirrors the Mojo / Python harnesses) -------------- +const TARGET_NS: u128 = 50_000_000; // 50ms per rep target +const MIN_RES_NS: u128 = 100_000; // 100µs floor per rep for resolution +const MAX_WALL_NS: u128 = 500_000_000; // 500ms total wall per case + +/// Run `iters` reps of `op` on the prepared operands. The result is fed to +/// `black_box` so the optimizer cannot elide the work.
+fn run_iters(op: &str, da: &BigInt, db: &BigInt, exp: u32, shift: usize, a: &str, iters: u64) { + match op { + "add" => { + for _ in 0..iters { + black_box(black_box(da) + black_box(db)); + } + } + "multiply" => { + for _ in 0..iters { + black_box(black_box(da) * black_box(db)); + } + } + "floor_divide" => { + for _ in 0..iters { + black_box(black_box(da).div_floor(black_box(db))); + } + } + "power" => { + for _ in 0..iters { + black_box(Pow::pow(black_box(da), black_box(exp))); + } + } + "shift" => { + for _ in 0..iters { + black_box(black_box(da) << black_box(shift)); + } + } + "sqrt" => { + for _ in 0..iters { + black_box(black_box(da).sqrt()); + } + } + "from_string" => { + for _ in 0..iters { + black_box(BigInt::from_str(black_box(a)).expect("parse")); + } + } + "to_string" => { + for _ in 0..iters { + black_box(black_box(da).to_string()); + } + } + other => panic!("unknown op {other}"), + } +} + +fn compute_result(op: &str, da: &BigInt, db: &BigInt, exp: u32, shift: usize, a: &str) -> String { + match op { + "add" => (da + db).to_string(), + "multiply" => (da * db).to_string(), + "floor_divide" => da.div_floor(db).to_string(), + "power" => Pow::pow(da, exp).to_string(), + "shift" => (da << shift).to_string(), + "sqrt" => da.sqrt().to_string(), + "from_string" => BigInt::from_str(a).expect("parse").to_string(), + "to_string" => da.to_string(), + other => panic!("unknown op {other}"), + } +} + +fn run_op(op: &str, a: &str, b: &str, iter_hint: u64) -> (String, f64) { + let da = BigInt::from_str(a).expect("from_str a"); + // `b` is the second operand (add/multiply/floor_divide) or a small + // integer (power exponent / shift count). Unary ops leave it empty. + // Fail fast on a malformed non-empty operand, consistent with `a`, so + // bad case data cannot masquerade as a valid benchmark run. 
+ let db = if b.is_empty() { + BigInt::from(0) + } else { + BigInt::from_str(b).expect("from_str b") + }; + let exp: u32 = if op == "power" { + b.parse::<u32>().expect("power exponent") + } else { + 0 + }; + let shift: usize = if op == "shift" { + b.parse::<usize>().expect("shift count") + } else { + 0 + }; + + // Displayed result, computed once (string form for cross-lang diff). + let result_str = compute_result(op, &da, &db, exp, shift, a); + + // Calibrate one rep to estimate per-iter cost. + let t0 = Instant::now(); + run_iters(op, &da, &db, exp, shift, a, 1); + let mut cal = t0.elapsed().as_nanos(); + if cal == 0 { + cal = 1; + } + let mut n = TARGET_NS / cal; + let n_min_res = MIN_RES_NS / cal; + if n < n_min_res { + n = n_min_res; + } + if n < 3 { + n = 3; + } + if n > iter_hint as u128 { + n = iter_hint as u128; + } + if n < 1 { + n = 1; + } + let iters = n as u64; + let per_rep = (iters as u128) * cal; + let reps: u32 = if per_rep > 0 { + (MAX_WALL_NS / per_rep).clamp(1, 3) as u32 + } else { + 3 + }; + + // Best-of-N timing.
+ let mut best = u128::MAX; + for _ in 0..reps { + let t0 = Instant::now(); + run_iters(op, &da, &db, exp, shift, a, iters); + let dt = t0.elapsed().as_nanos(); + if dt < best { + best = dt; + } + } + let per = best as f64 / iters as f64; + (result_str, per) +} + +fn main() { + let mut op = String::from("add"); + let mut cases_dir = PathBuf::from("../cases"); + let mut logs_dir = PathBuf::from("../logs"); + let args: Vec<String> = env::args().collect(); + let mut i = 1; + while i < args.len() { + match args[i].as_str() { + "--op" => { + op = args[i + 1].clone(); + i += 2; + } + "--cases-dir" => { + cases_dir = PathBuf::from(&args[i + 1]); + i += 2; + } + "--logs-dir" => { + logs_dir = PathBuf::from(&args[i + 1]); + i += 2; + } + _ => i += 1, + } + } + + let toml_path = cases_dir.join(format!("{op}.toml")); + let raw = fs::read_to_string(&toml_path) + .unwrap_or_else(|e| panic!("read {}: {e}", toml_path.display())); + let doc: Doc = toml::from_str(&raw).expect("toml parse"); + let iter_hint = doc + .config + .as_ref() + .and_then(|c| c.iterations) + .unwrap_or(1000); + + fs::create_dir_all(&logs_dir).ok(); + let ts = chrono_now(); + let log_path = logs_dir.join(format!("rust_{op}_{ts}.csv")); + let mut log = fs::File::create(&log_path).expect("open log"); + writeln!(log, "timestamp,language,op,case_name,result,ns_per_iter").ok(); + + println!("# num-bigint {} (hint={})", op, iter_hint); + println!("{:<40} {:<32} {:>12}", "case", "result", "ns/iter"); + for case in &doc.cases { + let a = expand(&case.a); + let b = expand(&case.b); + let (result, per) = run_op(&op, &a, &b, iter_hint); + let result_short: String = result.chars().take(30).collect(); + println!("{:<40} {:<32} {:>12.3}", case.name, result_short, per); + writeln!( + log, + "{},rust,{},{},{},{:.4}", + ts, + op, + csv_quote(&case.name), + csv_quote(&result), + per + ) + .ok(); + } + eprintln!("wrote {}", log_path.display()); +} + +fn csv_quote(s: &str) -> String { + if s.contains(',') || s.contains('"') ||
s.contains('\n') || s.contains('\r') { + format!("\"{}\"", s.replace('"', "\"\"")) + } else { + s.to_string() + } +} + +/// Tiny timestamp without pulling in chrono. Format: YYYYMMDD_HHMMSS (UTC). +fn chrono_now() -> String { + use std::time::SystemTime; + let secs = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs(); + let (y, m, d, hh, mm, ss) = unix_to_ymd_hms(secs); + format!("{:04}{:02}{:02}_{:02}{:02}{:02}", y, m, d, hh, mm, ss) +} + +fn unix_to_ymd_hms(secs: u64) -> (u32, u32, u32, u32, u32, u32) { + let days = secs / 86400; + let rem = secs % 86400; + let hh = (rem / 3600) as u32; + let mm = ((rem % 3600) / 60) as u32; + let ss = (rem % 60) as u32; + + // Civil-from-days algorithm (Howard Hinnant). + let z = days as i64 + 719468; + let era = (if z >= 0 { z } else { z - 146096 }) / 146097; + let doe = (z - era * 146097) as u64; + let yoe = (doe - doe / 1460 + doe / 36524 - doe / 146096) / 365; + let y = yoe as i64 + era * 400; + let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); + let mp = (5 * doy + 2) / 153; + let d = doy - (153 * mp + 2) / 5 + 1; + let m = if mp < 10 { mp + 3 } else { mp - 9 }; + let y = y + (if m <= 2 { 1 } else { 0 }); + (y as u32, m as u32, d as u32, hh, mm, ss) +} diff --git a/benches/decimal128/aggregate.py b/benches/decimal128/aggregate.py index a2518bbf..a0a1c38d 100644 --- a/benches/decimal128/aggregate.py +++ b/benches/decimal128/aggregate.py @@ -8,7 +8,7 @@ 2. Per-op detail tables (rows = test cases, columns = languages) 3. Result-equivalence summary (per-op match rate) -Output goes to /dec128_report_{ts}.md by default. +Output goes to /dec128_report_utc_{ts}.md by default. 
Usage: python3 aggregate.py --logs-dir logs --reports-dir reports \\ @@ -199,7 +199,7 @@ def main() -> int: ap.add_argument( "--out", default=None, - help="Override report path (default: /dec128_report_.md)", + help="Override report path (default: /dec128_report_utc_.md)", ) args = ap.parse_args() @@ -218,7 +218,7 @@ def main() -> int: else: offset_str = "UTC" header_ts = f"{now_local.strftime('%Y-%m-%d %H:%M:%S')} ({offset_str})" - out_path = args.out or os.path.join(args.reports_dir, f"dec128_report_{ts}.md") + out_path = args.out or os.path.join(args.reports_dir, f"dec128_report_utc_{ts}.md") lang_label = { "mojo": "decimo", diff --git a/benches/run_bench.sh b/benches/run_bench.sh index c7b0ddb1..edba8bf3 100755 --- a/benches/run_bench.sh +++ b/benches/run_bench.sh @@ -22,7 +22,7 @@ if [[ -z "$TYPE" ]]; then echo "Usage: pixi run bench [operation]" echo "" echo "Types:" - echo " bigint (int) BigInt benchmarks (BigInt10 vs BigInt vs Python int)" + echo " bigint (int) BigInt benchmarks (decimo vs Python int vs Rust num-bigint)" echo " biguint (uint) BigUInt benchmarks (BigUInt vs Python int)" echo " decimal128 (dec128) Decimal128 benchmarks (Decimal128 vs Python decimal)" echo " bigdecimal (dec) BigDecimal benchmarks (BigDecimal vs Python decimal)" @@ -91,6 +91,23 @@ if [[ "$TYPE" == "bigdecimal" ]]; then fi fi +# --- bigint: cross-language pipeline (decimo + python + rust) --- +# When OP is empty, run the full op set across all available languages and +# produce a timestamped report under benches/bigint/reports/. The Rust +# (num-bigint) harness is built and run only if `cargo` is on PATH; +# otherwise the pipeline runs with the available languages. When OP is +# given, restrict the run to that single op via --ops. Extra args after OP +# are forwarded verbatim to run_all.sh. 
+if [[ "$TYPE" == "bigint" ]]; then + shift # drop TYPE + if [[ -z "$OP" ]]; then + exec bash "$DIR/run_all.sh" + else + shift # drop OP + exec bash "$DIR/run_all.sh" --ops "$OP" "$@" + fi +fi + # --- Interactive mode (no operation specified) --- if [[ -z "$OP" ]]; then cd "$DIR" diff --git a/docs/plans/bigint_enhancement.md b/docs/plans/bigint_enhancement.md new file mode 100644 index 00000000..3fbbd852 --- /dev/null +++ b/docs/plans/bigint_enhancement.md @@ -0,0 +1,300 @@ +# BigInt Enhancement Plan + +> **Date**: 2026-06-19 (created) +> **Target**: decimo >=0.9.0 +> **Mojo Version**: >=v1.0.0 +> 子曰:工欲善其事,必先利其器。 + +This document is the single source of truth for the arbitrary-precision +**signed integer** (`decimo.BigInt`, base-2^32) performance & correctness +effort. It supersedes `bigint2_benchmark_analysis.md` (2026-02-20), keeping +only the still-relevant historical work items; the full predecessor is +recoverable from git history. + +## 1. Cross-Language Snapshot + +Scope: **arbitrary-precision** signed integers. `BigInt10` (the legacy +base-10^9 integer) and `BigUInt` are out of scope here. + +| Library | Limb | Mul algorithm tier | Div algorithm | Sqrt | +| ----------------- | ---- | -------------------------- | -------------- | ----------------------- | +| decimo BigInt | 2^32 | School → Kara | Knuth-D → B-Z | Newton → prec-doubling | +| Py `int` (C) | 2^30 | School → Kara | Knuth (school) | prec-doubling (`isqrt`) | +| Rust `num-bigint` | 2^64 | School → Kara → Toom-3 | Knuth (school) | Newton | +| Java `BigInteger` | 2^32 | School → Kara → Toom-3 | Knuth → B-Z | Newton | +| GMP `mpz_t` | 2^64 | School → Kara → Toom → FFT | Newton-recip. | Newton-reciprocal | + +**Coverage.** `decimo.BigInt` already offers a complete integer API: +`+ - * // % **`, `<< >>`, `& | ^ ~`, `sqrt` (integer), `from_string`, +`to_string`, plus `gcd`/`extended_gcd`/`lcm`/`mod_pow`/`mod_inverse` +(`number_theory.mojo`). 
No API gaps versus Python `int` or Rust +`num-bigint`; the open work is purely performance (but the most difficult :D) + +## 2. Baseline (authoritative, 2026-06-19) + +Cross-language harness `benches/bigint/` — `decimo.BigInt` vs Python `int` +(oracle + timing) vs Rust `num-bigint`. Release build (`-O3 -g0 -D +ASSERT=none`), DCE-guarded (`keep`/`black_box`), best-of-N auto-tuned to +~50ms/case. Median ns/iter; `dm/py` = decimo÷python, `dm/rs` = +decimo÷rust (**< 1.00 = decimo faster**). 230 cases, 100% decimo-vs-Python +agreement. + +| op | dm | py | rs | dm/py | dm/rs | dominant cost | +| ------------ | ---: | ---: | ---: | ----: | ----: | --------------------------------------------- | +| add | 48 | 31 | 41 | 1.5× | 1.2× | small-operand constant overhead | +| multiply | 50 | 42 | 31 | 1.2× | 1.6× | no Toom-3, no SIMD partial products vs Rust | +| floor_divide | 230 | 59 | 41 | 3.9× | 5.6× | **per-call allocations (the worst op)** | +| power | 340 | 160 | 297 | 2.1× | 1.2× | square-and-multiply per-op overhead | +| shift | 42 | 39 | 26 | 1.1× | 1.6× | result-buffer allocation | +| sqrt | 580 | 373 | 564 | 1.5× | 1.0× | medium-size division overhead | +| from_string | 990 | 786 | 407 | 1.3× | 2.4× | base-10 → base-2^32 conversion (O(n²) medium) | +| to_string | 893 | 266 | 350 | 3.4× | 2.6× | O(n²) repeated `/10^9` at 50–1000 digits | + +> **Methodology note.** These numbers replace the 2026-02-20 per-op +> figures, which reported decimo *faster* than Python. That harness threw +> the result away (`_ = a + b`) with no DCE guard, timed a single pass, and +> ran on different hardware, so its ratios are not comparable. Treat the +> 2026-06-19 figures as the baseline. They fluctuate ±10–20% run to run. + +## 3. Change History — Done + +Condensed from `bigint2_benchmark_analysis.md` (v0.8.0 effort). All items +verified and merged; kept here as the algorithmic record. 
+ +| Tag | Item | +| ---- | ------------------------------------------------------------------------------------------------ | +| PR0 | sqrt correctness: overestimate-seeded Newton + CPython precision-doubling (was wrong ≥1000 d) | +| PR1 | Karatsuba multiply (`CUTOFF_KARATSUBA = 48` words); slice-based, offset assembly, ptr loops | +| PR2 | Slice-based Burnikel-Ziegler divide (`CUTOFF_BURNIKEL_ZIEGLER = 64`); ≤4-word divisor fast paths | +| PR3 | Divide-and-conquer `to_string` base conversion (entry ≥128 words, leverages B-Z) | +| PR4a | SIMD `parse_numeric_string` (two-pass, `vectorize[16]` digit extraction) | +| PR4b | D&C `from_string` base conversion (entry > 10000 digits) | +| PR4c | `from_string` micro-opts (≤9/≤19-digit fast paths, pre-alloc, raw ptrs, balanced split) | +| PR4d | `to_string` micro-opts (1-/2-word fast paths, `InlineArray` byte buffer, raw ptrs) | +| PR5 | True in-place arithmetic for all 11 `__i*__` dunders (`add_inplace`, …) | +| PR6 | Bitwise AND / OR / XOR / NOT with two's-complement semantics | +| PR7 | `gcd`, `extended_gcd`, `lcm`, `mod_pow`, `mod_inverse` (`number_theory.mojo`) | +| PR8 | `BInt`/`BigInt` alias bound to the base-2^32 type (legacy → `BigInt10`) | + +## 4. Lessons Learnt + +Items 1–3 are BigInt-specific. Items 4–9 transfer from +`bigdecimal_enhancement.md §4` / `decimal128_enhancement.md` — they hold +unchanged for the variable-length signed case. + +1. **Newton sqrt must converge from above.** An underestimate seed lets + Newton settle on the wrong quadratic residue at ≥1000 digits (PR0). Seed + with a ceiling-rounded hardware sqrt of the top words; for huge inputs use + CPython precision-doubling — total work O(M(n)), not O(M(n)·log n). + +2. **Base-2^32 carries are shift/mask, not division — so the BigDecimal + "deferred-carry / Comba" multiply win does NOT transfer.** The base-10^9 + Comba trick (T-9) existed to amortise the `% 10^9` / `/ 10^9` on every + inner-product step. 
In base-2^32 the carry is already `>> 32` + `& + 0xFFFFFFFF` (no divide), so the multiply gap versus Rust is **Toom-3 + + SIMD partial-product accumulation**, not carry amortisation. Measure + before porting any base-10^9 micro-opt. + +3. **Slice-based recursion is mandatory for B-Z.** The first copy-based + Burnikel-Ziegler regressed (excess `List[UInt32]` allocation per level); + passing `(list, start, end)` bounds and materialising only at the Knuth-D + base case is what made it a net win (PR2). Any new D&C kernel must follow + the same no-copy-until-base-case discipline. + + 3a. **A cross-language gap is usually decimo's own overhead, not the + limb width.** When I first saw the 5.6× Rust floor_divide gap I blamed + the representation (decimo is base-2^32, num-bigint is base-2^64). The + benchmark says otherwise. Python uses base-2^30 limbs, even narrower + than decimo's, and still divides 3.9× faster; sqrt is multiply- and + divide-heavy yet already at parity with Rust. So a wider limb is not why + decimo trails. Look for the real cost first: redundant copies, per-call + allocations, branches in the inner loop. Only reach for the limb width + once those are gone (T-W1). + +4. **`debug_assert` does NOT lazy-evaluate its message** under `-D + ASSERT=none`; a `String.format(...)` argument still allocates in the hot + loop. Use plain string literals (or the variadic `debug_assert(cond, + "msg ", value)` form) only. + +5. **Hot path first.** The count of branches *before* the fast arm matters + more than the fast arm's body. Route rare cases (zero operand, sign + mismatch, differing length) to a cold tail of the same function. + +6. **`@no_inline` the body of every `raise … .format(...)` helper** so + `@always_inline` can fire on the parent and icache pressure at inline + raise edges drops. + +7. **Hoist a raw data pointer in multi-buffer / O(n²) inner loops.** A + `List[i]` access reloads `List._data` every element. 
Hoisting + (`var p = lst.unsafe_ptr()`) is a stable win when an iteration touches + ≥2 buffers or is an O(n²) inner loop; it does **not** clear the ~3% bar + for single-buffer single-pass O(n) loops that are arithmetic-bound. + Safety: the buffer must not be resized while the pointer is live. + +8. **Precision doubling is the lever for Newton-style methods** (sqrt, and + any future reciprocal divide): start small and double, total work ≈ 3× + the final iteration instead of `log n` full-width iterations. + +9. **Reciprocal-Newton divide only wins once multiplication is much + cheaper than division** (the NTT regime). With Karatsuba, a B-Z divide is + ~2–3× a same-size multiply, so a reciprocal-Newton rewrite would be a + *regression* today — gate it behind Toom-3/NTT (T-M1). + +## 5. Open Items + +Worked in priority order. There is one real outlier, floor_divide; the +rest are smaller. The limb-width question sits at the end, because the +benchmark shows it is not why decimo trails today. + +### floor_divide / truncate_divide — the outlier (3.9× py, 5.6× rs) + +floor_divide is the only op that trails badly. add and multiply are within +1.2–1.5×, but divide is 3.9× Python and 5.6× Rust. Two facts rule out the +easy explanations: + +- It is not the algorithm. decimo and num-bigint both run Knuth Algorithm D + below the Burnikel-Ziegler cutoff (64 words). +- It is not the limb width. Python's `int` uses base-2^30 limbs, narrower + than decimo's base-2^32, and still divides 3.9× faster. A wider limb + would help Python, not decimo. + +The cause is decimo's per-call overhead on small and medium operands. A +small divide such as `-7 // 2` does three or four heap allocations in +decimo and almost none in Python or Rust: + +- `floor_divide` copies the quotient and remainder out of the divmod tuple + with `.copy()`, two allocations it does not need, then allocates a third + time through `_add_magnitudes(q, 1)` on the negative-floor branch. 
+- `_divmod_magnitudes` normalises both operands with `_shift_left_words` on + every multi-word call, two more allocations, even when the operands are + tiny. +- The Knuth-D inner loop recomputes `len(u)` and re-checks `idx < len(u)` + on every step and takes a branchy manual borrow. num-bigint walks a slice + with a branchless offset-carry. + +**T-D1 — remove the redundant allocations.** Move the quotient and +remainder out of the divmod tuple instead of copying them. Increment in +place on the negative-floor branch. Fold the Knuth-D normalisation shift +into the base case so it stops allocating two fresh buffers per call. + +**T-D2 — tighten the inner loop.** Hoist `len(u)` and the `u`/`v` data +pointers out of the multiply-subtract loop (Lesson 7, two buffers) and +replace the manual borrow with num-bigint's branchless offset-carry. + +**T-D3 — re-tune `CUTOFF_BURNIKEL_ZIEGLER`.** Re-measure 32/48/64 once the +base case is cheaper. The 2n-by-n / 4n-by-n / 8n-by-n slowdown already +noted for `BigUInt` in `todo.md` may share this root and should be checked +together. + +**T-D4 — reciprocal / Barrett divide. Deferred.** Not worth it before +Toom-3 (Lesson 9). + +### to_string, 50–1000 digits (3.4× py) + +The 1- and 2-word fast paths and the D&C path above 128 words are done. The +50–1000-digit band still runs the O(n²) simple path of repeated division by +10^9, with the `InlineArray` chunked emit already in place. + +**T-T1 — lower the D&C entry threshold** once divide is cheaper (D&C is +gated on divide cost). Re-measure entry = 64 / 96. + +**T-T2 — wider radix per chunk.** Batching the repeated `/10^9` into a +larger radix only helps if it avoids the software-emulated 128-bit divide. +PR4d rejected 10^18 chunks for exactly this reason; re-verify on current +hardware before trying again. + +### multiply (1.2× py, 1.6× rs) + +decimo stops at Karatsuba; num-bigint adds Toom-3, which is what makes it +1.6× faster on large operands. 
+ +**T-M1 — Toom-3 multiplication.** Add a Toom-3 tier above ~256 words, the +same cutoff ratio used in BigDecimal / BigUInt. + +**T-M2 — SIMD partial-product accumulation** in the schoolbook base case +(NEON 4×UInt32), which is the base for both Karatsuba and a future Toom-3. +This is the base-2^32 analogue of the BigDecimal multiply win; note that +the base-10^9 Comba trick itself does not transfer (Lesson 2). + +### power (2.1× py) and add (1.5× py) + +**T-P1 — power inner loop.** General (non-2^N) power pays for a fresh +temporary on every multiply. Route the loop through `multiply_inplace` and +add a dedicated `square()` that exploits symmetry, roughly half the partial +products. The 2^N shift fast path is already excellent; Rust loses to it. + +**T-A1 — add/sub dispatch.** SIMD add is already in place, so the +small-operand gap is dispatch, not the kernel. Put the same-length +same-sign case first (Lesson 5) and check for any stray +`debug_assert .format` (Lesson 4). + +### from_string and shift + +**T-F1 — from_string base conversion.** The 50–10000-digit band runs an +O(n²) base-10 → base-2^32 conversion. Lower the D&C entry threshold once +multiply is faster; the 20000-digit-and-up gap only closes with Toom-3 +(T-M1). + +**T-SH1 — shift allocation.** Extreme shifts such as `1 << 100000` are +allocation-bound. Pre-size the result buffer with +`resize(unsafe_uninit_length=…)` (O(1) capacity plus memset) instead of +letting it grow. + +### A bigger bet: base-2^64 limbs (unproven, not the first lever) + +num-bigint stores base-2^64 limbs on 64-bit targets; decimo stores +base-2^32. For the same number num-bigint holds half the limbs, so its +schoolbook multiply and Knuth-D base case run over half the words. This is +worth keeping in mind for the large-operand multiply and from_string cases. + +It is not why decimo trails today, and I want to be clear about that. 
sqrt +is multiply- and divide-heavy yet already sits at parity with Rust (1.0×), +and Python beats decimo at divide with even narrower limbs. So I treat a +wider limb as a later, large bet, and only after the per-call overhead +above is gone. + +Feasibility (probed 2026-06-19, Mojo v1.0.0b1). The Rust `cfg_digit!` idea +ports. Mojo rejects a ternary on the types themselves +(`UInt64 if is_64bit() else UInt32`), but a ternary on `DType` values is +accepted, so one comptime block selects the limb per target: + +```mojo +comptime BASE_DT: DType = DType.uint64 if is_64bit() else DType.uint32 +comptime DOUBLE_DT: DType = DType.uint128 if is_64bit() else DType.uint64 +comptime BigBase = Scalar[BASE_DT] # UInt64 on 64-bit +comptime DoubleBigBase = Scalar[DOUBLE_DT] # UInt128 on 64-bit +comptime BITS: Int = 64 if is_64bit() else 32 +``` + +`UInt128` `*`, `//`, `%`, `>>`, `&` all compute correctly; the `u128 ÷ u64` +divide is software-emulated on arm64 but gives the right answer, same as +num-bigint. The aliasing is trivial. The migration is not: base-2^32 is +hard-coded across `src/decimo/bigint/` — the `List[UInt32]` field and every +signature, the `1 << 32` / `0xFFFF_FFFF` / `>> 32` literals, the 4×UInt32 +NEON width, `_count_leading_zeros`, the base-10 ↔ base-2^k chunking in +`from_string` / `to_string` (9 vs 19 digits per limb, the hard part), and +`BigInt10` bit-layout interop. If I do it, I will first introduce +`BigBase` / `DoubleBigBase` / `BITS` / `BASE` / `MASK` and replace every +literal while keeping the limb at uint32, a pure and testable refactor with +no behaviour change, then flip to uint64 and fix the base-conversion and +SIMD fallout behind the test suite. + +**T-W1 — base-2^64 limbs. 
Open, low priority, unproven.** + +### Plan + +| Label | Item | Status | +| ----- | ------------------------------------------------ | ------------------------------------- | +| T-D1 | Remove redundant `.copy()` / | OPEN — the floor_divide outlier (P0) | +| | normalise allocs in divide | | +| T-D2 | Hoist Knuth-D inner loop; | OPEN — Lesson 7 (two buffers) | +| | branchless offset-carry | | +| T-D3 | Re-tune `CUTOFF_BURNIKEL_ZIEGLER` | OPEN — pair with the BigUInt todo | +| T-T1 | Lower to_string D&C entry threshold | OPEN — after T-D1 / T-D2 | +| T-M1 | Toom-3 multiply above ~256 words | OPEN — unblocks multiply, from_string | +| T-M2 | SIMD partial-product accumulation in school base | OPEN | +| T-P1 | `square()` plus inplace loop for power | OPEN | +| T-A1 | add/sub dispatch reorder | OPEN | +| T-SH1 | Pre-size the shift result buffer | OPEN | +| T-W1 | Base-2^64 limbs | OPEN — unproven, low priority | +| T-D4 | Reciprocal-Newton divide | DEFERRED — needs Toom-3 (Lesson 9) | diff --git a/pixi.toml b/pixi.toml index f59fe203..8de97ec2 100644 --- a/pixi.toml +++ b/pixi.toml @@ -64,7 +64,7 @@ tf = "clear && pixi run testfloat" # quick build & run for a single BigFloat-using .mojo file # Usage: pixi run bf path/to/file.mojo -bf = { cmd = "bash examples/run_bigfloat.sh", description = "Build & run a single BigFloat .mojo file (output binary in temp/)" } +bf = {cmd = "bash examples/run_bigfloat.sh", description = "Build & run a single BigFloat .mojo file (output binary in temp/)"} # bench bench = "pixi run package && bash benches/run_bench.sh" @@ -73,8 +73,8 @@ bench = "pixi run package && bash benches/run_bench.sh" bdec_debug = """clear && pixi run package && cd benches/bigdecimal \ &&pixi run mojo run -I ../ -D ASSERT=all bench.mojo && cd ../.. \ &&pixi run clean""" -bint_debug = """clear && pixi run package && cd benches/bigint \ -&&pixi run mojo run -I ../ -D ASSERT=all bench.mojo && cd ../.. 
\ +bint_debug = """clear && pixi run package && cd benches/bigint/mojo \ +&&pixi run mojo run -I ../../../src -D ASSERT=all bench.mojo --op add && cd ../../.. \ &&pixi run clean""" buint_debug = """clear && pixi run package && cd benches/biguint \ &&pixi run mojo run -I ../ -D ASSERT=all bench.mojo && cd ../.. \