diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index d3d8a03..fb004e3 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -29,12 +29,37 @@ permissions: jobs: benchmark: - name: "Benchmark" + name: "Benchmark (${{ matrix.shard }})" runs-on: ubuntu-latest - timeout-minutes: 15 + timeout-minutes: 5 permissions: contents: read id-token: write + strategy: + fail-fast: false + matrix: + include: + # CI tests only (not slow) for fast PR feedback + # Slow tests excluded - they take ~24s each vs ~7s for CI tests + - shard: "load-reload" + filter: "TestBenchLoad or TestBenchReload" + marker: "benchmark and not slow" + - shard: "get" + filter: "TestBenchGet" + marker: "benchmark and not slow" + - shard: "find" + filter: "TestBenchFind and not TestBenchFindOne" + marker: "benchmark and not slow" + - shard: "find-one-delete" + filter: "TestBenchFindOne or TestBenchDelete" + - shard: "write-compact" + filter: "TestBenchPut or TestBenchBatchWrite or TestBenchCompact" + - shard: "all-keys-items" + filter: "TestBenchAll or TestBenchKeys or TestBenchItems" + marker: "benchmark and not slow" + - shard: "count-has" + filter: "TestBenchCount or TestBenchHas" + marker: "benchmark and not slow" steps: - name: Checkout code @@ -50,4 +75,4 @@ jobs: uses: CodSpeedHQ/action@346a2d8a8d9d38909abd0bc3d23f773110f076ad # v4.4.1 with: mode: simulation - run: uv run pytest -m benchmark --codspeed + run: uv run pytest -m "${{ matrix.marker || 'benchmark' }}" -k "${{ matrix.filter }}" --codspeed diff --git a/pyproject.toml b/pyproject.toml index d52857c..a563e89 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "uv_build" [project] name = "jsonlt-python" -version = "0.1.0a2" +version = "0.1.0a3" description = "Reference implementation of the JSONLT (JSON Lines Table) specification for Python." readme = "README.md" license = "MIT" @@ -99,6 +99,7 @@ markers = [ "conformance: mark a test as a conformance test.", "example: mark a test as a documentation example test.", "fuzz: mark a test as a fuzz test.", + "limit_memory: mark a test with memory limit (pytest-memray marker).", "slow: mark a test as slow (excluded by default, run with -m slow).", "integration: mark a test as an integration test.", "property: mark a test as a property test.", diff --git a/tests/benchmarks/_generators.py b/tests/benchmarks/_generators.py new file mode 100644 index 0000000..bcadf2f --- /dev/null +++ b/tests/benchmarks/_generators.py @@ -0,0 +1,427 @@ +"""Benchmark data generators for JSONLT benchmarks. + +This module provides deterministic data generation functions for benchmark +tests. All generators use seeded random instances for reproducibility. +""" + +import random +from typing import TYPE_CHECKING, Literal + +from jsonlt import Table +from jsonlt._header import Header, serialize_header +from jsonlt._json import JSONObject, serialize_json + +if TYPE_CHECKING: + from pathlib import Path + + from jsonlt._keys import Key, KeySpecifier + + +def generate_key( + key_type: Literal["string", "integer", "tuple"], + index: int, +) -> "Key": + """Generate a deterministic key for a given index. + + Args: + key_type: The type of key to generate. + index: The index used to generate the key value. + + Returns: + A key of the specified type. 
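+
+    Example:
+        The mapping is fixed by the format strings above, for instance:
+
+        >>> generate_key("string", 7)
+        'key_00000007'
+        >>> generate_key("tuple", 12)
+        ('org_2', 12)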
+ """ + if key_type == "string": + return f"key_{index:08d}" + if key_type == "integer": + return index + # tuple: distribute across 10 orgs + org_index = index % 10 + return (f"org_{org_index}", index) + + +def get_key_specifier( + key_type: Literal["string", "integer", "tuple"], +) -> "KeySpecifier": + """Get the key specifier for a given key type. + + Args: + key_type: The type of key. + + Returns: + The corresponding key specifier. + """ + if key_type == "tuple": + return ("org", "seq") + return "id" + + +def _generate_small_record( + key_type: Literal["string", "integer", "tuple"], + index: int, + rng: random.Random, +) -> JSONObject: + """Generate a small record (~5 fields). + + Args: + key_type: The type of key to use. + index: The index for deterministic key generation. + rng: Random instance for deterministic value generation. + + Returns: + A JSONObject with ~5 fields. + """ + record: JSONObject = {} + + # Add key field(s) + if key_type == "tuple": + org_index = index % 10 + record["org"] = f"org_{org_index}" + record["seq"] = index + elif key_type == "integer": + record["id"] = index + else: + record["id"] = f"key_{index:08d}" + + # Add additional fields + record["name"] = f"Record {index}" + record["active"] = rng.choice([True, False]) + record["count"] = rng.randint(0, 10000) + record["score"] = round(rng.uniform(0.0, 100.0), 2) + + return record + + +def _generate_medium_record( + key_type: Literal["string", "integer", "tuple"], + index: int, + rng: random.Random, +) -> JSONObject: + """Generate a medium record (~20 fields). + + Args: + key_type: The type of key to use. + index: The index for deterministic key generation. + rng: Random instance for deterministic value generation. + + Returns: + A JSONObject with ~20 fields. + """ + record = _generate_small_record(key_type, index, rng) + + # Add more fields to reach ~20 total + record["description"] = f"This is a detailed description for record {index}." + record["tags"] = [f"tag_{rng.randint(1, 100)}" for _ in range(5)] + record["address"] = { + "street": f"{rng.randint(1, 9999)} Main St", + "city": rng.choice( + ["New York", "Los Angeles", "Chicago", "Houston", "Phoenix"] + ), + "state": rng.choice(["NY", "CA", "IL", "TX", "AZ"]), + "zip": f"{rng.randint(10000, 99999)}", + } + record["created_at"] = f"2024-{rng.randint(1, 12):02d}-{rng.randint(1, 28):02d}" + record["updated_at"] = f"2024-{rng.randint(1, 12):02d}-{rng.randint(1, 28):02d}" + record["priority"] = rng.randint(1, 5) + record["category"] = rng.choice(["A", "B", "C", "D", "E"]) + record["status"] = rng.choice(["pending", "active", "completed", "archived"]) + record["version"] = rng.randint(1, 100) + record["weight"] = round(rng.uniform(0.1, 100.0), 3) + record["rating"] = round(rng.uniform(1.0, 5.0), 1) + record["views"] = rng.randint(0, 1000000) + record["likes"] = rng.randint(0, 100000) + + return record + + +def _generate_large_record( + key_type: Literal["string", "integer", "tuple"], + index: int, + rng: random.Random, +) -> JSONObject: + """Generate a large record (~100 fields with 1KB+ text blobs). + + Args: + key_type: The type of key to use. + index: The index for deterministic key generation. + rng: Random instance for deterministic value generation. + + Returns: + A JSONObject with ~100 fields including large text blobs. 
+ """ + record = _generate_medium_record(key_type, index, rng) + + # Generate large text blobs (1KB+ each) + words = [ + "lorem", + "ipsum", + "dolor", + "sit", + "amet", + "consectetur", + "adipiscing", + "elit", + "sed", + "do", + "eiusmod", + "tempor", + "incididunt", + "ut", + "labore", + "et", + "dolore", + "magna", + "aliqua", + "enim", + ] + + def generate_blob(min_chars: int) -> str: + result: list[str] = [] + current_len = 0 + while current_len < min_chars: + word = rng.choice(words) + result.append(word) + current_len += len(word) + 1 + return " ".join(result) + + # Add large text blobs (these make up most of the record size) + record["long_description"] = generate_blob(1024) + record["notes"] = generate_blob(1024) + record["content"] = generate_blob(2048) + + # Add many additional fields to reach ~100 total + for i in range(80): + field_name = f"field_{i:02d}" + field_type = i % 5 + if field_type == 0: + record[field_name] = f"value_{rng.randint(1, 10000)}" + elif field_type == 1: + record[field_name] = rng.randint(0, 1000000) + elif field_type == 2: + record[field_name] = round(rng.uniform(0.0, 1000.0), 4) + elif field_type == 3: + record[field_name] = rng.choice([True, False]) + else: + record[field_name] = [rng.randint(1, 100) for _ in range(3)] + + return record + + +def generate_record( + key_type: Literal["string", "integer", "tuple"], + size: Literal["small", "medium", "large"], + index: int, + *, + seed: int = 42, +) -> JSONObject: + """Generate a single deterministic record. + + Args: + key_type: The type of key to use ("string", "integer", or "tuple"). + size: The size of record to generate ("small", "medium", or "large"). + index: The index for deterministic generation. + seed: Random seed for reproducibility. + + Returns: + A JSONObject of the specified size and key type. + """ + rng = random.Random(seed + index) # noqa: S311 + + if size == "small": + return _generate_small_record(key_type, index, rng) + if size == "medium": + return _generate_medium_record(key_type, index, rng) + return _generate_large_record(key_type, index, rng) + + +def generate_records( + key_type: Literal["string", "integer", "tuple"], + size: Literal["small", "medium", "large"], + count: int, + *, + seed: int = 42, +) -> list[JSONObject]: + """Generate a list of deterministic records. + + Args: + key_type: The type of key to use ("string", "integer", or "tuple"). + size: The size of records to generate ("small", "medium", or "large"). + count: Number of records to generate. + seed: Random seed for reproducibility. + + Returns: + A list of JSONObjects of the specified size and key type. + """ + return [generate_record(key_type, size, i, seed=seed) for i in range(count)] + + +def write_table_file( + path: "Path", + records: list[JSONObject], + key_specifier: "KeySpecifier", +) -> None: + """Write records to a JSONLT file. + + This creates a new file with a header and all records serialized. + + Args: + path: Path to write the file to. + records: List of records to write. + key_specifier: The key specifier for the table. 
+ """ + lines: list[str] = [] + + # Add header + header = Header(version=1, key=key_specifier) + lines.append(serialize_header(header)) + + # Add records + lines.extend(serialize_json(record) for record in records) + + # Write to file + content = "\n".join(lines) + "\n" + _ = path.write_text(content, encoding="utf-8") + + +def create_test_table( + tmp_path: "Path", + key_type: Literal["string", "integer", "tuple"], + record_size: Literal["small", "medium", "large"], + scale: int, + *, + auto_reload: bool = False, +) -> Table: + """Create a test table with generated records. + + Args: + tmp_path: pytest tmp_path fixture. + key_type: Type of keys to generate. + record_size: Size of records to generate. + scale: Number of records to generate. + auto_reload: Whether to enable auto-reload on the table. + + Returns: + A Table instance with the generated records. + """ + records = generate_records(key_type, record_size, scale) + key_spec = get_key_specifier(key_type) + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + return Table(file_path, key=key_spec, auto_reload=auto_reload) + + +def add_history_to_table( + table: Table, + key_type: Literal["string", "integer", "tuple"], + record_size: Literal["small", "medium", "large"], + count: int, +) -> None: + """Add update history to a table. + + Args: + table: The table to add history to. + key_type: Type of keys. + record_size: Size of records. + count: Number of updates to add. + """ + for i in range(count): + updated_record = generate_record(key_type, record_size, i, seed=99) + table.put(updated_record) + + +def create_extended_test_table( # noqa: PLR0913 + tmp_path: "Path", + key_type: Literal["string", "integer", "tuple"], + record_size: Literal["small", "medium", "large"], + base_scale: int, + extra_keys: int, + *, + auto_reload: bool = False, +) -> Table: + """Create a test table with additional keys beyond base scale. + + The table contains keys from index 0 to (base_scale + extra_keys - 1). + Use this for delete benchmarks where each iteration needs a unique key. + + Args: + tmp_path: pytest tmp_path fixture. + key_type: Type of keys to generate. + record_size: Size of records to generate. + base_scale: Base number of records. + extra_keys: Additional keys for benchmark iterations. + auto_reload: Whether to enable auto-reload on the table. + + Returns: + A Table instance with the generated records. + """ + total_count = base_scale + extra_keys + records = generate_records(key_type, record_size, total_count) + key_spec = get_key_specifier(key_type) + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + return Table(file_path, key=key_spec, auto_reload=auto_reload) + + +def create_table_with_history( # noqa: PLR0913 + tmp_path: "Path", + key_type: Literal["string", "integer", "tuple"], + record_size: Literal["small", "medium", "large"], + scale: int, + history_count: int, + *, + auto_reload: bool = False, +) -> Table: + """Create a table pre-populated with update history. + + Creates a table with `scale` records, then applies `history_count` + updates to existing records. Useful for compact benchmarks. + + Args: + tmp_path: pytest tmp_path fixture. + key_type: Type of keys to generate. + record_size: Size of records to generate. + scale: Number of records to generate. + history_count: Number of updates to apply. + auto_reload: Whether to enable auto-reload on the table. + + Returns: + A Table instance with the generated records and history. 
+ """ + table = create_test_table( + tmp_path, key_type, record_size, scale, auto_reload=auto_reload + ) + add_history_to_table(table, key_type, record_size, history_count) + return table + + +def create_table_with_tombstones( # noqa: PLR0913 + tmp_path: "Path", + key_type: Literal["string", "integer", "tuple"], + record_size: Literal["small", "medium", "large"], + scale: int, + tombstone_count: int, + *, + auto_reload: bool = False, +) -> Table: + """Create a table pre-populated with tombstones. + + Creates a table with `scale` records, then deletes `tombstone_count` + records starting from index 0. Useful for compact benchmarks. + + Args: + tmp_path: pytest tmp_path fixture. + key_type: Type of keys to generate. + record_size: Size of records to generate. + scale: Number of records to generate. + tombstone_count: Number of records to delete. + auto_reload: Whether to enable auto-reload on the table. + + Returns: + A Table instance with tombstones. + """ + table = create_test_table( + tmp_path, key_type, record_size, scale, auto_reload=auto_reload + ) + for i in range(tombstone_count): + key = generate_key(key_type, i) + _ = table.delete(key) + return table diff --git a/tests/benchmarks/test_bench_memory.py b/tests/benchmarks/test_bench_memory.py new file mode 100644 index 0000000..034f2e0 --- /dev/null +++ b/tests/benchmarks/test_bench_memory.py @@ -0,0 +1,246 @@ +"""Memory profiling benchmarks for JSONLT. + +This module contains memory usage benchmarks using pytest-memray to ensure +memory consumption stays within expected bounds. +""" + +import sys +from typing import TYPE_CHECKING + +import pytest + +from jsonlt import Table + +from ._generators import generate_records, get_key_specifier, write_table_file + +if TYPE_CHECKING: + from pathlib import Path + + from jsonlt._json import JSONObject + +# Skip entire module on Windows (memray not available) +pytestmark = pytest.mark.skipif( + sys.platform == "win32", + reason="pytest-memray not available on Windows", +) + + +class TestMemoryLoad: + """Memory benchmarks for loading tables.""" + + @pytest.mark.limit_memory("10 MB") + def test_load_1k_small_records(self, tmp_path: "Path") -> None: + records = generate_records("string", "small", 1000) + key_spec = get_key_specifier("string") + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + + _ = Table(file_path, key=key_spec, auto_reload=False) + + @pytest.mark.limit_memory("50 MB") + @pytest.mark.slow + def test_load_10k_small_records(self, tmp_path: "Path") -> None: + records = generate_records("string", "small", 10000) + key_spec = get_key_specifier("string") + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + + _ = Table(file_path, key=key_spec, auto_reload=False) + + @pytest.mark.limit_memory("500 MB") + @pytest.mark.slow + def test_load_100k_small_records(self, tmp_path: "Path") -> None: + records = generate_records("string", "small", 100000) + key_spec = get_key_specifier("string") + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + + _ = Table(file_path, key=key_spec, auto_reload=False) + + @pytest.mark.limit_memory("20 MB") + def test_load_1k_medium_records(self, tmp_path: "Path") -> None: + records = generate_records("string", "medium", 1000) + key_spec = get_key_specifier("string") + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + + _ = Table(file_path, key=key_spec, auto_reload=False) + + @pytest.mark.limit_memory("100 MB") + 
@pytest.mark.slow + def test_load_10k_medium_records(self, tmp_path: "Path") -> None: + records = generate_records("string", "medium", 10000) + key_spec = get_key_specifier("string") + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + + _ = Table(file_path, key=key_spec, auto_reload=False) + + @pytest.mark.limit_memory("100 MB") + @pytest.mark.slow + def test_load_1k_large_records(self, tmp_path: "Path") -> None: + records = generate_records("string", "large", 1000) + key_spec = get_key_specifier("string") + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + + _ = Table(file_path, key=key_spec, auto_reload=False) + + +class TestMemoryLoadKeyTypes: + """Memory benchmarks for loading with different key types.""" + + @pytest.mark.limit_memory("10 MB") + def test_load_1k_integer_keys(self, tmp_path: "Path") -> None: + records = generate_records("integer", "small", 1000) + key_spec = get_key_specifier("integer") + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + + _ = Table(file_path, key=key_spec, auto_reload=False) + + @pytest.mark.limit_memory("10 MB") + def test_load_1k_tuple_keys(self, tmp_path: "Path") -> None: + records = generate_records("tuple", "small", 1000) + key_spec = get_key_specifier("tuple") + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + + _ = Table(file_path, key=key_spec, auto_reload=False) + + @pytest.mark.limit_memory("50 MB") + @pytest.mark.slow + def test_load_10k_integer_keys(self, tmp_path: "Path") -> None: + records = generate_records("integer", "small", 10000) + key_spec = get_key_specifier("integer") + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + + _ = Table(file_path, key=key_spec, auto_reload=False) + + @pytest.mark.limit_memory("50 MB") + @pytest.mark.slow + def test_load_10k_tuple_keys(self, tmp_path: "Path") -> None: + records = generate_records("tuple", "small", 10000) + key_spec = get_key_specifier("tuple") + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + + _ = Table(file_path, key=key_spec, auto_reload=False) + + +class TestMemoryRead: + """Memory benchmarks for read operations.""" + + @pytest.mark.limit_memory("15 MB") + def test_all_1k_records(self, tmp_path: "Path") -> None: + records = generate_records("string", "small", 1000) + key_spec = get_key_specifier("string") + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + + table = Table(file_path, key=key_spec, auto_reload=False) + _ = table.all() + + @pytest.mark.limit_memory("75 MB") + @pytest.mark.slow + def test_all_10k_records(self, tmp_path: "Path") -> None: + records = generate_records("string", "small", 10000) + key_spec = get_key_specifier("string") + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + + table = Table(file_path, key=key_spec, auto_reload=False) + _ = table.all() + + @pytest.mark.limit_memory("15 MB") + def test_find_1k_records(self, tmp_path: "Path") -> None: + records = generate_records("string", "small", 1000) + key_spec = get_key_specifier("string") + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + + table = Table(file_path, key=key_spec, auto_reload=False) + _ = table.find(lambda r: r.get("active") is True) + + @pytest.mark.limit_memory("15 MB") + def test_keys_1k_records(self, tmp_path: "Path") -> None: + records = 
generate_records("string", "small", 1000) + key_spec = get_key_specifier("string") + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + + table = Table(file_path, key=key_spec, auto_reload=False) + _ = table.keys() + + +class TestMemoryWrite: + """Memory benchmarks for write operations.""" + + @pytest.mark.limit_memory("15 MB") + def test_put_to_1k_table(self, tmp_path: "Path") -> None: + records = generate_records("string", "small", 1000) + key_spec = get_key_specifier("string") + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + + table = Table(file_path, key=key_spec, auto_reload=False) + new_record: JSONObject = { + "id": "new_key", + "name": "New Record", + "active": True, + "count": 1, + "score": 1.0, + } + table.put(new_record) + + @pytest.mark.limit_memory("15 MB") + def test_delete_from_1k_table(self, tmp_path: "Path") -> None: + records = generate_records("string", "small", 1000) + key_spec = get_key_specifier("string") + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + + table = Table(file_path, key=key_spec, auto_reload=False) + _ = table.delete("key_00000000") + + @pytest.mark.limit_memory("20 MB") + def test_compact_1k_table(self, tmp_path: "Path") -> None: + records = generate_records("string", "small", 1000) + key_spec = get_key_specifier("string") + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + + table = Table(file_path, key=key_spec, auto_reload=False) + # Add some updates to create history + for i in range(100): + updated: JSONObject = { + "id": f"key_{i:08d}", + "name": f"Updated {i}", + "active": True, + "count": i, + "score": float(i), + } + table.put(updated) + table.compact() + + @pytest.mark.limit_memory("100 MB") + @pytest.mark.slow + def test_compact_10k_table(self, tmp_path: "Path") -> None: + records = generate_records("string", "small", 10000) + key_spec = get_key_specifier("string") + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + + table = Table(file_path, key=key_spec, auto_reload=False) + # Add some updates to create history + for i in range(1000): + updated: JSONObject = { + "id": f"key_{i:08d}", + "name": f"Updated {i}", + "active": True, + "count": i, + "score": float(i), + } + table.put(updated) + table.compact() diff --git a/tests/benchmarks/test_bench_table.py b/tests/benchmarks/test_bench_table.py new file mode 100644 index 0000000..b649aec --- /dev/null +++ b/tests/benchmarks/test_bench_table.py @@ -0,0 +1,661 @@ +"""Benchmarks for Table operations. + +This module contains performance benchmarks for all Table operations, +organized by operation type and parametrized by record size, key type, +and scale. 
+""" + +from typing import TYPE_CHECKING, Literal + +import pytest + +from jsonlt import Table + +from ._generators import ( + create_extended_test_table, + create_table_with_history, + create_table_with_tombstones, + create_test_table, + generate_key, + generate_record, + generate_records, + get_key_specifier, + write_table_file, +) + +if TYPE_CHECKING: + from pathlib import Path + + from pytest_codspeed.plugin import BenchmarkFixture + + from jsonlt._json import JSONObject + + +# Type aliases for parametrization +RecordSize = Literal["small", "medium", "large"] +KeyType = Literal["string", "integer", "tuple"] + +# Scale and size parameters for CI (fast benchmarks) +CI_PARAMS: list[object] = [ + pytest.param("small", "string", 100, id="small-str-100"), + pytest.param("small", "string", 1000, id="small-str-1k"), + pytest.param("small", "integer", 100, id="small-int-100"), + pytest.param("small", "integer", 1000, id="small-int-1k"), + pytest.param("small", "tuple", 100, id="small-tuple-100"), + pytest.param("small", "tuple", 1000, id="small-tuple-1k"), +] + +# Larger scale parameters (marked slow) +SLOW_PARAMS: list[object] = [ + pytest.param("small", "string", 10000, id="small-str-10k", marks=pytest.mark.slow), + pytest.param( + "small", "string", 100000, id="small-str-100k", marks=pytest.mark.slow + ), + pytest.param("medium", "string", 1000, id="med-str-1k", marks=pytest.mark.slow), + pytest.param("medium", "string", 10000, id="med-str-10k", marks=pytest.mark.slow), + pytest.param("large", "string", 1000, id="large-str-1k", marks=pytest.mark.slow), + pytest.param("small", "integer", 10000, id="small-int-10k", marks=pytest.mark.slow), + pytest.param( + "small", "integer", 100000, id="small-int-100k", marks=pytest.mark.slow + ), + pytest.param("small", "tuple", 10000, id="small-tuple-10k", marks=pytest.mark.slow), + pytest.param( + "small", "tuple", 100000, id="small-tuple-100k", marks=pytest.mark.slow + ), + pytest.param("medium", "integer", 1000, id="med-int-1k", marks=pytest.mark.slow), + pytest.param("medium", "tuple", 1000, id="med-tuple-1k", marks=pytest.mark.slow), + pytest.param("large", "integer", 1000, id="large-int-1k", marks=pytest.mark.slow), + pytest.param("large", "tuple", 1000, id="large-tuple-1k", marks=pytest.mark.slow), +] + +# Edge case parameters for boundary testing +EDGE_PARAMS: list[object] = [ + pytest.param("small", "string", 0, id="small-str-0"), + pytest.param("small", "string", 1, id="small-str-1"), + pytest.param("small", "integer", 0, id="small-int-0"), + pytest.param("small", "integer", 1, id="small-int-1"), + pytest.param("small", "tuple", 0, id="small-tuple-0"), + pytest.param("small", "tuple", 1, id="small-tuple-1"), +] + +ALL_PARAMS: list[object] = CI_PARAMS + SLOW_PARAMS +ALL_WITH_EDGE_PARAMS: list[object] = ALL_PARAMS + EDGE_PARAMS + +# Buffer size for delete benchmarks to ensure unique keys per iteration +DELETE_ITERATION_BUFFER: int = 10000 + + +class TestBenchLoad: + """Benchmarks for Table constructor loading pre-existing files.""" + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), ALL_PARAMS) + def test_load( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + records = generate_records(key_type, record_size, scale) + key_spec = get_key_specifier(key_type) + file_path = tmp_path / "bench.jsonlt" + write_table_file(file_path, records, key_spec) + + def load_table() -> None: + _ = Table(file_path, key=key_spec, auto_reload=False) + + 
benchmark(load_table) + + +class TestBenchReload: + """Benchmarks for table.reload() operations.""" + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), ALL_PARAMS) + def test_reload( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + """Force reload table from disk.""" + table = create_test_table(tmp_path, key_type, record_size, scale) + + def reload_table() -> None: + table.reload() + + benchmark(reload_table) + + +class TestBenchGet: + """Benchmarks for table.get() operations.""" + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), ALL_PARAMS) + def test_get_existing_key( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + table = create_test_table(tmp_path, key_type, record_size, scale) + + # Get key from middle of dataset + middle_index = scale // 2 + target_key = generate_key(key_type, middle_index) + + def get_record() -> None: + _ = table.get(target_key) + + benchmark(get_record) + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), CI_PARAMS) + def test_get_nonexistent_key( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + table = create_test_table(tmp_path, key_type, record_size, scale) + + # Generate a key that doesn't exist + nonexistent_key = generate_key(key_type, scale + 1000) + + def get_missing() -> None: + _ = table.get(nonexistent_key) + + benchmark(get_missing) + + +class TestBenchAll: + """Benchmarks for table.all() returning sorted records.""" + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), ALL_WITH_EDGE_PARAMS) + def test_all( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + table = create_test_table(tmp_path, key_type, record_size, scale) + + def get_all() -> None: + # Invalidate cache to measure full sort + table._cached_sorted_keys = None # noqa: SLF001 # pyright: ignore[reportPrivateUsage] + _ = table.all() + + benchmark(get_all) + + +class TestBenchFind: + """Benchmarks for table.find() with various selectivity.""" + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), ALL_PARAMS) + def test_find_high_selectivity( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + table = create_test_table(tmp_path, key_type, record_size, scale) + + # High selectivity: match ~10% of records (count > 9000) + def predicate_high_count(r: "JSONObject") -> bool: + count = r.get("count", 0) + return isinstance(count, int) and count > 9000 + + def find_high_count() -> None: + _ = table.find(predicate_high_count) + + benchmark(find_high_count) + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), CI_PARAMS) + def test_find_low_selectivity( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + table = create_test_table(tmp_path, key_type, record_size, scale) + + # Low selectivity: match ~90% of records (count < 9000) + def predicate_low_count(r: "JSONObject") -> bool: + count = r.get("count", 0) + return isinstance(count, int) and count < 9000 + + def find_low_count() -> None: + _ = table.find(predicate_low_count) + + benchmark(find_low_count) + + @pytest.mark.parametrize(("record_size", 
"key_type", "scale"), CI_PARAMS) + def test_find_very_high_selectivity( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + """Find with ~1% selectivity (matches few records).""" + table = create_test_table(tmp_path, key_type, record_size, scale) + + # Very high selectivity: match ~1% of records (count > 9900) + def predicate_very_selective(r: "JSONObject") -> bool: + count = r.get("count", 0) + return isinstance(count, int) and count > 9900 + + def find_very_selective() -> None: + _ = table.find(predicate_very_selective) + + benchmark(find_very_selective) + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), CI_PARAMS) + def test_find_all_records( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + """Find with 100% selectivity (matches all records).""" + table = create_test_table(tmp_path, key_type, record_size, scale) + + def predicate_all(_r: "JSONObject") -> bool: + return True + + def find_all() -> None: + _ = table.find(predicate_all) + + benchmark(find_all) + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), CI_PARAMS) + def test_find_with_limit( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + table = create_test_table(tmp_path, key_type, record_size, scale) + + # Find with limit - should short-circuit early + def predicate_active(r: "JSONObject") -> bool: + return r.get("active") is True + + def find_limited() -> None: + _ = table.find(predicate_active, limit=10) + + benchmark(find_limited) + + +class TestBenchFindOne: + """Benchmarks for table.find_one() operations.""" + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), CI_PARAMS) + def test_find_one_match_early( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + """Find first record matching predicate (best case).""" + table = create_test_table(tmp_path, key_type, record_size, scale) + + def predicate_any(_r: "JSONObject") -> bool: + return True + + def find_first() -> None: + _ = table.find_one(predicate_any) + + benchmark(find_first) + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), CI_PARAMS) + def test_find_one_match_late( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + """Find record with predicate matching late in dataset.""" + table = create_test_table(tmp_path, key_type, record_size, scale) + + # Match only high count values (~1% of records) + def predicate_high_count(r: "JSONObject") -> bool: + count = r.get("count", 0) + return isinstance(count, int) and count > 9900 + + def find_late() -> None: + _ = table.find_one(predicate_high_count) + + benchmark(find_late) + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), CI_PARAMS) + def test_find_one_no_match( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + """Find with predicate that matches nothing (full scan).""" + table = create_test_table(tmp_path, key_type, record_size, scale) + + def predicate_never(_r: "JSONObject") -> bool: + return False + + def find_none() -> None: + _ = table.find_one(predicate_never) + + benchmark(find_none) + + +class TestBenchPut: + """Benchmarks for 
table.put() operations.""" + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), CI_PARAMS) + def test_put_new_record( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + table = create_test_table(tmp_path, key_type, record_size, scale) + + # Generate new records to put (beyond existing range) + new_record_index = scale + 1 + counter = [new_record_index] + + def put_new() -> None: + new_record = generate_record(key_type, record_size, counter[0], seed=42) + table.put(new_record) + counter[0] += 1 + + benchmark(put_new) + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), CI_PARAMS) + def test_put_update_record( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + table = create_test_table(tmp_path, key_type, record_size, scale) + + # Update existing records (cycling through them) + counter = [0] + + def put_update() -> None: + index = counter[0] % scale + updated_record = generate_record(key_type, record_size, index, seed=99) + table.put(updated_record) + counter[0] += 1 + + benchmark(put_update) + + +class TestBenchBatchWrite: + """Benchmarks for batched write operations.""" + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), CI_PARAMS) + def test_batch_put_10( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + table = create_test_table(tmp_path, key_type, record_size, scale) + + batch_size = 10 + counter = [scale + 1] + + def batch_put() -> None: + start = counter[0] + for i in range(batch_size): + new_record = generate_record(key_type, record_size, start + i, seed=42) + table.put(new_record) + counter[0] += batch_size + + benchmark(batch_put) + + @pytest.mark.parametrize( + ("record_size", "key_type", "scale"), + [ + pytest.param("small", "string", 100, id="small-str-100"), + pytest.param("small", "integer", 100, id="small-int-100"), + ], + ) + def test_batch_put_100( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + table = create_test_table(tmp_path, key_type, record_size, scale) + + batch_size = 100 + counter = [scale + 1] + + def batch_put() -> None: + start = counter[0] + for i in range(batch_size): + new_record = generate_record(key_type, record_size, start + i, seed=42) + table.put(new_record) + counter[0] += batch_size + + benchmark(batch_put) + + +class TestBenchCompact: + """Benchmarks for table.compact() operations. + + These benchmarks measure pure compact() performance by pre-populating + tables with history or tombstones during setup. + + Note: The benchmark fixture runs compact() multiple times. After the first + iteration, the table is already compacted, so subsequent iterations measure + the fast path (compacting a clean table). The reported time is amortized + across all iterations, with the first iteration doing the meaningful work. 
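+
+    A sketch of the resulting pattern (assumed iteration behavior, per the
+    note above):
+
+        table = create_table_with_history(tmp, "string", "small", 1000, 100)
+        table.compact()  # first call rewrites the file, dropping history
+        table.compact()  # later calls take the already-compact fast path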
+ """ + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), CI_PARAMS) + def test_compact_with_history( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + """Compact a table with update history (superseded records).""" + history_count = max(scale // 10, 1) + table = create_table_with_history( + tmp_path, key_type, record_size, scale, history_count + ) + + def compact_only() -> None: + table.compact() + + benchmark(compact_only) + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), CI_PARAMS) + def test_compact_with_tombstones( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + """Compact a table with tombstones (deleted records).""" + tombstone_count = max(scale // 10, 1) + table = create_table_with_tombstones( + tmp_path, key_type, record_size, scale, tombstone_count + ) + + def compact_only() -> None: + table.compact() + + benchmark(compact_only) + + +class TestBenchKeys: + """Benchmarks for table.keys() operations.""" + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), ALL_WITH_EDGE_PARAMS) + def test_keys( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + table = create_test_table(tmp_path, key_type, record_size, scale) + + def get_keys() -> None: + # Invalidate cache to measure full sort + table._cached_sorted_keys = None # noqa: SLF001 # pyright: ignore[reportPrivateUsage] + _ = table.keys() + + benchmark(get_keys) + + +class TestBenchItems: + """Benchmarks for table.items() operations.""" + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), ALL_WITH_EDGE_PARAMS) + def test_items( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + """Get all key-value pairs.""" + table = create_test_table(tmp_path, key_type, record_size, scale) + + def get_items() -> None: + # Invalidate cache to measure full sort + table._cached_sorted_keys = None # noqa: SLF001 # pyright: ignore[reportPrivateUsage] + _ = table.items() + + benchmark(get_items) + + +class TestBenchCount: + """Benchmarks for table.count() operations.""" + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), ALL_WITH_EDGE_PARAMS) + def test_count( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + table = create_test_table(tmp_path, key_type, record_size, scale) + + def count() -> None: + _ = table.count() + + benchmark(count) + + +class TestBenchHas: + """Benchmarks for table.has() operations.""" + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), ALL_WITH_EDGE_PARAMS) + def test_has_existing( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + if scale == 0: + pytest.skip("Cannot test has_existing with scale=0 (no records)") + table = create_test_table(tmp_path, key_type, record_size, scale) + + middle_key = generate_key(key_type, scale // 2) + + def has_key() -> None: + _ = table.has(middle_key) + + benchmark(has_key) + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), CI_PARAMS) + def test_has_nonexistent( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: 
int, + ) -> None: + table = create_test_table(tmp_path, key_type, record_size, scale) + + missing_key = generate_key(key_type, scale + 1000) + + def has_missing() -> None: + _ = table.has(missing_key) + + benchmark(has_missing) + + +class TestBenchDelete: + """Benchmarks for table.delete() operations.""" + + @pytest.mark.parametrize(("record_size", "key_type", "scale"), CI_PARAMS) + def test_delete_existing( + self, + benchmark: "BenchmarkFixture", + tmp_path: "Path", + record_size: RecordSize, + key_type: KeyType, + scale: int, + ) -> None: + """Delete existing records using unique keys per iteration.""" + # Create table with extra keys for benchmark iterations + table = create_extended_test_table( + tmp_path, key_type, record_size, scale, DELETE_ITERATION_BUFFER + ) + + # Counter starts at base scale (first extra key) + counter = [scale] + + def delete_unique() -> None: + key = generate_key(key_type, counter[0]) + _ = table.delete(key) + counter[0] += 1 + + benchmark(delete_unique) diff --git a/uv.lock b/uv.lock index 495801f..d52b255 100644 --- a/uv.lock +++ b/uv.lock @@ -852,7 +852,7 @@ wheels = [ [[package]] name = "jsonlt-python" -version = "0.1.0a2" +version = "0.1.0a3" source = { editable = "." } dependencies = [ { name = "typing-extensions" },