diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..0152fec --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,53 @@ +name: Bug Report +description: Report something that isn't working correctly +labels: ["bug"] +body: + - type: textarea + id: description + attributes: + label: Describe the bug + description: A clear description of what's happening. + validations: + required: true + + - type: textarea + id: steps + attributes: + label: Steps to reproduce + description: Minimal steps or code to reproduce the behavior. + placeholder: | + 1. Create a scheduler with `Scheduler::builder()...` + 2. Submit a task with `...` + 3. See error... + validations: + required: true + + - type: textarea + id: expected + attributes: + label: Expected behavior + description: What you expected to happen. + validations: + required: true + + - type: input + id: version + attributes: + label: Taskmill version + description: The version of taskmill in your Cargo.toml + placeholder: "0.1.1" + validations: + required: true + + - type: input + id: os + attributes: + label: Operating system + placeholder: "Ubuntu 24.04 / macOS 15 / etc." + + - type: textarea + id: logs + attributes: + label: Relevant logs + description: Paste any relevant log output or panic messages. + render: shell diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000..e1acadf --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,31 @@ +name: Feature Request +description: Suggest a new feature or improvement +labels: ["enhancement"] +body: + - type: textarea + id: problem + attributes: + label: Problem or use case + description: What are you trying to do? What problem does this solve? 
+ validations: + required: true + + - type: textarea + id: solution + attributes: + label: Proposed solution + description: How do you think this should work? + validations: + required: true + + - type: textarea + id: alternatives + attributes: + label: Alternatives considered + description: Any other approaches you've thought about or workarounds you're using. + + - type: textarea + id: context + attributes: + label: Additional context + description: Anything else — screenshots, links, related issues. diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..ff53739 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,10 @@ +## Summary + + + +## Test plan + + + +- [ ] `cargo test --all-features` passes +- [ ] `cargo clippy --all-features -- -D warnings` is clean diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..d35056c --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,36 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +env: + CARGO_TERM_COLOR: always + +jobs: + check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install Rust + uses: dtolnay/rust-toolchain@stable + with: + components: rustfmt, clippy + + - name: Cache cargo + uses: Swatinem/rust-cache@v2 + + - name: Build + run: cargo build --all-features + + - name: Run tests + run: cargo test --all-features + + - name: Check formatting + run: cargo fmt --check + + - name: Run clippy + run: cargo clippy --all-features -- -D warnings diff --git a/.github/workflows/pre-release.yml b/.github/workflows/pre-release.yml new file mode 100644 index 0000000..414ab56 --- /dev/null +++ b/.github/workflows/pre-release.yml @@ -0,0 +1,65 @@ +name: Pre-release + +on: + workflow_dispatch: + inputs: + version: + description: "Pre-release version (e.g. 
0.1.0-alpha.1, 0.2.0-rc.1)" + required: true + type: string + +env: + CARGO_TERM_COLOR: always + +jobs: + pre-release: + name: Publish pre-release + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - name: Validate version format + run: | + if ! echo "${{ inputs.version }}" | grep -qP '^\d+\.\d+\.\d+-(alpha|beta|rc)\.\d+$'; then + echo "::error::Invalid version format '${{ inputs.version }}'. Expected: X.Y.Z-(alpha|beta|rc).N" + exit 1 + fi + + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Cache cargo + uses: Swatinem/rust-cache@v2 + + - name: Set version in Cargo.toml + run: sed -i 's/^version = ".*"/version = "${{ inputs.version }}"/' Cargo.toml + + - name: Verify build + run: cargo build --all-features + + - name: Run tests + run: cargo test --all-features + + - name: Publish to crates.io + run: cargo publish --allow-dirty + env: + CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} + + - name: Create git tag + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git tag -a "v${{ inputs.version }}" -m "Pre-release v${{ inputs.version }}" + git push origin "v${{ inputs.version }}" + + - name: Create GitHub pre-release + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh release create "v${{ inputs.version }}" \ + --title "v${{ inputs.version }}" \ + --generate-notes \ + --prerelease diff --git a/.github/workflows/release-plz.yml b/.github/workflows/release-plz.yml new file mode 100644 index 0000000..965d154 --- /dev/null +++ b/.github/workflows/release-plz.yml @@ -0,0 +1,63 @@ +name: Release-plz + +on: + push: + branches: + - main + +jobs: + release-plz-release: + name: Release-plz release + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + persist-credentials: false + 
+ - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Cache cargo + uses: Swatinem/rust-cache@v2 + + - name: Run release-plz + uses: release-plz/action@v0.5 + with: + command: release + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} + + release-plz-pr: + name: Release-plz PR + runs-on: ubuntu-latest + permissions: + contents: write + pull-requests: write + concurrency: + group: release-plz-${{ github.ref }} + cancel-in-progress: false + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + persist-credentials: false + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Cache cargo + uses: Swatinem/rust-cache@v2 + + - name: Run release-plz + uses: release-plz/action@v0.5 + with: + command: release-pr + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..96ef6c0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +Cargo.lock diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..2e70d8c --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,34 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +## [Unreleased] + +## [0.1.1](https://github.com/deepjoy/shoebox/compare/taskmill-v0.1.0...taskmill-v0.1.1) - 2026-03-10 + +### Added + +- add pagination, filtering, query optimization, and trigger-based staleness for duplicates; functional improvements ([#53](https://github.com/deepjoy/shoebox/pull/53)) + +### Fixed + +- *(taskmill)* flush WAL and close database connection on shutdown ([#57](https://github.com/deepjoy/shoebox/pull/57)) + +## [0.1.0](https://github.com/deepjoy/shoebox/releases/tag/taskmill-v0.1.0) - 2026-03-05 + +### Added + +- *(taskmill)* type-keyed state map with post-build injection ([#46](https://github.com/deepjoy/shoebox/pull/46)) +- *(taskmill)* requeue duplicate submissions when task is running ([#45](https://github.com/deepjoy/shoebox/pull/45)) +- *(taskmill)* add adaptive priority task scheduler with IO-aware concurrency ([#38](https://github.com/deepjoy/shoebox/pull/38)) + +### Fixed + +- *(taskmill)* resolve SQLite BUSY errors with proper transaction handling ([#40](https://github.com/deepjoy/shoebox/pull/40)) + +### Other + +- *(taskmill)* separate priority from task payload, upgrade on dedup ([#44](https://github.com/deepjoy/shoebox/pull/44)) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..788f377 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,61 @@ +# Contributing to Taskmill + +Thanks for your interest in contributing! This document covers the basics to get you started. + +## Getting Started + +### Prerequisites + +- [Rust](https://rustup.rs/) (stable toolchain, MSRV 1.75) +- [lefthook](https://github.com/evilmartians/lefthook) (git hooks) + +### Setup + +```bash +git clone https://github.com/deepjoy/taskmill.git +cd taskmill +lefthook install +cargo build +``` + +### Running Tests + +```bash +cargo test --all-features +``` + +### Formatting and Linting + +The project uses `cargo fmt` and `clippy`. 
Lefthook runs these automatically on pre-commit, but you can run them manually: + +```bash +cargo fmt --check +cargo clippy --all-features -- -D warnings +``` + +## Making Changes + +1. Fork the repository and create a branch from `main`. +2. Make your changes. +3. Add tests for new functionality. +4. Ensure `cargo test --all-features` passes. +5. Ensure `cargo clippy --all-features -- -D warnings` is clean. +6. Open a pull request against `main`. + +## Commit Messages + +This project uses [Conventional Commits](https://www.conventionalcommits.org/): + +``` +feat: add new feature +fix: correct a bug +docs: update documentation +refactor: restructure code without behavior change +chore: maintenance tasks +``` + +These are used by [release-plz](https://release-plz.ino.rs/) to auto-generate changelogs and determine version bumps. + +## Questions? + +Open an issue or start a discussion — happy to help. diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..1d39928 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "taskmill" +version = "0.1.1" +edition = "2021" +rust-version = "1.75" +license = "MIT" +description = "Adaptive priority work scheduler with IO-aware concurrency and SQLite persistence" +keywords = ["scheduler", "priority-queue", "task", "async"] +categories = ["asynchronous", "concurrency"] +repository = "https://github.com/deepjoy/taskmill" + +[features] +default = ["sysinfo-monitor"] +sysinfo-monitor = ["dep:sysinfo"] + +[dependencies] +tokio = { version = "1", features = ["sync", "time", "rt", "macros"] } +tokio-util = "0.7" +sqlx = { version = "0.8", features = ["runtime-tokio", "sqlite", "chrono"] } +tracing = "0.1" +thiserror = "2.0" +chrono = { version = "0.4", features = ["serde"] } +serde = { version = "1", features = ["derive"] } +serde_json = "1" +sha2 = "0.10" +sysinfo = { version = "0.33", optional = true } + +[dev-dependencies] +tokio = { version = "1", features = ["full", "test-util"] } diff --git a/LICENSE 
b/LICENSE index daacbef..d5e6293 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2026 DJ Majumdar +Copyright (c) 2026 Deep Joy Majumdar Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md new file mode 100644 index 0000000..ea850cc --- /dev/null +++ b/README.md @@ -0,0 +1,121 @@ +# Taskmill + +Adaptive priority work scheduler with IO-aware concurrency and SQLite persistence. + +Taskmill is an async task queue for Rust applications that persists work to SQLite, +schedules by priority with IO-budget awareness, and supports preemption, retries, and +composable backpressure. Designed for desktop apps (Tauri, etc.) and background services +where tasks have measurable IO costs and the system needs to avoid saturating disk +throughput. + +## Quick example + +```rust +use std::sync::Arc; +use std::time::Duration; +use tokio_util::sync::CancellationToken; +use taskmill::{ + Scheduler, Priority, TaskSubmission, TaskExecutor, + TaskContext, TaskResult, TaskError, ShutdownMode, +}; + +struct ThumbnailGenerator; + +impl TaskExecutor for ThumbnailGenerator { + async fn execute<'a>( + &'a self, ctx: &'a TaskContext, + ) -> Result<TaskResult, TaskError> { + ctx.progress.report(0.5, Some("resizing".into())); + Ok(TaskResult { actual_read_bytes: 4096, actual_write_bytes: 1024 }) + } +} + +#[tokio::main] +async fn main() { + let scheduler = Scheduler::builder() + .store_path("tasks.db") + .executor("thumbnail", Arc::new(ThumbnailGenerator)) + .max_concurrency(8) + .with_resource_monitoring() + .build() + .await + .unwrap(); + + scheduler.submit(&TaskSubmission::with_payload( + "thumbnail", + Priority::NORMAL, + &serde_json::json!({"path": "/photos/img.jpg"}), + 4096, 1024, + ).unwrap()).await.unwrap(); + + let token = CancellationToken::new(); + scheduler.run(token).await; +} +``` + +## Shared scheduler (library embedding) + +A single 
`Scheduler` can be shared across an application and any libraries it embeds. +Multiple state types can coexist — each is keyed by its concrete `TypeId`, and new state +can be injected after the scheduler is built via `register_state`. + +```rust +use std::sync::Arc; +use taskmill::Scheduler; + +// The host app builds the scheduler and registers its own executors. +let scheduler = Scheduler::builder() + .store_path("app.db") + .executor("thumbnail", Arc::new(ThumbnailGenerator)) + .app_state(MyAppServices { /* ... */ }) + .max_concurrency(4) + .build() + .await + .unwrap(); + +// A library can inject its own state after build. +scheduler.register_state(Arc::new(LibraryState { /* ... */ })).await; + +// Both the host and the library submit tasks to the same queue. +// The host manages the run loop. +let token = CancellationToken::new(); +scheduler.run(token).await; +``` + +## Features + +- **SQLite persistence** — tasks survive restarts; crash recovery requeues interrupted work +- **256-level priority queue** — with preemption of lower-priority tasks +- **IO-aware scheduling** — defers work when disk throughput is saturated +- **Key-based deduplication** — SHA-256 keys prevent duplicate submissions +- **Composable backpressure** — plug in external pressure signals with custom throttle policies +- **Cross-platform resource monitoring** — CPU and disk IO via `sysinfo` (Linux, macOS, Windows) +- **Retries** — automatic requeue of retryable failures with configurable limits +- **Progress reporting** — executor-reported and throughput-extrapolated progress +- **Lifecycle events** — broadcast events for UI integration (Tauri, etc.) 
+- **Typed payloads** — serialize/deserialize structured task data +- **Batch submission** — bulk enqueue in a single SQLite transaction +- **Graceful shutdown** — configurable drain timeout before force-cancellation +- **Global pause/resume** — pause all work when the app is backgrounded +- **Type-keyed application state** — register multiple state types, inject pre- or post-build +- **Clone-friendly** — `Scheduler` is `Clone` via `Arc` for easy sharing +- **Serde on all public types** — ready for Tauri IPC + +For a detailed breakdown of every feature, see [docs/features.md](docs/features.md). + +## Documentation + +| Guide | Description | +|-------|-------------| +| [Quick Start](docs/quick-start.md) | Installation, first executor, builder setup, and running the scheduler | +| [Features](docs/features.md) | Complete feature list with descriptions | +| [Priorities & Preemption](docs/priorities-and-preemption.md) | Priority levels, preemption mechanics, and throttle behavior | +| [IO Tracking & Backpressure](docs/io-and-backpressure.md) | IO budgets, resource monitoring, pressure sources, and throttle policies | +| [Persistence & Recovery](docs/persistence-and-recovery.md) | SQLite schema, crash recovery, deduplication, and history retention | +| [Progress Reporting](docs/progress-reporting.md) | Executor progress, extrapolation, dashboard snapshots, and lifecycle events | +| [Configuration](docs/configuration.md) | All configuration options for scheduler, store, sampler, and feature flags | +| [Query APIs](docs/query-apis.md) | Full `TaskStore` query reference for dashboards and debugging | + +## License + +MIT diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..bc6c20e --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,20 @@ +# Security Policy + +## Reporting a Vulnerability + +If you discover a security vulnerability in Taskmill, please report it responsibly. 
+ +**Email:** [code@deepjoy.com](mailto:code@deepjoy.com) + +Please include: +- A description of the vulnerability +- Steps to reproduce +- Potential impact + +I'll acknowledge your report within 48 hours and aim to release a fix within 7 days for critical issues. Please don't open a public issue for security vulnerabilities. + +## Supported Versions + +| Version | Supported | +|---------|-----------| +| 0.1.x | Yes | diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md new file mode 100644 index 0000000..910808a --- /dev/null +++ b/docs/ARCHITECTURE.md @@ -0,0 +1,546 @@ +# Taskmill Architecture + +Taskmill is an adaptive priority work scheduler with IO-aware concurrency and +SQLite persistence, designed for desktop apps (Tauri) and background services. + +## Module map + +``` +taskmill/ + src/ + lib.rs — public API re-exports + task.rs — TaskRecord, TaskSubmission, TaskResult, TaskError, TypedTask, etc. + priority.rs — Priority newtype (u8, lower = higher priority) + store.rs — TaskStore: SQLite persistence, atomic pop, queries, retention + registry.rs — TaskExecutor trait (RPITIT), TaskContext, TaskTypeRegistry + backpressure.rs — PressureSource trait, ThrottlePolicy, CompositePressure + scheduler/ + mod.rs — Scheduler, SchedulerBuilder, run loop, events, snapshot + gate.rs — DispatchGate trait, DefaultDispatchGate, IO budget check + dispatch.rs — ActiveTaskMap, spawn_task(), preemption + progress.rs — ProgressReporter, EstimatedProgress, throughput extrapolation + resource/ + mod.rs — ResourceSampler + ResourceReader traits, ResourceSnapshot + sampler.rs — EWMA-smoothed background loop, SmoothedReader + sysinfo_monitor.rs — SysinfoSampler via `sysinfo` crate (feature-gated) + migrations/ + 001_tasks.sql — tasks table, task_history table, indexes +``` + +## Task lifecycle + +``` +Submit ──► Pending ──► Running ──► Completed (moved to task_history) + │ │ + │ ├──► Failed (moved to task_history, or retried) + │ │ + │ └──► Paused (preempted by higher-priority 
work) + │ │ + └─────────────────┘ (resumed when preemptors finish) +``` + +Active-queue states (`tasks` table): `pending`, `running`, `paused`. +Terminal states (`task_history` table): `completed`, `failed`. + +## Data flow + +```mermaid +flowchart TD + S["submit() / submit_batch()"] --> TS["TaskStore\n(INSERT OR IGNORE)"] + TS --> |SQLite| DB[(tasks table)] + DB --> SCH["Scheduler run loop"] + SCH --> |"tokio::spawn"| E1["Executor + TaskContext"] + SCH --> |"tokio::spawn"| E2["Executor + TaskContext"] + E1 --> CF["complete() / fail()"] + E2 --> CF + CF --> HIST[(task_history)] + CF --> PRUNE["maybe_prune()\n(amortised retention)"] + CF --> EVT["broadcast::Sender\n(SchedulerEvent)"] +``` + +## SQLite schema + +### `tasks` — active queue + +| Column | Purpose | +|-----------------------|----------------------------------------------------| +| `id` | `INTEGER PRIMARY KEY` — insertion order within tier| +| `task_type` | Executor lookup name (e.g. `"scan-l3"`) | +| `key` | `UNIQUE` — SHA-256 deduplication key | +| `priority` | `INTEGER NOT NULL` — 0 (highest) to 255 (lowest) | +| `status` | `TEXT` — `pending`, `running`, or `paused` | +| `payload` | `BLOB` — opaque, max 1 MiB, executor-defined | +| `expected_read_bytes` | Caller's IO estimate for scheduling decisions | +| `expected_write_bytes`| Caller's IO estimate for scheduling decisions | +| `retry_count` | Incremented on each retryable failure | +| `last_error` | Most recent error message (for diagnostics) | +| `started_at` | Set when popped; cleared on pause | + +A partial index `idx_tasks_pending` on `(status, priority ASC, id ASC) WHERE +status = 'pending'` covers the scheduler's hot path (`pop_next`), making +priority-ordered pops efficient regardless of how many running or paused tasks +sit in the table. + +### `task_history` — terminal records + +Completed and failed tasks are moved here atomically (delete from `tasks`, +insert into `task_history` in one transaction). 
Additional columns: + +| Column | Purpose | +|-----------------------|----------------------------------------------------| +| `actual_read_bytes` | Reported by executor on completion | +| `actual_write_bytes` | Reported by executor on completion | +| `completed_at` | Timestamp of completion or failure | +| `duration_ms` | Computed from `started_at` to `completed_at` | + +An index `idx_history_type_completed` on `(task_type, completed_at DESC)` +supports IO learning queries (`avg_throughput`, `history_stats`). + +### Connection pool + +Defaults to 16 connections (`StoreConfig::max_connections`). SQLite serialises +writes regardless, so this primarily benefits concurrent reads from multiple +Tauri commands and background tasks. + +### Retention policy + +`StoreConfig::retention_policy` controls automatic pruning of `task_history`: + +- `RetentionPolicy::MaxCount(n)` — keep at most N history records +- `RetentionPolicy::MaxAgeDays(n)` — keep records from the last N days + +Pruning is amortised: an `AtomicU64` completion counter triggers `maybe_prune()` +every `prune_interval` completions (default 100) rather than after every single +completion. Pruning errors are logged but never propagated — the task itself is +already committed. Manual pruning is available via `prune_history_by_count()` and +`prune_history_by_age()`. + +## Deduplication + +Key generation: `SHA-256(task_type + ":" + (explicit_key OR payload))`. The task +type is always incorporated so different types with identical payloads never +collide. + +Enforcement uses the `UNIQUE(key)` constraint with `INSERT OR IGNORE` — a +duplicate submission silently returns `None`. The key stays occupied while the +task is active (including retries) and is freed when the task moves to history. + +## Priority queue + +The priority queue lives entirely in SQLite. `pop_next()` is an atomic +`UPDATE ... 
RETURNING` that claims the highest-priority pending row: + +```sql +UPDATE tasks SET status = 'running', started_at = datetime('now') +WHERE id = ( + SELECT id FROM tasks WHERE status = 'pending' + ORDER BY priority ASC, id ASC LIMIT 1 +) +RETURNING * +``` + +`priority ASC` means lower numeric values are popped first (higher priority). +`id ASC` breaks ties by insertion order (FIFO within a tier). The partial index +makes this a single index scan. + +The `Priority` type is a `u8` newtype with named constants: + +| Constant | Value | Behaviour | +|--------------|-------|---------------------------------------| +| `REALTIME` | 0 | Never throttled, triggers preemption | +| `HIGH` | 64 | Throttled only under extreme pressure | +| `NORMAL` | 128 | Standard background work | +| `BACKGROUND` | 192 | Paused under moderate pressure | +| `IDLE` | 255 | Runs only when system is idle | + +`Ord` is reversed so `REALTIME > IDLE` semantically. Custom tiers are available +via `Priority::new(n)`. + +## Scheduler architecture + +The scheduler is split across four files: + +| File | Concern | +|----------------|---------------------------------------------------------------| +| `mod.rs` | Orchestration: run loop, submit, cancel, snapshot, builder | +| `gate.rs` | Admission control: backpressure + IO budget | +| `dispatch.rs` | Task lifecycle: active map, spawn, preemption | +| `progress.rs` | Progress reporting + throughput-based extrapolation | + +### Dispatch cycle + +```mermaid +flowchart TD + START["tick / notify"] --> PAUSED{"is_paused?"} + PAUSED -- yes --> WAIT + PAUSED -- no --> RESUME["Resume paused tasks\n(if no active preemptors)"] + RESUME --> CONC{"active < max_concurrency?"} + CONC -- no --> WAIT["Wait for next tick / notify"] + CONC -- yes --> PEEK["peek_next()\n(non-mutating)"] + PEEK -- empty --> WAIT + PEEK -- candidate --> GATE{"gate.admit()\nbackpressure + IO budget"} + GATE -- rejected --> WAIT + GATE -- admitted --> POP["pop_by_id()\n(atomic claim)"] + POP -- 
claimed --> REG{"Executor registered?"} + POP -- gone --> CONC + REG -- no --> FAIL["Fail immediately"] + REG -- yes --> SPAWN["spawn_task()"] + SPAWN --> CONC +``` + +The run loop wakes on two signals: + +1. **`Notify`** — triggered by `submit()`, `submit_batch()`, and `resume_all()`, + so newly enqueued work is picked up without waiting for the next tick. +2. **`poll_interval` timer** (default 500 ms) — fallback for paused-task + resumption and periodic housekeeping. + +Key design: the loop uses **peek-then-pop-by-id** rather than a bare `pop_next()`. +The gate inspects the candidate without mutating the queue; only after admission +does `pop_by_id()` atomically claim it. If another consumer claimed it in the +meantime, the loop simply retries. This eliminates the earlier race where a +popped-then-rejected task needed an explicit requeue step. + +Each stage independently halts dispatch: + +- **Concurrency** — hard cap via `max_concurrency` (`AtomicUsize`, adjustable at runtime) +- **DispatchGate** — pluggable admission (default: backpressure + IO budget) +- **Empty queue** — no pending tasks + +### Clone-friendly design + +`Scheduler` wraps all shared state in `Arc` and derives `Clone`: + +- It can be held directly in `tauri::State` without extra `Arc` wrapping +- Clones are cheap and share the underlying store, registry, and active map + +### Builder + +```rust +Scheduler::builder() + .store_path("tasks.db") + .executor("scan", Arc::new(ScanExecutor)) + .executor("exif", Arc::new(ExifExecutor)) + .pressure_source(Box::new(battery_pressure)) + .max_concurrency(8) + .shutdown_mode(ShutdownMode::Graceful(Duration::from_secs(30))) + .with_resource_monitoring() + .app_state(MyServices { http, db, cache }) + .build() + .await?; +``` + +The builder handles: opening the store, assembling the registry, composing +pressure sources, spawning the resource sampler, and wiring the `SmoothedReader`. +The lower-level `Scheduler::new()` remains for advanced use. 
+ +## Dispatch gate (internal) + +The `DispatchGate` trait (`pub(crate)`) controls admission. The default +`DefaultDispatchGate` applies two checks: + +1. **Backpressure** — `ThrottlePolicy::should_throttle(priority, pressure)`. +2. **IO budget** — `has_io_headroom()`, described below. + +The trait also exposes `pressure()` and `pressure_breakdown()` (with default +no-op impls) so `Scheduler::snapshot()` can read backpressure state without +knowing the concrete gate type. + +## IO-aware scheduling + +### Expected vs actual IO + +Callers provide `expected_read_bytes` / `expected_write_bytes` on submission. +Executors report `actual_read_bytes` / `actual_write_bytes` on completion. The +history table stores both, enabling learning via `avg_throughput()` and +`history_stats()`. + +### IO budget heuristic + +When a `ResourceReader` is present, `has_io_headroom()` runs before each +dispatch: + +1. Read the latest EWMA-smoothed `ResourceSnapshot` (disk bytes/sec). +2. Sum expected IO across all currently running tasks. +3. Compute a 2-second budget window: `capacity = bytes_per_sec * 2.0`. +4. Defer if running IO exceeds 80% of capacity on either read or write axis. + +If no reader is configured the check is skipped (always allows dispatch). + +### Resource monitoring + +Two traits split sampling from consumption: + +- **`ResourceSampler`** — `sample() -> ResourceSnapshot`. Raw platform readings. +- **`ResourceReader`** — `latest() -> ResourceSnapshot`. Read-only, sync. + +`SmoothedReader` bridges them: the `run_sampler()` background loop calls +`sampler.sample()` at a configurable interval (default 1 s), applies EWMA +smoothing (alpha 0.3), and writes to the `SmoothedReader`. The scheduler reads +via `reader.latest()`, which uses `RwLock` so readers never block each other. + +The built-in `SysinfoSampler` (behind the `sysinfo-monitor` feature) provides +cross-platform CPU and disk IO via the `sysinfo` crate. 
+ +## Backpressure + +### PressureSource trait + +```rust +pub trait PressureSource: Send + Sync + 'static { + fn pressure(&self) -> f32; // 0.0 (idle) to 1.0 (saturated) + fn name(&self) -> &str; +} +``` + +Implement for external signals: API rate, memory, queue depth, battery, etc. + +### CompositePressure + +Aggregates multiple sources. The composite value is the **max** across all — the +system is as pressured as its most constrained resource. `breakdown()` provides +per-source diagnostics. + +### ThrottlePolicy + +Default three-tier policy: + +| Priority range | Throttle threshold | +|-------------------|--------------------| +| BACKGROUND (192+) | > 50% pressure | +| NORMAL (128+) | > 75% pressure | +| HIGH / REALTIME | Never throttled | + +Custom policies via `ThrottlePolicy::new(thresholds)`. + +## Preemption + +When a task is submitted at or above `preempt_priority` (default `REALTIME`): + +1. All active tasks with strictly lower priority are cancelled + (`CancellationToken`) and moved to `paused` status in the store. +2. `Preempted` events are emitted. +3. On subsequent poll cycles, paused tasks are only resumed when no active + preemptors remain — this prevents a thrashing loop of pause/resume/re-preempt. + +Executors cooperate by checking `ctx.token.is_cancelled()` at yield points. An +executor that ignores cancellation continues running but is no longer tracked; +its completion or failure is still recorded normally. + +## Retry flow + +``` +Executor returns Err(TaskError) + └─ retryable: false? ──► move to task_history (failed) + └─ retryable: true? + └─ retry_count < max_retries? ──► status → pending, retry_count += 1 + └─ otherwise ──► move to task_history (failed) +``` + +- Retried tasks keep their original priority (no demotion). +- The dedup key remains occupied during retries. +- `max_retries` defaults to 3 (`SchedulerConfig`). 
+ +## Event system + +`Scheduler::subscribe()` returns a `tokio::sync::broadcast::Receiver<SchedulerEvent>`: + +| Event | When | +|-------------|----------------------------------------------| +| `Dispatched`| Task popped and executor spawned | +| `Completed` | Task finished successfully | +| `Failed` | Task failed (includes `will_retry` flag) | +| `Preempted` | Task paused for higher-priority work | +| `Cancelled` | Task cancelled via `cancel()` | +| `Progress` | Executor reported progress (0.0–1.0) | +| `Paused` | Scheduler globally paused | +| `Resumed` | Scheduler globally resumed | + +All variants derive `Serialize`/`Deserialize`. + +## Progress reporting + +### Executor-reported + +Executors call `ctx.progress.report(percent, message)` or +`ctx.progress.report_fraction(completed, total, message)`. These emit +`SchedulerEvent::Progress` and update the active task map. + +### Throughput-extrapolated + +For tasks that don't report progress, `estimated_progress()` extrapolates from +elapsed time vs. the historical average duration for that task type. When a +partial report exists, the extrapolation blends historical and current throughput +for a more accurate estimate. + +`EstimatedProgress` provides `reported_percent`, `extrapolated_percent`, and a +unified `percent` (reported preferred over extrapolated). + +## Task type registry + +`TaskTypeRegistry` maps string names to executor implementations. The public +`TaskExecutor` trait uses RPITIT (`impl Future`) for ergonomic async; an internal +`ErasedExecutor` trait provides object-safe dynamic dispatch for storage. + +Duplicate registration panics, which catches configuration errors at startup. When the +scheduler pops a task with no registered executor, the task is failed immediately with a +descriptive error. + +The registry is essential for crash recovery: after `recover_running()` resets +in-flight tasks to pending, the scheduler needs the registry to re-dispatch them. + +## Application state + +Executors often need shared services. 
Rather than capturing `Arc<T>` per executor, +the scheduler provides a type-keyed `StateMap` that supports multiple state types: + +```rust +Scheduler::builder() + .app_state(MyServices { http, db, cache }) + .app_state(FeatureFlags { dark_mode: true }) + .build().await?; + +// In the executor: +let svc = ctx.state::<MyServices>().expect("state not set"); +let flags = ctx.state::<FeatureFlags>().expect("flags not set"); +``` + +State flows: `SchedulerBuilder` collects `(TypeId, Arc<dyn Any + Send + Sync>)` entries → +assembled into `Arc<StateMap>` at build time → a `StateSnapshot` (lock-free +`HashMap` clone) is taken once per dispatch and placed in `TaskContext` → +executors call `ctx.state::<T>()` which does a `TypeId` lookup + downcast. + +Libraries that embed a shared scheduler can inject their own state **after** +build via `scheduler.register_state(Arc::new(LibState { .. })).await`. This +is how shoebox injects `ScanAppState` into an externally-provided scheduler. + +This mirrors Axum's `State<T>` / Tauri's `State<T>` pattern. + +## Global pause / resume + +`pause_all()` sets an `AtomicBool` flag, cancels every running task's token, +moves them to paused status, and emits `Paused`. While paused the run loop skips +dispatch entirely. + +`resume_all()` clears the flag, wakes the run loop via `Notify`, and emits +`Resumed`. Paused tasks are picked up by the existing resumption logic on the +next cycle. + +`try_dispatch()` does **not** check the flag, so manual single-task dispatch +still works while globally paused. `SchedulerSnapshot::is_paused` reflects the +flag for UI integration. + +## Graceful shutdown + +`ShutdownMode` controls behaviour when the run loop's `CancellationToken` fires: + +- **`Hard`** (default) — cancel all running tasks immediately. +- **`Graceful(Duration)`** — stop dispatching, wait for running tasks to finish + (up to the timeout), then cancel stragglers. + +Both modes cancel the resource sampler's `CancellationToken`. 
+ +## Crash recovery + +On `TaskStore::open()`, the store runs: + +```sql +UPDATE tasks SET status = 'pending', started_at = NULL WHERE status = 'running' +``` + +Any task mid-execution when the process died is reset to pending. This is safe +because executors should be idempotent (or check for partial work), the dedup key +stays occupied (no duplicates), and `retry_count` is preserved. + +## Thread safety + +- `Scheduler` — `Clone` via `Arc` +- `TaskStore` — `Clone` via `SqlitePool`; WAL journal mode for concurrent access +- `max_concurrency` — `AtomicUsize`, lock-free runtime adjustment +- `paused` — `AtomicBool` with `Release`/`Acquire` ordering +- `ActiveTaskMap` — `Arc>`, `Clone` +- `SmoothedReader` — `RwLock` so readers never block each other +- `TaskTypeRegistry` — immutable after startup, shared via `Arc` +- Application state — `Arc`, shared across all tasks +- Each spawned task gets its own `CancellationToken` +- All trait objects require `Send + Sync + 'static` + +## Feature flags + +- **`sysinfo-monitor`** (default) — enables `SysinfoSampler` for cross-platform + CPU and disk IO. Disable for mobile targets or when providing a custom sampler. + +Serde (`Serialize`/`Deserialize`) is always enabled on all public types. 
+ +## Configuration reference + +### SchedulerConfig + +| Field | Default | Notes | +|--------------------------|---------------|------------------------------------| +| `max_concurrency` | 4 | Adjustable at runtime | +| `max_retries` | 3 | | +| `preempt_priority` | `REALTIME` | | +| `poll_interval` | 500 ms | Fallback; notify wakes sooner | +| `throughput_sample_size` | 20 | History rows for IO learning | +| `shutdown_mode` | `Hard` | | + +### StoreConfig + +| Field | Default | Notes | +|--------------------|---------|-------------------------------------------| +| `max_connections` | 16 | SQLite pool size | +| `retention_policy` | `None` | `MaxCount(n)` or `MaxAgeDays(n)` | +| `prune_interval` | 100 | Prune every N completions | + +### SamplerConfig + +| Field | Default | Notes | +|--------------|---------|--------------------------| +| `interval` | 1 s | Sample period | +| `ewma_alpha` | 0.3 | Smoothing factor (0–1) | + +## Tauri integration + +### State management + +```rust +app.manage(scheduler); // Scheduler is Clone — no Arc needed + +#[tauri::command] +async fn submit_task( + scheduler: tauri::State<'_, Scheduler>, +) -> Result<Option<i64>, StoreError> { + scheduler.submit(&submission).await +} + +#[tauri::command] +async fn scheduler_status( + scheduler: tauri::State<'_, Scheduler>, +) -> Result<SchedulerSnapshot, StoreError> { + scheduler.snapshot().await +} +``` + +### Event bridging + +```rust +let mut events = scheduler.subscribe(); +let handle = app_handle.clone(); +tokio::spawn(async move { + while let Ok(event) = events.recv().await { + handle.emit("taskmill-event", &event).unwrap(); + } +}); +``` + +### Error handling + +`StoreError` derives `Serialize`/`Deserialize`, so it can be returned directly +from Tauri commands without conversion. + +### Cross-platform + +Gate `sysinfo-monitor` for mobile: `default-features = false`. Provide a custom +`ResourceSampler` for iOS/Android if needed. Everything else (SQLite, scheduling, +events) works on all platforms. 
diff --git a/docs/configuration.md b/docs/configuration.md new file mode 100644 index 0000000..f5eb708 --- /dev/null +++ b/docs/configuration.md @@ -0,0 +1,147 @@ +# Configuration + +## SchedulerConfig + +Controls scheduling behavior. Set via builder methods or pass directly to `Scheduler::new()`. + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `max_concurrency` | `usize` | 4 | Maximum concurrent running tasks. Adjustable at runtime via `set_max_concurrency()`. | +| `max_retries` | `i32` | 3 | Retry limit before a task is permanently failed. | +| `preempt_priority` | `Priority` | `REALTIME` (0) | Tasks at or above this priority trigger preemption of lower-priority work. | +| `poll_interval` | `Duration` | 500ms | Sleep between scheduler dispatch cycles. The scheduler also wakes on `Notify` signals. | +| `throughput_sample_size` | `i32` | 20 | Number of recent completions used for throughput-based progress extrapolation. | +| `shutdown_mode` | `ShutdownMode` | `Hard` | `Hard` cancels all tasks immediately. `Graceful(Duration)` waits up to the timeout. | + +### Builder methods + +```rust +use std::time::Duration; +use taskmill::{Scheduler, Priority, ShutdownMode}; + +let scheduler = Scheduler::builder() + .max_concurrency(8) + .max_retries(5) + .preempt_priority(Priority::HIGH) + .poll_interval(Duration::from_millis(250)) + .shutdown_mode(ShutdownMode::Graceful(Duration::from_secs(30))) + .build() + .await?; +``` + +## StoreConfig + +Controls the SQLite connection pool and history retention. + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `max_connections` | `u32` | 16 | SQLite connection pool size. | +| `retention_policy` | `Option<RetentionPolicy>` | `None` | Automatic history pruning. `MaxCount(n)` or `MaxAgeDays(n)`. | +| `prune_interval` | `u64` | 100 | Number of task completions between automatic prune runs. 
| + +### Builder method + +```rust +use taskmill::{StoreConfig, RetentionPolicy}; + +let scheduler = Scheduler::builder() + .store_config(StoreConfig { + max_connections: 32, + retention_policy: Some(RetentionPolicy::MaxCount(10_000)), + prune_interval: 50, + ..Default::default() + }) + .build() + .await?; +``` + +## SamplerConfig + +Controls the resource monitoring background loop. + +| Field | Type | Default | Description | +|-------|------|---------|-------------| +| `interval` | `Duration` | 1s | How often to sample system resources. | +| `ewma_alpha` | `f64` | 0.3 | EWMA smoothing factor. Higher = more responsive to changes, lower = smoother. | + +### Builder method + +```rust +use std::time::Duration; +use taskmill::SamplerConfig; + +let scheduler = Scheduler::builder() + .with_resource_monitoring() + .sampler_config(SamplerConfig { + interval: Duration::from_millis(500), + ewma_alpha: 0.5, + }) + .build() + .await?; +``` + +## ShutdownMode + +| Variant | Behavior | +|---------|----------| +| `Hard` | Cancel all running tasks immediately when the scheduler stops. | +| `Graceful(Duration)` | Stop dispatching new tasks, wait for running tasks to complete (up to the timeout), then force-cancel any remaining. Stops the resource sampler afterward. | + +## RetentionPolicy + +| Variant | Behavior | +|---------|----------| +| `MaxCount(i64)` | Keep the N most recent history records, prune the rest. | +| `MaxAgeDays(i64)` | Keep records from the last N days, prune older entries. | + +## Priority constants + +| Constant | Value | Notes | +|----------|-------|-------| +| `Priority::REALTIME` | 0 | Highest. Never throttled. Triggers preemption. | +| `Priority::HIGH` | 64 | | +| `Priority::NORMAL` | 128 | Default for most tasks. | +| `Priority::BACKGROUND` | 192 | | +| `Priority::IDLE` | 255 | Lowest. | + +Custom: `Priority::new(n)` for any `u8` value. 
+ +## Feature flags + +| Feature | Default | Description | +|---------|---------|-------------| +| `sysinfo-monitor` | Enabled | Cross-platform CPU and disk IO monitoring via `sysinfo`. Disable for mobile targets or custom samplers. | + +### Disabling platform monitoring + +```toml +[dependencies] +taskmill = { path = "crates/taskmill", default-features = false } +``` + +When disabled, you can still provide a custom `ResourceSampler` via `.resource_sampler()`. + +## Builder reference + +All `SchedulerBuilder` methods: + +| Method | Description | +|--------|-------------| +| `store_path(path)` | Path to the SQLite database file. | +| `store(store)` | Use a pre-opened `TaskStore`. | +| `store_config(config)` | Pool size and retention settings. | +| `executor(name, executor)` | Register a `TaskExecutor` by name. | +| `typed_executor::<T>(executor)` | Register using `T::TASK_TYPE` as the name. | +| `max_concurrency(n)` | Set initial max concurrent tasks. | +| `max_retries(n)` | Set retry limit. | +| `preempt_priority(p)` | Set preemption threshold. | +| `poll_interval(d)` | Set dispatch cycle interval. | +| `shutdown_mode(mode)` | Set shutdown behavior. | +| `pressure_source(source)` | Add a `PressureSource` to the composite. | +| `throttle_policy(policy)` | Set a custom `ThrottlePolicy`. | +| `with_resource_monitoring()` | Enable platform resource monitoring. | +| `resource_sampler(sampler)` | Provide a custom `ResourceSampler`. | +| `sampler_config(config)` | Configure sample interval and smoothing. | +| `app_state(state)` | Register a state type (multiple types can coexist). | +| `app_state_arc(arc)` | Register a state type from a pre-existing `Arc<T>`. | +| `build()` | Build and return the `Scheduler`. | diff --git a/docs/features.md b/docs/features.md new file mode 100644 index 0000000..83da43c --- /dev/null +++ b/docs/features.md @@ -0,0 +1,110 @@ +# Features + +A complete list of taskmill's capabilities. 
+ +## Persistence + +- **SQLite-backed queue** — all tasks are stored in SQLite with WAL journal mode. Tasks survive process restarts, crashes, and power loss. +- **Crash recovery** — tasks left in `running` state during a crash are automatically reset to `pending` on startup. Dedup keys remain occupied so no duplicates sneak in during recovery. +- **Connection pooling** — configurable pool size (default 16) for concurrent reads. + +## Scheduling + +- **256-level priority queue** — priorities range from 0 (highest, `REALTIME`) to 255 (lowest, `IDLE`). Five named tiers are provided: `REALTIME`, `HIGH`, `NORMAL`, `BACKGROUND`, `IDLE`. Custom values like `Priority::new(100)` work too. +- **FIFO within tier** — tasks at the same priority are dispatched in insertion order. +- **Atomic dispatch** — pop operations use `UPDATE ... WHERE id = (SELECT ...) RETURNING *` for race-free claiming with no lost tasks. +- **Runtime-adjustable concurrency** — change `max_concurrency` at runtime via `set_max_concurrency()`. + +## Deduplication + +- **Key-based dedup** — each task gets a SHA-256 key derived from `task_type + payload` (or an explicit key). A `UNIQUE(key)` constraint with `INSERT OR IGNORE` prevents duplicate submissions. +- **Type-scoped keys** — the task type is always part of the hash, so different task types never collide even with identical payloads. +- **Lifecycle-aware** — keys are occupied while a task is pending, running, paused, or retrying. The key is freed when the task moves to history (completed or failed). +- **Batch-safe** — deduplication applies within `submit_batch()` transactions too. + +## IO Awareness + +- **Expected/actual IO tracking** — submit estimated read/write bytes; executors report actual bytes on completion. +- **IO budget gating** — the scheduler compares running task IO estimates against EWMA-smoothed system throughput. New work is deferred when cumulative IO would exceed 80% of observed disk capacity. 
+- **Learning from history** — `avg_throughput()` and `history_stats()` compute per-type IO averages from actual completions, enabling callers to refine estimates over time. + +## Resource Monitoring + +- **Cross-platform** — CPU and disk IO via `sysinfo` on Linux, macOS, and Windows. Feature-gated under `sysinfo-monitor` (enabled by default). +- **EWMA smoothing** — raw samples are smoothed with an exponentially weighted moving average (alpha=0.3, configurable) to avoid spiky readings. +- **Two-trait design** — `ResourceSampler` (raw platform readings) and `ResourceReader` (smoothed snapshots) are separated for testability and custom implementations. +- **Custom samplers** — disable the `sysinfo-monitor` feature and provide your own `ResourceSampler` for containers, cgroups, or mobile platforms. + +## Backpressure + +- **Composable pressure sources** — implement the `PressureSource` trait to expose a `0.0..=1.0` signal from any source (API load, memory, battery, queue depth). `CompositePressure` aggregates sources; the aggregate is the maximum across all. +- **Throttle policies** — `ThrottlePolicy` maps `(priority, pressure)` to dispatch decisions. The default three-tier policy throttles `BACKGROUND` tasks at >50% pressure, `NORMAL` at >75%, and never throttles `HIGH` or `REALTIME`. +- **Custom policies** — define your own thresholds for fine-grained control. + +## Preemption + +- **Priority-based preemption** — when a task at or above `preempt_priority` (default: `REALTIME`) is submitted, all lower-priority running tasks are cancelled and paused. +- **Token-based cancellation** — preempted tasks have their `CancellationToken` triggered. Executors should check `token.is_cancelled()` at yield points. +- **Anti-thrash protection** — paused tasks only resume when no active preemptors remain. + +## Retries + +- **Automatic requeue** — retryable failures (`TaskError { retryable: true }`) are requeued at the same priority with `retry_count += 1`. 
+- **Configurable limit** — `max_retries` (default 3) controls how many times a task can be retried before permanent failure. +- **Dedup preserved** — the key stays occupied during retries, preventing duplicate submission of in-progress work. + +## Progress Reporting + +- **Executor-reported progress** — report percentage or fraction-based progress via `ctx.progress.report()` or `ctx.progress.report_fraction()`. +- **Throughput-based extrapolation** — for tasks without explicit reports, the scheduler extrapolates progress from historical average duration, capped at 99% to avoid false completion signals. +- **Event-driven** — progress updates are emitted as `SchedulerEvent::Progress` for real-time UI updates. + +## Lifecycle Events + +- **Broadcast channel** — subscribe via `scheduler.subscribe()` to receive `SchedulerEvent` variants: `Dispatched`, `Completed`, `Failed`, `Preempted`, `Cancelled`, `Progress`, `Paused`, `Resumed`. +- **Tauri-ready** — all events are `Serialize`, designed for direct bridging to frontend via `app_handle.emit()`. + +## Task Management + +- **Task cancellation** — cancel running, pending, or paused tasks via `scheduler.cancel(task_id)`. +- **Global pause/resume** — `pause_all()` stops dispatch and pauses running tasks; `resume_all()` resumes on the next cycle. Emits events for UI integration. +- **Task lookup by dedup key** — `task_lookup()` searches both active and history tables for a task matching a given type and dedup input. + +## Typed Payloads + +- **Structured submission** — `TaskSubmission::with_payload()` serializes any `Serialize` type to JSON bytes. +- **Type-safe deserialization** — `TaskRecord::deserialize_payload::<T>()` in executors. +- **TypedTask trait** — define `TASK_TYPE`, default priority, and expected IO on your struct. Submit with `scheduler.submit_typed()` and deserialize with `ctx.deserialize_typed()`. + +## Batch Operations + +- **Bulk enqueue** — `submit_batch()` wraps many inserts in a single SQLite transaction. 
Returns `Vec<Option<i64>>` where `None` indicates deduplication. + +## Graceful Shutdown + +- **Hard mode** (default) — immediately cancels all running tasks. +- **Graceful mode** — stops dispatching, waits for running tasks up to a configurable timeout, then force-cancels stragglers. + +## Application State + +- **Type-keyed state map** — register multiple state types on the builder via `.app_state()` / `.app_state_arc()`. Each type is keyed by `TypeId`; access from any executor via `ctx.state::<T>()`. +- **Post-build injection** — call `scheduler.register_state(arc)` after build to let libraries inject their own state into a shared scheduler. +- **Arc-based sharing** — state is wrapped in `Arc` internally; all tasks share the same instance. + +## History & Pruning + +- **Automatic retention** — configure `RetentionPolicy::MaxCount(n)` or `RetentionPolicy::MaxAgeDays(n)` for automatic history pruning. +- **Amortized pruning** — pruning runs every N completions (default 100, configurable) to avoid per-task overhead. +- **Manual pruning** — `prune_history_by_count()` and `prune_history_by_age()` for on-demand cleanup. + +## Dashboard + +- **Single-call snapshot** — `scheduler.snapshot()` returns a serializable `SchedulerSnapshot` with running tasks, queue depths, progress estimates, pressure readings, and concurrency limits. +- **Designed for Tauri commands** — return the snapshot directly from a `#[tauri::command]` handler. + +## Ergonomics + +- **Builder pattern** — `Scheduler::builder()` provides fluent construction with sensible defaults. +- **Clone-friendly** — `Scheduler` is `Clone` via `Arc` for easy sharing in Tauri state and across async tasks. +- **Serde on all public types** — every public struct and enum derives `Serialize`/`Deserialize` for Tauri IPC. +- **Serializable errors** — `StoreError` is serializable for direct use in Tauri command returns. 
diff --git a/docs/io-and-backpressure.md b/docs/io-and-backpressure.md new file mode 100644 index 0000000..6ec709d --- /dev/null +++ b/docs/io-and-backpressure.md @@ -0,0 +1,208 @@ +# IO Tracking & Backpressure + +Taskmill combines two independent gating mechanisms — IO budget tracking and composable backpressure — to avoid saturating system resources. + +## IO tracking + +### Submission estimates + +Every `TaskSubmission` includes expected IO: + +```rust +let sub = TaskSubmission { + task_type: "scan".into(), + key: None, + priority: Priority::NORMAL, + payload: Some(data), + expected_read_bytes: 50_000, // caller's estimate + expected_write_bytes: 10_000, +}; +``` + +### Completion actuals + +Executors report actual IO in `TaskResult`: + +```rust +Ok(TaskResult { + actual_read_bytes: 48_312, + actual_write_bytes: 9_876, +}) +``` + +Actual values are stored in `task_history` for learning. + +### IO budget gating + +When resource monitoring is enabled, the scheduler checks IO headroom before dispatching: + +1. Query EWMA-smoothed disk throughput from the `ResourceReader`. +2. Sum expected IO across all running tasks. +3. Compute a 2-second capacity window: `capacity = bytes_per_sec * 2.0`. +4. If running IO + candidate IO would exceed 80% of capacity on either axis (read or write), the task is deferred. + +This prevents the scheduler from piling up IO-heavy tasks that would saturate the disk. + +### Learning from history + +Use store queries to refine future estimates: + +```rust +let store = scheduler.store(); + +// Average read/write bytes per second for a task type (from recent completions) +let (avg_read_bps, avg_write_bps) = store.avg_throughput("scan", 20).await?; + +// Aggregate stats: count, avg duration, avg IO, failure rate +let stats = store.history_stats("scan").await?; +``` + +## Resource monitoring + +### Built-in platform sampler + +Enabled by default via the `sysinfo-monitor` feature flag. Provides CPU and disk IO on Linux, macOS, and Windows. 
+ +```rust +let scheduler = Scheduler::builder() + .with_resource_monitoring() // uses SysinfoSampler automatically + .build() + .await?; +``` + +### Custom samplers + +For containers, cgroups, or mobile platforms, provide your own `ResourceSampler`: + +```rust +use taskmill::{ResourceSampler, ResourceSnapshot}; + +struct CgroupSampler; + +impl ResourceSampler for CgroupSampler { + fn sample(&mut self) -> ResourceSnapshot { + ResourceSnapshot { + cpu_usage: read_cgroup_cpu(), // 0.0–1.0 + io_read_bytes_per_sec: read_blkio_read(), + io_write_bytes_per_sec: read_blkio_write(), + } + } +} + +let scheduler = Scheduler::builder() + .resource_sampler(Box::new(CgroupSampler)) + .build() + .await?; +``` + +### EWMA smoothing + +Raw samples are smoothed via a `SmoothedReader` background loop: + +``` +smoothed = alpha * raw + (1 - alpha) * previous +``` + +- Default alpha: 0.3 (configurable via `SamplerConfig`) +- Default sample interval: 1 second +- Readers access snapshots via `RwLock` (readers never block each other) + +Configure smoothing: + +```rust +use std::time::Duration; +use taskmill::SamplerConfig; + +let scheduler = Scheduler::builder() + .with_resource_monitoring() + .sampler_config(SamplerConfig { + interval: Duration::from_millis(500), // sample faster + ewma_alpha: 0.5, // more responsive + }) + .build() + .await?; +``` + +## Backpressure + +### Pressure sources + +Implement the `PressureSource` trait to expose a `0.0..=1.0` signal from any external source: + +```rust +use taskmill::PressureSource; + +struct MemoryPressure; + +impl PressureSource for MemoryPressure { + fn pressure(&self) -> f32 { + let used = sys_info::mem_used(); + let total = sys_info::mem_total(); + (used as f32 / total as f32).min(1.0) + } + + fn name(&self) -> &str { "memory" } +} +``` + +### Composite pressure + +Multiple sources are aggregated via `CompositePressure`. 
The aggregate pressure is the **maximum** across all sources: + +```rust +use taskmill::CompositePressure; + +let mut pressure = CompositePressure::new(); +pressure.add_source(Arc::new(MemoryPressure)); +pressure.add_source(Arc::new(QueueDepthPressure)); +// Aggregate = max(memory_pressure, queue_pressure) +``` + +Or via the builder: + +```rust +let scheduler = Scheduler::builder() + .pressure_source(Arc::new(MemoryPressure)) + .pressure_source(Arc::new(QueueDepthPressure)) + .build() + .await?; +``` + +### Throttle policies + +`ThrottlePolicy` maps `(priority, pressure)` to dispatch decisions: + +```rust +use taskmill::{ThrottlePolicy, Priority}; + +// Default: BACKGROUND >50%, NORMAL >75%, HIGH/REALTIME never +let policy = ThrottlePolicy::default_three_tier(); + +// Custom thresholds +let policy = ThrottlePolicy::new(vec![ + (Priority::IDLE, 0.3), // throttle IDLE at 30% + (Priority::BACKGROUND, 0.6), // throttle BACKGROUND at 60% + (Priority::NORMAL, 0.8), // throttle NORMAL at 80% +]); +``` + +### How gating works + +The default `DispatchGate` combines both mechanisms. A task is dispatched only when **both** pass: + +1. **Backpressure check** — `ThrottlePolicy::should_throttle(priority, pressure)` returns false. +2. **IO budget check** — `has_io_headroom()` confirms the task won't saturate disk throughput. + +If either check fails, the task stays in the queue and is retried on the next poll cycle. 
+ +### Diagnostics + +The `SchedulerSnapshot` includes pressure readings for debugging: + +```rust +let snap = scheduler.snapshot().await?; +println!("Aggregate pressure: {:.0}%", snap.pressure * 100.0); +for (name, value) in &snap.pressure_breakdown { + println!(" {}: {:.0}%", name, value * 100.0); +} +``` diff --git a/docs/persistence-and-recovery.md b/docs/persistence-and-recovery.md new file mode 100644 index 0000000..17cb840 --- /dev/null +++ b/docs/persistence-and-recovery.md @@ -0,0 +1,172 @@ +# Persistence & Recovery + +Taskmill persists all task state to SQLite, ensuring work survives process restarts, crashes, and power loss. + +## SQLite schema + +Two tables manage the task lifecycle: + +### `tasks` — active queue + +Holds pending, running, and paused tasks. + +| Column | Type | Description | +|--------|------|-------------| +| `id` | INTEGER PRIMARY KEY | Insertion-order ID | +| `task_type` | TEXT NOT NULL | Executor lookup name | +| `key` | TEXT NOT NULL UNIQUE | SHA-256 dedup key | +| `priority` | INTEGER NOT NULL | 0–255 (lower = higher priority) | +| `status` | TEXT DEFAULT 'pending' | `pending`, `running`, or `paused` | +| `payload` | BLOB | Opaque task data (max 1 MiB) | +| `expected_read_bytes` | INTEGER | Estimated read IO | +| `expected_write_bytes` | INTEGER | Estimated write IO | +| `retry_count` | INTEGER DEFAULT 0 | Number of retries so far | +| `last_error` | TEXT | Most recent error message | +| `created_at` | TEXT | ISO 8601 timestamp | +| `started_at` | TEXT | Set when dispatched, cleared on pause | + +**Index:** `idx_tasks_pending(status, priority ASC, id ASC) WHERE status = 'pending'` — partial index for efficient priority-ordered pop. 
+ +### `task_history` — completed and failed tasks + +| Column | Type | Description | +|--------|------|-------------| +| *(all columns from `tasks`)* | | | +| `actual_read_bytes` | INTEGER | Reported by executor | +| `actual_write_bytes` | INTEGER | Reported by executor | +| `completed_at` | TEXT | ISO 8601 timestamp | +| `duration_ms` | INTEGER | Wall-clock duration | +| `status` | TEXT | `completed` or `failed` | + +**Index:** `idx_history_type_completed(task_type, completed_at DESC) WHERE status = 'completed'` — for per-type history queries and throughput calculations. + +## Crash recovery + +On startup, `TaskStore::open()` runs a recovery query: + +```sql +UPDATE tasks SET status = 'pending', started_at = NULL WHERE status = 'running' +``` + +This resets any tasks that were mid-execution when the process died. The behavior: + +- Tasks return to the priority queue at their original priority +- `retry_count` is preserved (crash doesn't count as a retry) +- Dedup keys remain occupied (no duplicate submissions during recovery) +- Tasks are re-dispatched in priority order on the next scheduler cycle + +## Deduplication + +### How keys are generated + +Every task gets a SHA-256 key: `SHA-256(task_type + ":" + (explicit_key OR payload))`. + +- **Implicit key** — if no `key` is provided, the payload bytes are used. Tasks with the same type and payload get the same key. +- **Explicit key** — set `TaskSubmission.key` to control deduplication yourself. Useful when two payloads represent the same logical work (e.g., different timestamps but same file path). +- **Type scoping** — the task type is always part of the hash, so `("resize", payload)` and `("compress", payload)` never collide. + +### Lifecycle + +A key is "occupied" while the task is in the `tasks` table (pending, running, paused, or retrying). When the task moves to `task_history` (completed or failed), the key is freed and can be resubmitted. 
+ +### Submission behavior + +```rust +// Returns Some(id) if inserted +let id = scheduler.submit(&submission).await?; // Ok(Some(42)) + +// Returns None if a task with the same key already exists +let id = scheduler.submit(&submission).await?; // Ok(None) +``` + +`submit_batch()` applies the same dedup within a single transaction: + +```rust +let ids = scheduler.submit_batch(&[sub1, sub2, sub3]).await?; +// ids = [Some(1), None, Some(2)] — sub2 was a duplicate +``` + +### Looking up tasks by dedup key + +```rust +use taskmill::TaskLookup; + +let lookup = scheduler.task_lookup("resize", "/photos/img.jpg").await?; +match lookup { + TaskLookup::Active(record) => println!("still running: {:?}", record.status), + TaskLookup::History(record) => println!("completed: {:?}", record.completed_at), + TaskLookup::NotFound => println!("never submitted"), +} +``` + +## History retention + +Without pruning, `task_history` grows without bound. Configure automatic retention: + +### By count + +Keep the N most recent records: + +```rust +use taskmill::{StoreConfig, RetentionPolicy}; + +let scheduler = Scheduler::builder() + .store_config(StoreConfig { + retention_policy: Some(RetentionPolicy::MaxCount(10_000)), + ..Default::default() + }) + .build() + .await?; +``` + +### By age + +Keep records from the last N days: + +```rust +let scheduler = Scheduler::builder() + .store_config(StoreConfig { + retention_policy: Some(RetentionPolicy::MaxAgeDays(90)), + ..Default::default() + }) + .build() + .await?; +``` + +### Pruning frequency + +Pruning is amortized — it runs every N task completions (default 100, configurable via `StoreConfig::prune_interval`). Pruning errors are logged but don't affect the completed task. 
+ +### Manual pruning + +```rust +let store = scheduler.store(); +let deleted = store.prune_history_by_count(5_000).await?; +let deleted = store.prune_history_by_age(30).await?; +``` + +## WAL mode + +The database uses SQLite WAL (Write-Ahead Logging) for concurrent reads with serialized writes. This means multiple readers can query task status while the scheduler is dispatching work. + +## Connection pooling + +The default pool size is 16 connections. Configure via `StoreConfig::max_connections`: + +```rust +let scheduler = Scheduler::builder() + .store_config(StoreConfig { + max_connections: 32, + ..Default::default() + }) + .build() + .await?; +``` + +## In-memory store for testing + +For tests, use an in-memory database that doesn't touch the filesystem: + +```rust +let store = TaskStore::open_memory().await?; +``` diff --git a/docs/priorities-and-preemption.md b/docs/priorities-and-preemption.md new file mode 100644 index 0000000..36b069a --- /dev/null +++ b/docs/priorities-and-preemption.md @@ -0,0 +1,105 @@ +# Priorities & Preemption + +## Priority levels + +Taskmill uses a 256-level priority scale where lower values mean higher priority. Five named constants are provided: + +| Constant | Value | Behavior | +|--------------|-------|----------| +| `REALTIME` | 0 | Never throttled. Triggers preemption of lower-priority work. | +| `HIGH` | 64 | Important work. Never throttled by the default policy. | +| `NORMAL` | 128 | Standard operations. Throttled at >75% pressure. | +| `BACKGROUND` | 192 | Deferred under moderate load. Throttled at >50% pressure. | +| `IDLE` | 255 | Runs only when the system is otherwise idle. Throttled at >50% pressure. | + +Custom values between tiers are supported: + +```rust +use taskmill::Priority; + +let custom = Priority::new(100); // between HIGH and NORMAL +``` + +## Queue ordering + +Tasks are popped from the queue in strict priority order (`ORDER BY priority ASC, id ASC`). 
Within the same priority tier, tasks are dispatched in insertion order (FIFO). + +A partial index on `(status, priority, id) WHERE status = 'pending'` keeps pop operations fast regardless of history size. + +## Preemption + +When a task with priority at or above `preempt_priority` (default: `REALTIME`) is submitted, the scheduler preempts lower-priority running work: + +1. **Cancel tokens** — the `CancellationToken` of every active task with lower priority (higher numeric value) is triggered. +2. **Pause in store** — preempted tasks are moved to `paused` status with `started_at` cleared. +3. **Emit events** — a `SchedulerEvent::Preempted` is emitted for each affected task. +4. **Resume later** — paused tasks are only re-dispatched when no active preemptors remain, preventing thrashing between competing priority tiers. + +### Handling preemption in executors + +Executors should check for cancellation at natural yield points: + +```rust +impl TaskExecutor for MyExecutor { + async fn execute<'a>( + &'a self, ctx: &'a TaskContext, + ) -> Result<TaskResult, TaskError> { + for chunk in chunks { + // Check before each unit of work + if ctx.token.is_cancelled() { + return Err(TaskError { + message: "preempted".into(), + retryable: true, + actual_read_bytes: bytes_read_so_far, + actual_write_bytes: bytes_written_so_far, + }); + } + + process(chunk).await; + ctx.progress.report_fraction(i, total, None); + } + + Ok(TaskResult { actual_read_bytes: total_read, actual_write_bytes: total_written }) + } +} +``` + +Returning a retryable error on preemption is optional — the scheduler handles pausing regardless. But it gives the executor a chance to report partial IO and clean up. + +### Configuring preemption threshold + +```rust +let scheduler = Scheduler::builder() + .preempt_priority(Priority::HIGH) // now HIGH and REALTIME both trigger preemption + .build() + .await?; +``` + +## Throttle behavior + +Throttling is independent of preemption. 
It controls whether a pending task is *dispatched*, not whether a running task is *interrupted*. + +The default three-tier `ThrottlePolicy`: + +| Priority tier | Throttled when pressure exceeds | +|---------------|-------------------------------| +| `BACKGROUND` (192+) | 50% | +| `NORMAL` (128+) | 75% | +| `HIGH` / `REALTIME` | Never | + +Pressure is an aggregate `0.0..=1.0` value from all registered `PressureSource` implementations (see [IO & Backpressure](io-and-backpressure.md)). + +### Custom throttle policies + +```rust +use taskmill::{ThrottlePolicy, Priority}; + +// Custom: throttle IDLE at 30%, BACKGROUND at 60%, NORMAL at 80% +let policy = ThrottlePolicy::new(vec![ + (Priority::IDLE, 0.3), + (Priority::BACKGROUND, 0.6), + (Priority::NORMAL, 0.8), +]); +``` + +Thresholds are evaluated from lowest priority (highest numeric value) first. A task is throttled if its priority is at or below the threshold tier and pressure exceeds the limit. diff --git a/docs/progress-reporting.md b/docs/progress-reporting.md new file mode 100644 index 0000000..2c65067 --- /dev/null +++ b/docs/progress-reporting.md @@ -0,0 +1,160 @@ +# Progress Reporting + +Taskmill provides real-time progress tracking for running tasks, combining executor-reported values with throughput-based extrapolation. 
+ +## Reporting from executors + +Executors receive a `ProgressReporter` via `ctx.progress`: + +```rust +impl TaskExecutor for MyExecutor { + async fn execute<'a>( + &'a self, ctx: &'a TaskContext, + ) -> Result { + let items = get_work_items(); + + for (i, item) in items.iter().enumerate() { + process(item).await; + + // Percentage-based (0.0 to 1.0) + ctx.progress.report( + (i + 1) as f32 / items.len() as f32, + Some(format!("processed {}/{}", i + 1, items.len())), + ); + } + + Ok(TaskResult { actual_read_bytes: 0, actual_write_bytes: 0 }) + } +} +``` + +### Fraction-based reporting + +For count-based progress: + +```rust +ctx.progress.report_fraction(processed, total, Some("importing".into())); +// Automatically computes: processed as f32 / total as f32 +``` + +## Progress events + +Every `report()` call emits a `SchedulerEvent::Progress`: + +```rust +SchedulerEvent::Progress { + task_id: 42, + task_type: "resize".into(), + key: "abc123".into(), + percent: 0.5, + message: Some("resizing".into()), +} +``` + +Subscribe to events for real-time UI updates: + +```rust +let mut events = scheduler.subscribe(); +tokio::spawn(async move { + while let Ok(event) = events.recv().await { + if let SchedulerEvent::Progress { task_id, percent, message, .. } = event { + update_ui(task_id, percent, message); + } + } +}); +``` + +## Throughput-based extrapolation + +For tasks that don't report progress (or between reports), the scheduler extrapolates based on historical data: + +1. Fetch `history_stats(task_type)` to get the average duration for this task type. +2. Compute throughput: `1.0 / avg_duration_ms` (completion fraction per millisecond). +3. Multiply by elapsed time since `started_at` to get an extrapolated percentage. +4. If the executor has reported partial progress, blend the historical throughput with the current rate. +5. Cap at 99% — extrapolation never reaches 100% to avoid false "complete" signals. 
+ +This means even tasks with no explicit progress reporting show movement in UI dashboards. + +## Querying progress + +### All running tasks + +```rust +let progress = scheduler.estimated_progress().await; +for p in &progress { + println!("{} ({}): {:.0}%", p.task_type, p.key, p.percent * 100.0); + // p.reported_percent — last executor-reported value (if any) + // p.extrapolated_percent — throughput-based estimate (if any) + // p.percent — best available: reported if present, else extrapolated +} +``` + +### Via snapshot + +The `SchedulerSnapshot` includes progress for all running tasks: + +```rust +let snap = scheduler.snapshot().await?; +for p in &snap.progress { + println!("{}: {:.0}%", p.key, p.percent * 100.0); +} +``` + +## Lifecycle events + +All scheduler state changes are broadcast as `SchedulerEvent` variants: + +| Event | When | +|-------|------| +| `Dispatched { task_id, task_type, key }` | Task popped from queue and executor spawned | +| `Completed { task_id, task_type, key }` | Task finished successfully | +| `Failed { task_id, task_type, key, error, will_retry }` | Task failed (includes whether it will be retried) | +| `Preempted { task_id, task_type, key }` | Task paused for higher-priority work | +| `Cancelled { task_id, task_type, key }` | Task cancelled via `scheduler.cancel()` | +| `Progress { task_id, task_type, key, percent, message }` | Progress update from executor | +| `Paused` | Scheduler globally paused via `pause_all()` | +| `Resumed` | Scheduler resumed via `resume_all()` | + +### Tauri bridge + +Bridge events to the frontend in a Tauri app: + +```rust +let mut events = scheduler.subscribe(); +let handle = app_handle.clone(); +tokio::spawn(async move { + while let Ok(event) = events.recv().await { + handle.emit("taskmill-event", &event).unwrap(); + } +}); +``` + +All events derive `Serialize`, so they can be sent directly over Tauri IPC. 
+ +## Dashboard snapshot + +For UI dashboards, `Scheduler::snapshot()` gathers all scheduler state in a single call: + +```rust +let snap = scheduler.snapshot().await?; +// snap.running — Vec<TaskRecord> of currently executing tasks +// snap.pending_count — number of tasks waiting to dispatch +// snap.paused_count — number of preempted tasks +// snap.progress — Vec<EstimatedProgress> for every running task +// snap.pressure — aggregate backpressure (0.0–1.0) +// snap.pressure_breakdown — per-source diagnostics: Vec<(String, f32)> +// snap.max_concurrency — current concurrency limit +// snap.is_paused — whether the scheduler is globally paused +``` + +Return directly from a Tauri command: + +```rust +#[tauri::command] +async fn scheduler_status( + scheduler: tauri::State<'_, Scheduler>, +) -> Result<SchedulerSnapshot, StoreError> { + scheduler.snapshot().await +} +``` diff --git a/docs/query-apis.md b/docs/query-apis.md new file mode 100644 index 0000000..f0acaee --- /dev/null +++ b/docs/query-apis.md @@ -0,0 +1,103 @@ +# Query APIs + +All queries are available on `TaskStore`, accessed via `scheduler.store()`. + +## Active task queries + +| Method | Returns | Description | +|--------|---------|-------------| +| `running_tasks()` | `Vec<TaskRecord>` | All running tasks, ordered by priority. | +| `running_count()` | `i64` | Count of running tasks. | +| `pending_tasks(limit)` | `Vec<TaskRecord>` | Pending tasks, ordered by priority then age. | +| `pending_count()` | `i64` | Count of pending tasks. | +| `pending_by_type(task_type)` | `Vec<TaskRecord>` | Pending tasks filtered by type. | +| `paused_tasks()` | `Vec<TaskRecord>` | All paused (preempted) tasks. | +| `paused_count()` | `i64` | Count of paused tasks. | +| `task_by_id(id)` | `Option<TaskRecord>` | Look up an active task by row ID. | +| `task_by_key(key)` | `Option<TaskRecord>` | Look up an active task by dedup key. | +| `running_io_totals()` | `(i64, i64)` | Sum of `(expected_read_bytes, expected_write_bytes)` across running tasks.
| + +## History queries + +| Method | Returns | Description | +|--------|---------|-------------| +| `history(limit, offset)` | `Vec<TaskHistoryRecord>` | Paginated history, newest first. | +| `history_by_type(task_type, limit)` | `Vec<TaskHistoryRecord>` | History filtered by task type. | +| `history_by_key(key)` | `Vec<TaskHistoryRecord>` | All past runs matching a dedup key. | +| `failed_tasks(limit)` | `Vec<TaskHistoryRecord>` | Recent failures. | + +## Aggregate queries + +| Method | Returns | Description | +|--------|---------|-------------| +| `history_stats(task_type)` | `TypeStats` | Aggregate stats: count, avg duration, avg IO, failure rate. | +| `avg_throughput(task_type, limit)` | `(f64, f64)` | Average `(read_bytes/sec, write_bytes/sec)` from recent completions. | + +### TypeStats fields + +| Field | Type | Description | +|-------|------|-------------| +| `count` | `i64` | Total completed tasks of this type. | +| `avg_duration_ms` | `f64` | Average wall-clock duration. | +| `avg_read_bytes` | `f64` | Average actual read bytes. | +| `avg_write_bytes` | `f64` | Average actual write bytes. | +| `failure_rate` | `f64` | Fraction of tasks that failed (0.0–1.0). | + +## Unified lookup + +Search both active and history tables by dedup key: + +```rust +use taskmill::TaskLookup; + +let lookup = scheduler.task_lookup("resize", "/photos/img.jpg").await?; +match lookup { + TaskLookup::Active(record) => { + println!("Status: {:?}, priority: {}", record.status, record.priority.value()); + } + TaskLookup::History(record) => { + println!("Completed at: {:?}, duration: {}ms", record.completed_at, record.duration_ms); + } + TaskLookup::NotFound => { + println!("No task found with this key"); + } +} +``` + +Or with typed tasks: + +```rust +let lookup = scheduler.lookup_typed(&ResizeTask { + path: "/photos/img.jpg".into(), + width: 300, +}).await?; +``` + +## Pruning + +| Method | Returns | Description | +|--------|---------|-------------| +| `prune_history_by_count(keep)` | `u64` | Delete all but the N most recent history records.
Returns count deleted. | +| `prune_history_by_age(days)` | `u64` | Delete history records older than N days. Returns count deleted. | + +## Usage example + +```rust +let store = scheduler.store(); + +// Dashboard data +let running = store.running_count().await?; +let pending = store.pending_count().await?; +let (read_io, write_io) = store.running_io_totals().await?; + +// Per-type analytics +let stats = store.history_stats("thumbnail").await?; +println!( + "thumbnail: {} completed, avg {:.0}ms, {:.1}% failure rate", + stats.count, stats.avg_duration_ms, stats.failure_rate * 100.0, +); + +// Paginated history for a UI table +let page = store.history(50, 0).await?; // first 50 +let page2 = store.history(50, 50).await?; // next 50 +``` diff --git a/docs/quick-start.md b/docs/quick-start.md new file mode 100644 index 0000000..bc3b871 --- /dev/null +++ b/docs/quick-start.md @@ -0,0 +1,211 @@ +# Quick Start + +## Installation + +Add taskmill to your `Cargo.toml`: + +```toml +[dependencies] +taskmill = { path = "crates/taskmill" } +``` + +To disable platform resource monitoring (e.g., for mobile targets): + +```toml +[dependencies] +taskmill = { path = "crates/taskmill", default-features = false } +``` + +## Implement an executor + +Each task type needs a `TaskExecutor` implementation. The executor receives a `TaskContext` containing: + +- `record` — the full `TaskRecord` with payload (up to 1 MiB), priority, retry count, etc. 
+- `token` — a `CancellationToken` for preemption support +- `progress` — a `ProgressReporter` for reporting progress back to the scheduler +- Shared application state (if registered via `.app_state()` or `register_state()`) + +```rust +use std::sync::Arc; +use taskmill::{TaskExecutor, TaskContext, TaskResult, TaskError}; + +struct ImageResizer; + +impl TaskExecutor for ImageResizer { + async fn execute<'a>( + &'a self, + ctx: &'a TaskContext, + ) -> Result { + // Deserialize your payload + let data: Option = ctx.record.deserialize_payload()?; + + // Check for preemption at yield points + if ctx.token.is_cancelled() { + return Err(TaskError { + message: "preempted".into(), + retryable: true, + actual_read_bytes: 0, + actual_write_bytes: 0, + }); + } + + // Report progress + ctx.progress.report(0.5, Some("resizing".into())); + + // Do work... + + Ok(TaskResult { + actual_read_bytes: 4096, + actual_write_bytes: 1024, + }) + } +} +``` + +## Build and run the scheduler + +```rust +use std::sync::Arc; +use std::time::Duration; +use tokio_util::sync::CancellationToken; +use taskmill::{Scheduler, Priority, TaskSubmission, ShutdownMode}; + +#[tokio::main] +async fn main() { + // Build the scheduler — opens the DB, registers executors, starts monitoring. + let scheduler = Scheduler::builder() + .store_path("tasks.db") + .executor("resize", Arc::new(ImageResizer)) + .max_concurrency(8) + .shutdown_mode(ShutdownMode::Graceful(Duration::from_secs(10))) + .with_resource_monitoring() + .build() + .await + .unwrap(); + + // Scheduler is Clone — share freely across async tasks and Tauri state. + let sched = scheduler.clone(); + + // Subscribe to lifecycle events for logging or UI updates. + let mut events = scheduler.subscribe(); + tokio::spawn(async move { + while let Ok(event) = events.recv().await { + println!("Event: {:?}", event); + } + }); + + // Submit a single task with a typed payload. 
+ scheduler.submit(&TaskSubmission::with_payload( + "resize", + Priority::NORMAL, + &serde_json::json!({"path": "/photos/image.jpg", "width": 300}), + 4096, // expected read bytes + 1024, // expected write bytes + ).unwrap()).await.unwrap(); + + // Submit tasks in bulk (single SQLite transaction). + let paths = vec!["/a.jpg", "/b.jpg", "/c.jpg"]; + let batch: Vec<_> = paths.iter().map(|p| { + TaskSubmission::with_payload( + "resize", + Priority::NORMAL, + &serde_json::json!({"path": p}), + 4096, 1024, + ).unwrap() + }).collect(); + let ids = scheduler.submit_batch(&batch).await.unwrap(); + // ids[i] is Some(row_id) if inserted, None if deduplicated. + + // Run the scheduler loop (blocks until the token is cancelled). + let token = CancellationToken::new(); + scheduler.run(token).await; +} +``` + +## Using typed tasks + +For stronger type safety, implement the `TypedTask` trait: + +```rust +use serde::{Serialize, Deserialize}; +use taskmill::{TypedTask, Priority}; + +#[derive(Serialize, Deserialize)] +struct ResizeTask { + path: String, + width: u32, +} + +impl TypedTask for ResizeTask { + const TASK_TYPE: &'static str = "resize"; + + fn expected_read_bytes(&self) -> i64 { 4096 } + fn expected_write_bytes(&self) -> i64 { 1024 } + fn priority(&self) -> Priority { Priority::NORMAL } +} + +// Submit: +scheduler.submit_typed(&ResizeTask { + path: "/photos/img.jpg".into(), + width: 300, +}).await?; + +// In the executor: +let task: Option = ctx.deserialize_typed()?; +``` + +## Manual wiring + +For full control over components, use `Scheduler::new()` directly: + +```rust +use std::sync::Arc; +use taskmill::{ + CompositePressure, Scheduler, SchedulerConfig, + TaskStore, TaskTypeRegistry, ThrottlePolicy, +}; + +let store = TaskStore::open("tasks.db").await.unwrap(); + +let mut registry = TaskTypeRegistry::new(); +registry.register("resize", Arc::new(ImageResizer)); + +let pressure = CompositePressure::new(); +let policy = ThrottlePolicy::default_three_tier(); + +let 
scheduler = Scheduler::new( + store, + SchedulerConfig::default(), + Arc::new(registry), + pressure, + policy, +); +``` + +## Tauri integration + +Taskmill is designed for Tauri. A typical setup: + +```rust +use tauri::Manager; +use taskmill::{Scheduler, SchedulerSnapshot, StoreError}; + +// Expose scheduler status to the frontend. +#[tauri::command] +async fn scheduler_status( + scheduler: tauri::State<'_, Scheduler>, +) -> Result<SchedulerSnapshot, StoreError> { + scheduler.snapshot().await +} + +// Bridge events to the frontend. +fn setup_events(app: &tauri::App, scheduler: &Scheduler) { + let mut events = scheduler.subscribe(); + let handle = app.handle().clone(); + tokio::spawn(async move { + while let Ok(event) = events.recv().await { + handle.emit("taskmill-event", &event).unwrap(); + } + }); +} +``` diff --git a/migrations/001_tasks.sql b/migrations/001_tasks.sql new file mode 100644 index 0000000..6e92f71 --- /dev/null +++ b/migrations/001_tasks.sql @@ -0,0 +1,64 @@ +-- Active queue: pending, running, and paused tasks. +-- The UNIQUE(key) constraint provides key-based deduplication — +-- submitting a task with an existing key is a no-op (INSERT OR IGNORE). +-- When a duplicate is submitted while the existing task is running or paused, +-- the requeue flag is set so the task re-runs after the current execution. +CREATE TABLE IF NOT EXISTS tasks ( + id INTEGER PRIMARY KEY, + task_type TEXT NOT NULL, + key TEXT NOT NULL, + priority INTEGER NOT NULL, + status TEXT NOT NULL DEFAULT 'pending', + payload BLOB, + expected_read_bytes INTEGER NOT NULL DEFAULT 0, + expected_write_bytes INTEGER NOT NULL DEFAULT 0, + retry_count INTEGER NOT NULL DEFAULT 0, + last_error TEXT, + created_at TEXT NOT NULL DEFAULT (datetime('now')), + started_at TEXT, + requeue INTEGER NOT NULL DEFAULT 0, + requeue_priority INTEGER, + UNIQUE(key) +); + +-- Index for the scheduler hot path: pop highest-priority pending task.
+CREATE INDEX IF NOT EXISTS idx_tasks_pending + ON tasks (status, priority ASC, id ASC) + WHERE status = 'pending'; + +-- Completed and failed task history for queries and IO learning. +CREATE TABLE IF NOT EXISTS task_history ( + id INTEGER PRIMARY KEY, + task_type TEXT NOT NULL, + key TEXT NOT NULL, + priority INTEGER NOT NULL, + status TEXT NOT NULL, + payload BLOB, + expected_read_bytes INTEGER NOT NULL DEFAULT 0, + expected_write_bytes INTEGER NOT NULL DEFAULT 0, + actual_read_bytes INTEGER, + actual_write_bytes INTEGER, + retry_count INTEGER NOT NULL DEFAULT 0, + last_error TEXT, + created_at TEXT NOT NULL, + started_at TEXT, + completed_at TEXT NOT NULL DEFAULT (datetime('now')), + duration_ms INTEGER +); + +-- Index for IO learning: recent completions by task type. +CREATE INDEX IF NOT EXISTS idx_history_type_completed + ON task_history (task_type, completed_at DESC) + WHERE status = 'completed'; + +-- Index for task lookup by key (used by task dedup and status checks). +CREATE INDEX IF NOT EXISTS idx_history_key + ON task_history (key, completed_at DESC); + +-- Index for paginating and pruning history by completion time. +CREATE INDEX IF NOT EXISTS idx_history_completed + ON task_history (completed_at DESC); + +-- Index for filtering history by status (e.g. listing failed tasks). +CREATE INDEX IF NOT EXISTS idx_history_status + ON task_history (status, completed_at DESC); diff --git a/release-plz.toml b/release-plz.toml new file mode 100644 index 0000000..54b6e80 --- /dev/null +++ b/release-plz.toml @@ -0,0 +1,22 @@ +[workspace] +# Only release when merging the release PR, not on every push +release_always = false + +# Enable changelog generation from conventional commits +changelog_update = true + +# Create GitHub releases with tag +git_release_enable = true +git_tag_enable = true + +# Auto-detect pre-release versions (e.g. 
0.1.0-alpha.1) and mark GitHub releases accordingly +git_release_type = "auto" + +# Check semver compatibility +semver_check = true + +# Label release PRs for easy identification +pr_labels = ["release"] + +# Create release PRs as drafts +pr_draft = true diff --git a/src/backpressure.rs b/src/backpressure.rs new file mode 100644 index 0000000..77dd4a7 --- /dev/null +++ b/src/backpressure.rs @@ -0,0 +1,162 @@ +use crate::priority::Priority; + +/// A source of pressure that signals the scheduler to slow down. +/// +/// Consumers implement this trait to feed external signals (API load, memory +/// pressure, queue depth, etc.) into the scheduler's throttle decisions. +pub trait PressureSource: Send + Sync + 'static { + /// Current pressure level between 0.0 (idle) and 1.0 (saturated). + fn pressure(&self) -> f32; + + /// Human-readable name for diagnostics and tracing. + fn name(&self) -> &str; +} + +/// Maps (priority, pressure) pairs to throttle decisions. +/// +/// Contains a list of thresholds: a task at or below a given priority +/// (higher numeric value = lower priority) is throttled when pressure +/// exceeds the associated limit. +/// +/// Thresholds are evaluated from lowest priority to highest. A task is +/// throttled as soon as any rule covering its priority has its limit exceeded. +pub struct ThrottlePolicy { + /// Sorted from lowest priority (highest numeric value) to highest. + /// Each entry: (priority_floor, pressure_limit). + thresholds: Vec<(Priority, f32)>, +} + +impl ThrottlePolicy { + /// Create a policy with custom thresholds. + /// + /// Each `(priority, limit)` means: any task with priority value >= `priority` + /// (i.e. lower or equal priority) is throttled when pressure > `limit`. + /// + /// Thresholds should be ordered from lowest priority to highest.
+ pub fn new(thresholds: Vec<(Priority, f32)>) -> Self { + Self { thresholds } + } + + /// Default three-tier policy matching Shoebox's original behavior: + /// - BACKGROUND (192+): pause at >50% pressure + /// - NORMAL (128+): pause at >75% pressure + /// - Everything else: never pause + pub fn default_three_tier() -> Self { + Self { + thresholds: vec![(Priority::BACKGROUND, 0.50), (Priority::NORMAL, 0.75)], + } + } + + /// Should a task at this priority be throttled given current pressure? + pub fn should_throttle(&self, priority: Priority, pressure: f32) -> bool { + for &(threshold_priority, pressure_limit) in &self.thresholds { + // If the task's priority value is >= threshold (lower or equal priority) + if priority.value() >= threshold_priority.value() && pressure > pressure_limit { + return true; + } + } + false + } +} + +/// Combines multiple pressure sources into a single aggregate signal. +/// +/// The aggregate pressure is the maximum across all sources — the system +/// is as pressured as its most constrained resource. +pub struct CompositePressure { + sources: Vec>, +} + +impl CompositePressure { + pub fn new() -> Self { + Self { + sources: Vec::new(), + } + } + + /// Add a pressure source. + pub fn add_source(&mut self, source: Box) { + self.sources.push(source); + } + + /// Aggregate pressure: max across all sources. + pub fn pressure(&self) -> f32 { + self.sources + .iter() + .map(|s| s.pressure()) + .fold(0.0f32, f32::max) + } + + /// Per-source breakdown for diagnostics. 
+ pub fn breakdown(&self) -> Vec<(&str, f32)> { + self.sources + .iter() + .map(|s| (s.name(), s.pressure())) + .collect() + } +} + +impl Default for CompositePressure { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + struct FixedPressure { + value: f32, + name: &'static str, + } + + impl PressureSource for FixedPressure { + fn pressure(&self) -> f32 { + self.value + } + fn name(&self) -> &str { + self.name + } + } + + #[test] + fn default_policy_background_throttles() { + let policy = ThrottlePolicy::default_three_tier(); + + // Background at 60% pressure → throttled (>50%) + assert!(policy.should_throttle(Priority::BACKGROUND, 0.6)); + // Background at 40% → not throttled + assert!(!policy.should_throttle(Priority::BACKGROUND, 0.4)); + + // Normal at 60% → not throttled (<75%) + assert!(!policy.should_throttle(Priority::NORMAL, 0.6)); + // Normal at 80% → throttled + assert!(policy.should_throttle(Priority::NORMAL, 0.8)); + + // Realtime never throttled + assert!(!policy.should_throttle(Priority::REALTIME, 1.0)); + assert!(!policy.should_throttle(Priority::HIGH, 0.6)); + } + + #[test] + fn composite_takes_max() { + let mut comp = CompositePressure::new(); + comp.add_source(Box::new(FixedPressure { + value: 0.3, + name: "api", + })); + comp.add_source(Box::new(FixedPressure { + value: 0.7, + name: "disk", + })); + + assert!((comp.pressure() - 0.7).abs() < f32::EPSILON); + } + + #[test] + fn empty_composite_is_zero() { + let comp = CompositePressure::new(); + assert_eq!(comp.pressure(), 0.0); + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..ee3db34 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,49 @@ +//! # Taskmill +//! +//! Adaptive priority work scheduler with IO-aware concurrency and SQLite persistence. +//! +//! Taskmill provides a generic task scheduling system that: +//! - Persists tasks to SQLite so the queue survives restarts +//! 
- Schedules by priority (0 = highest, 255 = lowest) with named tiers +//! - Deduplicates tasks by key — submitting an already-queued key is a no-op +//! - Tracks expected and actual IO bytes per task for budget-based scheduling +//! - Monitors system CPU and disk throughput to adjust concurrency +//! - Supports composable backpressure from arbitrary external sources +//! - Preempts lower-priority work when high-priority tasks arrive +//! - Retries failed tasks at the same priority level +//! - Records completed/failed task history for queries and IO learning +//! - Emits lifecycle events including progress for UI integration (via broadcast channel) +//! - Supports graceful shutdown with configurable drain timeout +//! +//! # Feature flags +//! +//! - **`sysinfo-monitor`** (default): Enables the built-in `SysinfoSampler` for +//! cross-platform CPU and disk IO monitoring. Disable for mobile targets or +//! when providing a custom `ResourceSampler`. + +pub mod backpressure; +pub mod priority; +pub mod registry; +pub mod resource; +pub mod scheduler; +pub mod store; +pub mod task; + +// Convenience re-exports. 
+pub use backpressure::{CompositePressure, PressureSource, ThrottlePolicy}; +pub use priority::Priority; +pub use registry::{StateMap, TaskContext, TaskExecutor}; +pub use resource::sampler::{SamplerConfig, SmoothedReader}; +pub use resource::{ResourceReader, ResourceSampler, ResourceSnapshot}; +pub use scheduler::{ + EstimatedProgress, ProgressReporter, Scheduler, SchedulerBuilder, SchedulerConfig, + SchedulerEvent, SchedulerSnapshot, ShutdownMode, +}; +pub use store::{RetentionPolicy, StoreConfig, StoreError, TaskStore}; +pub use task::{ + generate_dedup_key, HistoryStatus, SubmitOutcome, TaskError, TaskHistoryRecord, TaskLookup, + TaskRecord, TaskResult, TaskStatus, TaskSubmission, TypeStats, TypedTask, +}; + +#[cfg(feature = "sysinfo-monitor")] +pub use resource::platform_sampler; diff --git a/src/priority.rs b/src/priority.rs new file mode 100644 index 0000000..aa9767d --- /dev/null +++ b/src/priority.rs @@ -0,0 +1,108 @@ +use std::cmp::Ordering; +use std::fmt; + +use serde::{Deserialize, Serialize}; + +/// Numeric priority level. Lower values = higher priority. +/// +/// Provides named constants for common tiers while allowing any value 0–255 +/// for fine-grained control. +#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(transparent)] +pub struct Priority(u8); + +impl Priority { + /// User-interactive work. Never throttled, triggers preemption. + pub const REALTIME: Self = Self(0); + /// Functionality-blocking tasks. Throttled only under extreme load. + pub const HIGH: Self = Self(64); + /// Normal background operations. Yields to interactive work. + pub const NORMAL: Self = Self(128); + /// Low priority. Pauses under significant load. + pub const BACKGROUND: Self = Self(192); + /// Idle-only work. Runs only when system is otherwise idle. + pub const IDLE: Self = Self(255); + + /// Construct a priority from a raw value. 0 = highest, 255 = lowest. + pub const fn new(level: u8) -> Self { + Self(level) + } + + /// Raw numeric value. 
+ pub const fn value(self) -> u8 { + self.0 + } +} + +/// Ordering: lower numeric value = higher priority = compares as Greater. +/// This makes `BinaryHeap` (max-heap) pop the highest-priority item first. +impl Ord for Priority { + fn cmp(&self, other: &Self) -> Ordering { + // Reverse: lower value is "greater" (higher priority). + other.0.cmp(&self.0) + } +} + +impl PartialOrd for Priority { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl fmt::Debug for Priority { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let label = match self.0 { + 0 => "REALTIME", + 64 => "HIGH", + 128 => "NORMAL", + 192 => "BACKGROUND", + 255 => "IDLE", + _ => return write!(f, "Priority({})", self.0), + }; + write!(f, "Priority::{label}") + } +} + +impl fmt::Display for Priority { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0) + } +} + +impl From for Priority { + fn from(v: u8) -> Self { + Self(v) + } +} + +impl From for u8 { + fn from(p: Priority) -> Self { + p.0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn realtime_is_highest() { + assert!(Priority::REALTIME > Priority::HIGH); + assert!(Priority::HIGH > Priority::NORMAL); + assert!(Priority::NORMAL > Priority::BACKGROUND); + assert!(Priority::BACKGROUND > Priority::IDLE); + } + + #[test] + fn custom_priorities_between_tiers() { + let p = Priority::new(96); + assert!(p < Priority::HIGH); // lower priority than HIGH + assert!(p > Priority::NORMAL); // higher priority than NORMAL + } + + #[test] + fn debug_named_tiers() { + assert_eq!(format!("{:?}", Priority::REALTIME), "Priority::REALTIME"); + assert_eq!(format!("{:?}", Priority::new(42)), "Priority(42)"); + } +} diff --git a/src/registry.rs b/src/registry.rs new file mode 100644 index 0000000..f903c80 --- /dev/null +++ b/src/registry.rs @@ -0,0 +1,276 @@ +use std::any::{Any, TypeId}; +use std::collections::HashMap; +use std::future::Future; +use std::sync::Arc; + 
+use tokio::sync::RwLock; +use tokio_util::sync::CancellationToken; + +use crate::scheduler::ProgressReporter; +use crate::task::{TaskError, TaskRecord, TaskResult, TypedTask}; + +// ── State Map ──────────────────────────────────────────────────────── + +/// Type-keyed map of shared application state. +/// +/// Multiple state types can be registered (one value per concrete type). +/// Executors retrieve them via [`TaskContext::state::()`]. This is the +/// same pattern used by Axum `Extensions` and Tauri `State`. +/// +/// The map supports post-build insertion via [`Scheduler::register_state`] +/// so that library consumers (e.g. shoebox inside a Tauri app) can inject +/// state after the scheduler has been constructed by the parent. +#[derive(Default)] +pub struct StateMap { + inner: RwLock>>, +} + +impl StateMap { + pub fn new() -> Self { + Self::default() + } + + /// Build a `StateMap` from pre-collected entries. + pub(crate) fn from_entries(entries: Vec<(TypeId, Arc)>) -> Self { + Self { + inner: RwLock::new(entries.into_iter().collect()), + } + } + + /// Insert a state value. Overwrites any previous value of the same type. + pub async fn insert(&self, value: Arc) { + self.inner.write().await.insert(TypeId::of::(), value); + } +} + +/// Snapshot of state for passing into a [`TaskContext`]. +/// +/// Created by cloning the inner map under the lock once, then used +/// lock-free for the lifetime of the task execution. +#[derive(Clone, Default)] +pub(crate) struct StateSnapshot { + entries: HashMap>, +} + +impl StateSnapshot { + pub fn get(&self) -> Option<&T> { + self.entries + .get(&TypeId::of::()) + .and_then(|arc| arc.downcast_ref::()) + } +} + +impl StateMap { + /// Take a lock-free snapshot for use inside a task context. 
+ pub(crate) async fn snapshot(&self) -> StateSnapshot { + StateSnapshot { + entries: self.inner.read().await.clone(), + } + } +} + +// ── Task Context ───────────────────────────────────────────────────── + +/// Execution context passed to a [`TaskExecutor`]. +/// +/// Bundles the task record, cancellation token, progress reporter, and +/// optional application state into a single value. This keeps the executor +/// signature stable when new contextual data is added in the future. +pub struct TaskContext { + /// The full task record including payload, priority, and IO estimates. + pub record: TaskRecord, + /// Cancelled when the task is preempted. Check `token.is_cancelled()` + /// at natural yield points and return early if set. + pub token: CancellationToken, + /// Report progress back to the scheduler (0.0–1.0). + pub progress: ProgressReporter, + /// Shared application state set via [`SchedulerBuilder::app_state`]. + pub(crate) app_state: StateSnapshot, +} + +impl TaskContext { + /// Deserialize the payload as a [`TypedTask`]. + /// + /// Convenience wrapper around [`TaskRecord::deserialize_payload`] that + /// mirrors the typed submission API. + pub fn deserialize_typed(&self) -> Result, serde_json::Error> { + self.record.deserialize_payload() + } + + /// Retrieve shared application state registered via + /// [`SchedulerBuilder::app_state`] or [`Scheduler::register_state`]. + /// + /// Returns `None` if the type was never registered. Multiple types can + /// coexist — each is keyed by its concrete `TypeId`. + /// + /// # Example + /// + /// ```ignore + /// struct MyServices { db: DatabasePool, http: reqwest::Client } + /// + /// // In the executor: + /// let svc = ctx.state::().expect("app state not set"); + /// svc.db.query("...").await?; + /// ``` + pub fn state(&self) -> Option<&T> { + self.app_state.get::() + } +} + +/// Executes tasks of a registered type. +/// +/// Each executor is associated with a named task type (e.g. `"scan-l3"`, `"exif"`). 
+/// When the scheduler pops a task, it looks up the executor by `task_type` and +/// calls `execute` with a [`TaskContext`] containing the persisted record, +/// a cancellation token, and a progress reporter. +/// +/// Implementors deserialize the task's `payload` blob themselves — taskmill +/// treats it as opaque bytes. +/// +/// # Example +/// +/// ```ignore +/// use taskmill::{TaskExecutor, TaskContext, TaskResult, TaskError}; +/// +/// struct MyExecutor; +/// +/// impl TaskExecutor for MyExecutor { +/// async fn execute<'a>( +/// &'a self, +/// ctx: &'a TaskContext, +/// ) -> Result { +/// ctx.progress.report(0.5, Some("halfway".into())); +/// Ok(TaskResult { actual_read_bytes: 0, actual_write_bytes: 0 }) +/// } +/// } +/// ``` +pub trait TaskExecutor: Send + Sync + 'static { + /// Execute a task. + /// + /// - `ctx`: Execution context with the task record, cancellation token, + /// and progress reporter. + /// + /// On success, return actual IO bytes consumed. On failure, return a + /// `TaskError` indicating whether retry is appropriate. + fn execute<'a>( + &'a self, + ctx: &'a TaskContext, + ) -> impl Future> + Send + 'a; +} + +/// Registry mapping task type names to their executors. +/// +/// Built during application startup before the scheduler begins popping tasks. +/// After construction, the registry is immutable (shared via `Arc`). +pub struct TaskTypeRegistry { + types: HashMap>, +} + +/// Object-safe wrapper around [`TaskExecutor`] for dynamic dispatch in the registry. +/// +/// This trait exists because RPITIT (`impl Future`) in `TaskExecutor` is not +/// object-safe. The blanket impl below automatically wraps any `TaskExecutor` +/// so callers never interact with `ErasedExecutor` directly. 
+pub(crate) trait ErasedExecutor: Send + Sync + 'static { + fn execute_erased<'a>( + &'a self, + ctx: &'a TaskContext, + ) -> std::pin::Pin> + Send + 'a>>; +} + +impl ErasedExecutor for T { + fn execute_erased<'a>( + &'a self, + ctx: &'a TaskContext, + ) -> std::pin::Pin> + Send + 'a>> { + Box::pin(self.execute(ctx)) + } +} + +impl TaskTypeRegistry { + pub fn new() -> Self { + Self { + types: HashMap::new(), + } + } + + /// Register an executor for a named task type. + /// + /// Panics if the name is already registered (catch configuration errors + /// at startup, not at runtime). + pub fn register(&mut self, name: &str, executor: Arc) { + if self.types.contains_key(name) { + panic!("task type '{name}' already registered"); + } + self.types + .insert(name.to_string(), executor as Arc); + } + + /// Look up the executor for a task type. + pub(crate) fn get(&self, name: &str) -> Option<&Arc> { + self.types.get(name) + } + + /// All registered type names. + pub fn type_names(&self) -> Vec<&str> { + self.types.keys().map(|s| s.as_str()).collect() + } + + /// Number of registered types. + pub fn len(&self) -> usize { + self.types.len() + } + + pub fn is_empty(&self) -> bool { + self.types.is_empty() + } + + /// Register a pre-erased executor. Used by the builder which already holds + /// `Arc`. 
+ pub(crate) fn register_erased(&mut self, name: &str, executor: Arc) { + if self.types.contains_key(name) { + panic!("task type '{name}' already registered"); + } + self.types.insert(name.to_string(), executor); + } +} + +impl Default for TaskTypeRegistry { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + struct NoopExecutor; + + impl TaskExecutor for NoopExecutor { + async fn execute<'a>(&'a self, _ctx: &'a TaskContext) -> Result { + Ok(TaskResult { + actual_read_bytes: 0, + actual_write_bytes: 0, + }) + } + } + + #[test] + fn register_and_lookup() { + let mut reg = TaskTypeRegistry::new(); + reg.register("test-type", Arc::new(NoopExecutor)); + + assert!(reg.get("test-type").is_some()); + assert!(reg.get("unknown").is_none()); + assert_eq!(reg.len(), 1); + } + + #[test] + #[should_panic(expected = "already registered")] + fn duplicate_registration_panics() { + let mut reg = TaskTypeRegistry::new(); + reg.register("dup", Arc::new(NoopExecutor)); + reg.register("dup", Arc::new(NoopExecutor)); + } +} diff --git a/src/resource/mod.rs b/src/resource/mod.rs new file mode 100644 index 0000000..5b7aae4 --- /dev/null +++ b/src/resource/mod.rs @@ -0,0 +1,63 @@ +pub mod sampler; + +#[cfg(feature = "sysinfo-monitor")] +pub mod sysinfo_monitor; + +use serde::{Deserialize, Serialize}; + +/// Point-in-time snapshot of system resource utilization. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ResourceSnapshot { + /// CPU utilization 0.0 to 1.0 (EWMA-smoothed). + pub cpu_usage: f64, + /// Disk read throughput in bytes/sec (EWMA-smoothed). + pub io_read_bytes_per_sec: f64, + /// Disk write throughput in bytes/sec (EWMA-smoothed). + pub io_write_bytes_per_sec: f64, +} + +impl Default for ResourceSnapshot { + fn default() -> Self { + Self { + cpu_usage: 0.0, + io_read_bytes_per_sec: 0.0, + io_write_bytes_per_sec: 0.0, + } + } +} + +/// Trait for sampling raw system resources. 
+/// +/// Implementations read platform-specific counters and return raw deltas. +/// The sampler loop handles EWMA smoothing separately. +/// +/// To override the built-in monitor (e.g. for container cgroup-aware monitoring), +/// implement this trait and pass it to the scheduler. +pub trait ResourceSampler: Send + Sync + 'static { + /// Take a raw sample. Called periodically by the sampler loop. + /// Returns a snapshot with absolute values (not smoothed — the sampler + /// applies EWMA). + fn sample(&mut self) -> ResourceSnapshot; +} + +/// Read-only access to the latest smoothed resource snapshot. +/// +/// This is the interface consumed by the scheduler for IO budget decisions. +/// The sampler loop updates it; the scheduler reads it. Separating this from +/// [`ResourceSampler`] keeps the public API clean — consumers only see the +/// latest reading, never the sampling mechanics. +pub trait ResourceReader: Send + Sync + 'static { + /// The most recent smoothed snapshot. + fn latest(&self) -> ResourceSnapshot; +} + +/// Create the platform-appropriate sampler. +/// +/// Uses `sysinfo` for cross-platform CPU and disk IO monitoring on +/// Linux, macOS, and Windows. +/// +/// Only available with the `sysinfo-monitor` feature (enabled by default). +#[cfg(feature = "sysinfo-monitor")] +pub fn platform_sampler() -> Box { + Box::new(sysinfo_monitor::SysinfoSampler::new()) +} diff --git a/src/resource/sampler.rs b/src/resource/sampler.rs new file mode 100644 index 0000000..160e686 --- /dev/null +++ b/src/resource/sampler.rs @@ -0,0 +1,153 @@ +use std::sync::Arc; +use std::time::Duration; + +use tokio::sync::RwLock; +use tokio_util::sync::CancellationToken; + +use super::{ResourceReader, ResourceSampler, ResourceSnapshot}; + +/// Configuration for the background resource sampling loop. +pub struct SamplerConfig { + /// How often to sample system resources. Default: 1 second. + pub interval: Duration, + /// EWMA smoothing factor (alpha). Default: 0.3. 
+ /// Higher = more responsive to changes, lower = smoother. + pub ewma_alpha: f64, +} + +impl Default for SamplerConfig { + fn default() -> Self { + Self { + interval: Duration::from_secs(1), + ewma_alpha: 0.3, + } + } +} + +/// Apply EWMA smoothing: new_value = alpha * raw + (1 - alpha) * old. +fn ewma(old: f64, raw: f64, alpha: f64) -> f64 { + if old == 0.0 { + raw // First sample — no history to blend with. + } else { + alpha * raw + (1.0 - alpha) * old + } +} + +/// Shared, lock-protected store for the latest smoothed snapshot. +/// +/// The sampler loop writes to this; the scheduler reads from it. +/// Uses `RwLock` so readers never block each other. +#[derive(Clone)] +pub struct SmoothedReader { + inner: Arc>, +} + +impl SmoothedReader { + pub fn new() -> Self { + Self { + inner: Arc::new(RwLock::new(ResourceSnapshot::default())), + } + } + + async fn update(&self, snapshot: ResourceSnapshot) { + *self.inner.write().await = snapshot; + } +} + +impl Default for SmoothedReader { + fn default() -> Self { + Self::new() + } +} + +impl ResourceReader for SmoothedReader { + fn latest(&self) -> ResourceSnapshot { + // Use try_read to avoid async in a sync trait method. + // If the lock is held by the writer, return the default (zero) snapshot + // which makes the scheduler skip IO budgeting for that cycle. + self.inner + .try_read() + .map(|guard| guard.clone()) + .unwrap_or_default() + } +} + +/// Run the resource sampling loop in the background. +/// +/// Periodically calls `sampler.sample()`, applies EWMA smoothing, and +/// stores the result in the `reader`. The scheduler reads +/// `reader.latest()` when making IO budget decisions. 
+pub async fn run_sampler( + sampler: Arc>>, + reader: SmoothedReader, + config: SamplerConfig, + token: CancellationToken, +) { + tracing::debug!( + interval_ms = config.interval.as_millis() as u64, + alpha = config.ewma_alpha, + "resource sampler started" + ); + + let mut smoothed = ResourceSnapshot::default(); + + loop { + tokio::select! { + _ = token.cancelled() => { + tracing::debug!("resource sampler shutting down"); + break; + } + _ = tokio::time::sleep(config.interval) => { + let raw = sampler.lock().await.sample(); + + smoothed.cpu_usage = ewma(smoothed.cpu_usage, raw.cpu_usage, config.ewma_alpha); + smoothed.io_read_bytes_per_sec = ewma( + smoothed.io_read_bytes_per_sec, + raw.io_read_bytes_per_sec, + config.ewma_alpha, + ); + smoothed.io_write_bytes_per_sec = ewma( + smoothed.io_write_bytes_per_sec, + raw.io_write_bytes_per_sec, + config.ewma_alpha, + ); + + reader.update(smoothed.clone()).await; + + tracing::trace!( + cpu = format!("{:.1}%", smoothed.cpu_usage * 100.0), + read_mbps = format!("{:.1}", smoothed.io_read_bytes_per_sec / 1_048_576.0), + write_mbps = format!("{:.1}", smoothed.io_write_bytes_per_sec / 1_048_576.0), + "resource sample" + ); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn ewma_first_sample_is_raw() { + assert_eq!(ewma(0.0, 42.0, 0.3), 42.0); + } + + #[test] + fn ewma_blends_with_history() { + let result = ewma(100.0, 0.0, 0.3); + // 0.3 * 0 + 0.7 * 100 = 70 + assert!((result - 70.0).abs() < 0.01); + } + + #[test] + fn ewma_converges() { + let mut v = 0.0; + for _ in 0..50 { + v = ewma(v, 100.0, 0.3); + } + // Should converge to ~100 + assert!((v - 100.0).abs() < 1.0); + } +} diff --git a/src/resource/sysinfo_monitor.rs b/src/resource/sysinfo_monitor.rs new file mode 100644 index 0000000..a2e7592 --- /dev/null +++ b/src/resource/sysinfo_monitor.rs @@ -0,0 +1,94 @@ +use std::time::Instant; + +use sysinfo::{Disks, System}; + +use crate::resource::{ResourceSampler, ResourceSnapshot}; + +/// 
Cross-platform resource sampler using the `sysinfo` crate. +/// +/// Works on Linux, macOS, and Windows. Tracks CPU utilization and +/// aggregate disk IO throughput across all mounted disks. +pub struct SysinfoSampler { + sys: System, + disks: Disks, + prev_read_bytes: u64, + prev_write_bytes: u64, + prev_sample: Option, +} + +impl SysinfoSampler { + pub fn new() -> Self { + let mut sys = System::new(); + sys.refresh_cpu_usage(); + + let disks = Disks::new_with_refreshed_list(); + + // Take initial disk totals so first delta is meaningful. + let (read, write) = disk_totals(&disks); + + Self { + sys, + disks, + prev_read_bytes: read, + prev_write_bytes: write, + prev_sample: Some(Instant::now()), + } + } +} + +impl Default for SysinfoSampler { + fn default() -> Self { + Self::new() + } +} + +impl ResourceSampler for SysinfoSampler { + fn sample(&mut self) -> ResourceSnapshot { + // CPU: sysinfo needs two refresh calls to compute usage delta. + self.sys.refresh_cpu_usage(); + let cpu_usage = self.sys.global_cpu_usage() as f64 / 100.0; + + // Disk IO: compute bytes/sec since last sample. + self.disks.refresh(true); + let (read_bytes, write_bytes) = disk_totals(&self.disks); + let now = Instant::now(); + + let (read_bps, write_bps) = if let Some(prev_ts) = self.prev_sample { + let elapsed = now.duration_since(prev_ts).as_secs_f64(); + if elapsed > 0.0 { + let read_delta = read_bytes.saturating_sub(self.prev_read_bytes); + let write_delta = write_bytes.saturating_sub(self.prev_write_bytes); + (read_delta as f64 / elapsed, write_delta as f64 / elapsed) + } else { + (0.0, 0.0) + } + } else { + (0.0, 0.0) + }; + + self.prev_read_bytes = read_bytes; + self.prev_write_bytes = write_bytes; + self.prev_sample = Some(now); + + ResourceSnapshot { + cpu_usage, + io_read_bytes_per_sec: read_bps, + io_write_bytes_per_sec: write_bps, + } + } +} + +/// Sum read/write bytes across all disks. 
+fn disk_totals(disks: &Disks) -> (u64, u64) { + let mut total_read = 0u64; + let mut total_write = 0u64; + for disk in disks.list() { + // sysinfo::Disk exposes usage(); total/available space but not IO counters + // directly. We use the disk-level process IO as a proxy. + // Note: sysinfo 0.33+ tracks disk IO via the Disks API on supported platforms. + let usage = disk.usage(); + total_read += usage.read_bytes; + total_write += usage.written_bytes; + } + (total_read, total_write) +} diff --git a/src/scheduler/dispatch.rs b/src/scheduler/dispatch.rs new file mode 100644 index 0000000..e55b173 --- /dev/null +++ b/src/scheduler/dispatch.rs @@ -0,0 +1,310 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use tokio::sync::Mutex; +use tokio_util::sync::CancellationToken; + +use crate::priority::Priority; +use crate::registry::TaskContext; +use crate::store::TaskStore; +use crate::task::TaskRecord; + +use super::progress::ProgressReporter; +use super::SchedulerEvent; + +// ── Active Task ──────────────────────────────────────────────────── + +/// Handle to a running task for preemption and progress tracking. +pub(crate) struct ActiveTask { + pub record: TaskRecord, + pub token: CancellationToken, + /// Last reported progress from the executor (0.0 to 1.0). + pub reported_progress: Option, + /// When the last progress report was received. + pub reported_at: Option>, +} + +// ── Active Task Map ──────────────────────────────────────────────── + +/// Thread-safe map of currently running tasks. +/// +/// Wraps the active-task bookkeeping that was previously inlined in +/// `Scheduler`, making preemption and progress queries independently +/// testable. 
+#[derive(Clone)] +pub(crate) struct ActiveTaskMap { + inner: Arc>>, +} + +impl ActiveTaskMap { + pub fn new() -> Self { + Self { + inner: Arc::new(Mutex::new(HashMap::new())), + } + } + + pub async fn count(&self) -> usize { + self.inner.lock().await.len() + } + + pub async fn insert(&self, id: i64, task: ActiveTask) { + self.inner.lock().await.insert(id, task); + } + + pub async fn remove(&self, id: i64) -> Option { + self.inner.lock().await.remove(&id) + } + + /// Snapshot of all active task records. + pub async fn records(&self) -> Vec { + self.inner + .lock() + .await + .values() + .map(|at| at.record.clone()) + .collect() + } + + /// Snapshot of progress data for all active tasks. + pub async fn progress_snapshots( + &self, + ) -> Vec<( + TaskRecord, + Option, + Option>, + )> { + self.inner + .lock() + .await + .values() + .map(|at| (at.record.clone(), at.reported_progress, at.reported_at)) + .collect() + } + + /// Update reported progress for a specific task. + pub async fn update_progress(&self, task_id: i64, percent: f32) { + let mut map = self.inner.lock().await; + if let Some(at) = map.get_mut(&task_id) { + at.reported_progress = Some(percent); + at.reported_at = Some(chrono::Utc::now()); + } + } + + /// Preempt active tasks with priority lower than the incoming priority. + /// + /// Cancels their tokens, pauses them in the store, and emits + /// `SchedulerEvent::Preempted`. Returns the IDs of preempted tasks. 
+ pub async fn preempt_below( + &self, + incoming_priority: Priority, + store: &TaskStore, + event_tx: &tokio::sync::broadcast::Sender, + ) -> Vec { + let mut active = self.inner.lock().await; + let to_preempt: Vec = active + .iter() + .filter(|(_, at)| at.record.priority.value() > incoming_priority.value()) + .map(|(id, _)| *id) + .collect(); + + let mut preempted = Vec::new(); + for id in to_preempt { + if let Some(at) = active.remove(&id) { + tracing::info!( + task_id = id, + task_type = at.record.task_type, + "preempting task for higher-priority work" + ); + at.token.cancel(); + let _ = store.pause(id).await; + let _ = event_tx.send(SchedulerEvent::Preempted { + task_id: id, + task_type: at.record.task_type.clone(), + key: at.record.key.clone(), + }); + preempted.push(id); + } + } + + preempted + } + + /// Check whether any active task would preempt work at the given priority. + pub async fn has_preemptors_for( + &self, + priority: Priority, + preempt_threshold: Priority, + ) -> bool { + let active = self.inner.lock().await; + active.values().any(|at| { + at.record.priority.value() <= preempt_threshold.value() + && at.record.priority.value() < priority.value() + }) + } + + /// Cancel all active tasks (for shutdown). + pub async fn cancel_all(&self) { + let mut active = self.inner.lock().await; + for (_, at) in active.drain() { + at.token.cancel(); + } + } + + /// Pause all active tasks: cancel their tokens and move them to paused + /// state in the store. Returns the number of tasks paused. 
+ pub async fn pause_all( + &self, + store: &TaskStore, + event_tx: &tokio::sync::broadcast::Sender, + ) -> usize { + let mut active = self.inner.lock().await; + let count = active.len(); + for (id, at) in active.drain() { + at.token.cancel(); + let _ = store.pause(id).await; + let _ = event_tx.send(SchedulerEvent::Preempted { + task_id: id, + task_type: at.record.task_type.clone(), + key: at.record.key.clone(), + }); + } + count + } +} + +// ── Spawn ────────────────────────────────────────────────────────── + +/// Spawn a task executor and wire up completion/failure handling. +/// +/// Inserts the task into the active map, starts a progress listener, +/// and spawns the executor. +pub(crate) async fn spawn_task( + task: TaskRecord, + executor: Arc, + store: TaskStore, + active: ActiveTaskMap, + event_tx: tokio::sync::broadcast::Sender, + max_retries: i32, + app_state: crate::registry::StateSnapshot, +) { + let child_token = CancellationToken::new(); + + // Insert into active map before spawning to avoid races. + active + .insert( + task.id, + ActiveTask { + record: task.clone(), + token: child_token.clone(), + reported_progress: None, + reported_at: None, + }, + ) + .await; + + // Build execution context. + let ctx = TaskContext { + record: task.clone(), + token: child_token.clone(), + progress: ProgressReporter::new( + task.id, + task.task_type.clone(), + task.key.clone(), + event_tx.clone(), + ), + app_state, + }; + + // Emit dispatched event. + let _ = event_tx.send(SchedulerEvent::Dispatched { + task_id: task.id, + task_type: task.task_type.clone(), + key: task.key.clone(), + }); + + // Spawn progress listener — bridges broadcast events into the active map. + let active_for_progress = active.clone(); + let mut progress_rx = event_tx.subscribe(); + let progress_task_id = task.id; + tokio::spawn(async move { + while let Ok(evt) = progress_rx.recv().await { + if let SchedulerEvent::Progress { + task_id, percent, .. 
+ } = evt + { + if task_id == progress_task_id { + active_for_progress.update_progress(task_id, percent).await; + if percent >= 1.0 { + break; + } + } + } + } + }); + + // Spawn executor. + let token_for_spawn = child_token.clone(); + tokio::spawn(async move { + let task_id = task.id; + let result = executor.execute_erased(&ctx).await; + + // Drop the context (and its progress reporter) — executor is done. + drop(ctx); + + match result { + Ok(tr) => { + if let Err(e) = store.complete(task_id, &tr).await { + tracing::error!(task_id, error = %e, "failed to record task completion"); + } + // Remove from active tracking AFTER the store write completes. + // This keeps the concurrency slot occupied, preventing the + // scheduler from dispatching new tasks that would create + // concurrent SQLite write transactions (which cause SQLITE_BUSY). + active.remove(task_id).await; + let _ = event_tx.send(SchedulerEvent::Completed { + task_id, + task_type: task.task_type.clone(), + key: task.key.clone(), + }); + } + Err(te) => { + // If cancelled (preempted), the scheduler already paused it. + if token_for_spawn.is_cancelled() { + active.remove(task_id).await; + return; + } + let will_retry = te.retryable && task.retry_count < max_retries; + tracing::warn!( + task_id, + task_type = task.task_type, + error = %te.message, + retryable = te.retryable, + will_retry, + "task failed" + ); + if let Err(e) = store + .fail( + task_id, + &te.message, + te.retryable, + max_retries, + te.actual_read_bytes, + te.actual_write_bytes, + ) + .await + { + tracing::error!(task_id, error = %e, "failed to record task failure"); + } + // Remove from active tracking AFTER the store write completes. 
+ active.remove(task_id).await; + let _ = event_tx.send(SchedulerEvent::Failed { + task_id, + task_type: task.task_type.clone(), + key: task.key.clone(), + error: te.message, + will_retry, + }); + } + } + }); +} diff --git a/src/scheduler/gate.rs b/src/scheduler/gate.rs new file mode 100644 index 0000000..af04120 --- /dev/null +++ b/src/scheduler/gate.rs @@ -0,0 +1,181 @@ +use std::future::Future; +use std::pin::Pin; +use std::sync::Arc; + +use crate::backpressure::{CompositePressure, ThrottlePolicy}; +use crate::resource::ResourceReader; +use crate::store::{StoreError, TaskStore}; +use crate::task::TaskRecord; + +/// Boxed future returned by [`DispatchGate`] methods. +type BoxFuture<'a, T> = Pin + Send + 'a>>; + +// ── Gate Context ─────────────────────────────────────────────────── + +/// Context provided to a [`DispatchGate`] for admission decisions. +/// +/// Built by the scheduler each dispatch cycle so gate implementations +/// can query store state and resource snapshots without owning them. +pub struct GateContext<'a> { + /// The task store — available for queries like running IO totals. + pub store: &'a TaskStore, + /// The current resource reader, if monitoring is enabled. + pub resource_reader: Option<&'a Arc>, +} + +// ── Dispatch Gate ────────────────────────────────────────────────── + +/// Decides whether a popped task should be dispatched or requeued. +/// +/// The scheduler calls [`admit`](DispatchGate::admit) after popping a +/// task from the store but before spawning the executor. Returning +/// `Ok(false)` causes the task to be requeued for a later cycle. +/// +/// The default [`DefaultDispatchGate`] applies backpressure throttling +/// and IO-budget checks. Custom implementations can add per-type rate +/// limiting, cost-model gating, feature flags, etc. 
+/// +/// # Example +/// +/// ```ignore +/// use taskmill::scheduler::gate::{DispatchGate, GateContext}; +/// use taskmill::store::StoreError; +/// use taskmill::task::TaskRecord; +/// +/// struct AlwaysAdmit; +/// +/// impl DispatchGate for AlwaysAdmit { +/// fn admit<'a>( +/// &'a self, +/// _task: &'a TaskRecord, +/// _ctx: &'a GateContext<'a>, +/// ) -> std::pin::Pin> + Send + 'a>> { +/// Box::pin(async { Ok(true) }) +/// } +/// } +/// ``` +pub trait DispatchGate: Send + Sync + 'static { + /// Check whether `task` should be dispatched given the current context. + /// + /// Return `Ok(true)` to dispatch, `Ok(false)` to requeue. + fn admit<'a>( + &'a self, + task: &'a TaskRecord, + ctx: &'a GateContext<'a>, + ) -> BoxFuture<'a, Result>; + + /// Current aggregate pressure (0.0–1.0). Returns 0.0 by default. + fn pressure<'a>(&'a self) -> BoxFuture<'a, f32> { + Box::pin(async { 0.0 }) + } + + /// Per-source pressure breakdown for diagnostics. Empty by default. + fn pressure_breakdown<'a>(&'a self) -> BoxFuture<'a, Vec<(String, f32)>> { + Box::pin(async { Vec::new() }) + } +} + +// ── Default Gate ─────────────────────────────────────────────────── + +/// Default gate: backpressure throttling + IO budget. +/// +/// This is what the scheduler uses unless you provide a custom gate via +/// [`SchedulerBuilder::dispatch_gate`](super::SchedulerBuilder::dispatch_gate). +pub struct DefaultDispatchGate { + pub(crate) pressure: tokio::sync::Mutex, + pub(crate) policy: ThrottlePolicy, +} + +impl DefaultDispatchGate { + pub fn new(pressure: CompositePressure, policy: ThrottlePolicy) -> Self { + Self { + pressure: tokio::sync::Mutex::new(pressure), + policy, + } + } +} + +impl DispatchGate for DefaultDispatchGate { + fn admit<'a>( + &'a self, + task: &'a TaskRecord, + ctx: &'a GateContext<'a>, + ) -> BoxFuture<'a, Result> { + Box::pin(async move { + // Backpressure check. 
+ let current_pressure = self.pressure.lock().await.pressure(); + if self.policy.should_throttle(task.priority, current_pressure) { + tracing::trace!( + priority = task.priority.value(), + pressure = current_pressure, + "task throttled by backpressure — requeuing" + ); + return Ok(false); + } + + // IO budget check. + if !has_io_headroom(task, ctx).await? { + tracing::trace!( + task_type = task.task_type, + expected_read = task.expected_read_bytes, + expected_write = task.expected_write_bytes, + "task deferred — IO budget exhausted — requeuing" + ); + return Ok(false); + } + + Ok(true) + }) + } + + fn pressure<'a>(&'a self) -> BoxFuture<'a, f32> { + Box::pin(async { self.pressure.lock().await.pressure() }) + } + + fn pressure_breakdown<'a>(&'a self) -> BoxFuture<'a, Vec<(String, f32)>> { + Box::pin(async { + self.pressure + .lock() + .await + .breakdown() + .into_iter() + .map(|(name, val)| (name.to_owned(), val)) + .collect() + }) + } +} + +// ── IO Budget ────────────────────────────────────────────────────── + +/// Check if there is IO headroom for a task given current running IO +/// and system capacity. +/// +/// This is a utility function that custom [`DispatchGate`] implementations +/// can reuse if they want IO-budget awareness alongside their own logic. +pub async fn has_io_headroom(task: &TaskRecord, ctx: &GateContext<'_>) -> Result { + let Some(reader) = ctx.resource_reader else { + // No monitor configured — always allow. + return Ok(true); + }; + + let snapshot = reader.latest(); + // If we have no IO data yet, allow the task. + if snapshot.io_read_bytes_per_sec == 0.0 && snapshot.io_write_bytes_per_sec == 0.0 { + return Ok(true); + } + + let (running_read, running_write) = ctx.store.running_io_totals().await?; + + // Simple heuristic: if running tasks' expected IO already exceeds + // 80% of observed system throughput (per second × 2s budget window), + // defer new work. 
+ let read_capacity = snapshot.io_read_bytes_per_sec * 2.0; + let write_capacity = snapshot.io_write_bytes_per_sec * 2.0; + + let read_ok = read_capacity == 0.0 + || (running_read + task.expected_read_bytes) as f64 <= read_capacity * 0.8; + let write_ok = write_capacity == 0.0 + || (running_write + task.expected_write_bytes) as f64 <= write_capacity * 0.8; + + Ok(read_ok && write_ok) +} diff --git a/src/scheduler/mod.rs b/src/scheduler/mod.rs new file mode 100644 index 0000000..84ce287 --- /dev/null +++ b/src/scheduler/mod.rs @@ -0,0 +1,1526 @@ +pub(crate) mod dispatch; +pub(crate) mod gate; +pub mod progress; + +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering as AtomicOrdering}; +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; +use tokio::sync::{Mutex, Notify}; +use tokio::time::Duration; +use tokio_util::sync::CancellationToken; + +use crate::backpressure::{CompositePressure, ThrottlePolicy}; +use crate::priority::Priority; +use crate::registry::{TaskExecutor, TaskTypeRegistry}; +use crate::resource::sampler::{SamplerConfig, SmoothedReader}; +use crate::resource::{ResourceReader, ResourceSampler}; +use crate::store::{StoreConfig, StoreError, TaskStore}; +use crate::task::{generate_dedup_key, SubmitOutcome, TaskLookup, TaskSubmission, TypedTask}; + +use dispatch::ActiveTaskMap; +use gate::{DefaultDispatchGate, GateContext}; + +pub use progress::{EstimatedProgress, ProgressReporter}; + +// ── Snapshot ──────────────────────────────────────────────────────── + +/// Single-call status snapshot for dashboard UIs. +/// +/// Captures queue depths, running tasks, progress, and backpressure in +/// one serializable struct — ideal for returning from a Tauri command. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SchedulerSnapshot { + /// Tasks currently executing. + pub running: Vec, + /// Number of tasks waiting to be dispatched. + pub pending_count: i64, + /// Number of tasks paused (preempted). 
+ pub paused_count: i64, + /// Progress estimates for every running task. + pub progress: Vec, + /// Aggregate backpressure (0.0–1.0). + pub pressure: f32, + /// Per-source pressure breakdown for diagnostics. + pub pressure_breakdown: Vec<(String, f32)>, + /// Current maximum concurrency setting. + pub max_concurrency: usize, + /// Whether the scheduler is globally paused. + pub is_paused: bool, +} + +// ── Events ────────────────────────────────────────────────────────── + +/// Events emitted by the scheduler for UI integration and observability. +/// +/// Subscribe via the `tokio::sync::broadcast::Receiver` returned by +/// [`Scheduler::subscribe`] or passed through the builder. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "type", content = "data")] +pub enum SchedulerEvent { + /// A task was dispatched and is now running. + Dispatched { + task_id: i64, + task_type: String, + key: String, + }, + /// A task completed successfully. + Completed { + task_id: i64, + task_type: String, + key: String, + }, + /// A task failed (may be retried or permanently failed). + Failed { + task_id: i64, + task_type: String, + key: String, + error: String, + will_retry: bool, + }, + /// A task was preempted by higher-priority work. + Preempted { + task_id: i64, + task_type: String, + key: String, + }, + /// A task was cancelled by the application. + Cancelled { + task_id: i64, + task_type: String, + key: String, + }, + /// Progress update from a running task. + Progress { + task_id: i64, + task_type: String, + key: String, + /// Progress percentage (0.0 to 1.0). + percent: f32, + /// Optional human-readable message from the executor. + message: Option, + }, + /// The scheduler was globally paused via [`Scheduler::pause_all`]. + Paused, + /// The scheduler was resumed via [`Scheduler::resume_all`]. + Resumed, +} + +// ── Config ────────────────────────────────────────────────────────── + +/// How the scheduler behaves during shutdown. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ShutdownMode { + /// Cancel all running tasks immediately (default). + Hard, + /// Stop accepting new dispatches, wait for running tasks to complete + /// (up to the given timeout), then cancel any remaining. + Graceful(Duration), +} + +/// Scheduler configuration. +pub struct SchedulerConfig { + /// Maximum concurrent running tasks. Adjusted dynamically via + /// [`Scheduler::set_max_concurrency`]. + pub max_concurrency: usize, + /// Maximum retries before permanent failure. Default: 3. + pub max_retries: i32, + /// Priority threshold: tasks at or above this priority (lower numeric value) + /// trigger preemption of lower-priority running tasks. + pub preempt_priority: Priority, + /// Interval between scheduler polls when idle. Default: 500ms. + pub poll_interval: Duration, + /// How many recent tasks to consider for IO throughput estimation. + pub throughput_sample_size: i32, + /// Shutdown behavior. Default: Hard. + pub shutdown_mode: ShutdownMode, +} + +impl Default for SchedulerConfig { + fn default() -> Self { + Self { + max_concurrency: 4, + max_retries: 3, + preempt_priority: Priority::REALTIME, + poll_interval: Duration::from_millis(500), + throughput_sample_size: 20, + shutdown_mode: ShutdownMode::Hard, + } + } +} + +// ── Scheduler ─────────────────────────────────────────────────────── + +/// Shared inner state behind `Arc` so `Scheduler` can be `Clone`. +#[allow(dead_code)] +struct SchedulerInner { + store: TaskStore, + max_concurrency: AtomicUsize, + max_retries: i32, + preempt_priority: Priority, + poll_interval: Duration, + throughput_sample_size: i32, + shutdown_mode: ShutdownMode, + registry: Arc, + gate: Box, + resource_reader: Mutex>>, + /// In-memory tracking of active tasks and their cancellation tokens. + active: ActiveTaskMap, + /// Broadcast channel for lifecycle events. + event_tx: tokio::sync::broadcast::Sender, + /// Token to cancel the background resource sampler (if started). 
+ sampler_token: CancellationToken, + /// Type-keyed application state passed to every executor via [`TaskContext::state`]. + app_state: Arc, + /// Global pause flag — when `true`, the run loop skips dispatching. + paused: AtomicBool, + /// Wakes the run loop when new work is submitted or the scheduler is resumed. + work_notify: Notify, +} + +/// IO-aware priority scheduler. +/// +/// Coordinates task execution by: +/// 1. Popping highest-priority pending tasks from the SQLite store +/// 2. Checking IO budget against running task estimates and system capacity +/// 3. Applying backpressure throttling based on external pressure sources +/// 4. Preempting lower-priority tasks when high-priority work arrives +/// 5. Managing retries and failure recording +/// 6. Emitting lifecycle events for UI integration +/// +/// `Scheduler` is `Clone` — each clone shares the same underlying state. +/// This makes it easy to hold in `tauri::State` or share across +/// async tasks. +#[derive(Clone)] +pub struct Scheduler { + inner: Arc, +} + +impl Scheduler { + pub fn new( + store: TaskStore, + config: SchedulerConfig, + registry: Arc, + pressure: CompositePressure, + policy: ThrottlePolicy, + ) -> Self { + let gate = Box::new(DefaultDispatchGate::new(pressure, policy)); + Self::with_gate( + store, + config, + registry, + gate, + Arc::new(crate::registry::StateMap::new()), + ) + } + + /// Create a scheduler with a custom dispatch gate. 
+ fn with_gate( + store: TaskStore, + config: SchedulerConfig, + registry: Arc, + gate: Box, + app_state: Arc, + ) -> Self { + let (event_tx, _) = tokio::sync::broadcast::channel(256); + Self { + inner: Arc::new(SchedulerInner { + store, + max_concurrency: AtomicUsize::new(config.max_concurrency), + max_retries: config.max_retries, + preempt_priority: config.preempt_priority, + poll_interval: config.poll_interval, + throughput_sample_size: config.throughput_sample_size, + shutdown_mode: config.shutdown_mode, + registry, + gate, + resource_reader: Mutex::new(None), + active: ActiveTaskMap::new(), + event_tx, + sampler_token: CancellationToken::new(), + app_state, + paused: AtomicBool::new(false), + work_notify: Notify::new(), + }), + } + } + + /// Create a [`SchedulerBuilder`] for ergonomic construction. + pub fn builder() -> SchedulerBuilder { + SchedulerBuilder::new() + } + + /// Subscribe to scheduler lifecycle events. + /// + /// Returns a broadcast receiver. Events are emitted on task dispatch, + /// completion, failure, preemption, cancellation, and progress. Useful for + /// bridging to a Tauri frontend or updating UI state. + pub fn subscribe(&self) -> tokio::sync::broadcast::Receiver { + self.inner.event_tx.subscribe() + } + + /// Set the resource reader for IO-aware scheduling. + pub async fn set_resource_reader(&self, reader: Arc) { + *self.inner.resource_reader.lock().await = Some(reader); + } + + /// Get a reference to the underlying store for direct queries. + pub fn store(&self) -> &TaskStore { + &self.inner.store + } + + /// Register shared application state after the scheduler has been built. + /// + /// This is useful when library code (e.g. shoebox) needs to inject its + /// own state into a scheduler that was constructed by a parent + /// application. Multiple types can coexist — each is keyed by `TypeId`. + pub async fn register_state(&self, state: Arc) { + self.inner.app_state.insert(state).await; + } + + /// Submit a task. 
+ /// + /// If the task's priority meets the preemption threshold, running tasks + /// with lower priority are preempted (their cancellation tokens are cancelled + /// and they are paused in the store). + pub async fn submit(&self, sub: &TaskSubmission) -> Result { + let outcome = self.inner.store.submit(sub).await?; + + if !matches!(outcome, SubmitOutcome::Duplicate) { + // Preempt if this is a high-priority task. + if sub.priority.value() <= self.inner.preempt_priority.value() { + self.inner + .active + .preempt_below(sub.priority, &self.inner.store, &self.inner.event_tx) + .await; + } + + // Wake the scheduler loop so it picks up the new/upgraded task. + self.inner.work_notify.notify_one(); + } + + Ok(outcome) + } + + /// Submit multiple tasks in a single SQLite transaction. + /// + /// Preemption is triggered once at the end if any inserted or upgraded + /// task has high enough priority. + pub async fn submit_batch( + &self, + submissions: &[TaskSubmission], + ) -> Result, StoreError> { + let results = self.inner.store.submit_batch(submissions).await?; + + // Find the highest (lowest numeric value) priority among tasks that + // were inserted or had their priority upgraded. + let best_priority = submissions + .iter() + .zip(results.iter()) + .filter(|(_, outcome)| !matches!(outcome, SubmitOutcome::Duplicate)) + .map(|(sub, _)| sub.priority) + .min_by_key(|p| p.value()); + + let any_changed = results + .iter() + .any(|o| !matches!(o, SubmitOutcome::Duplicate)); + + if let Some(priority) = best_priority { + if priority.value() <= self.inner.preempt_priority.value() { + self.inner + .active + .preempt_below(priority, &self.inner.store, &self.inner.event_tx) + .await; + } + } + + if any_changed { + self.inner.work_notify.notify_one(); + } + + Ok(results) + } + + /// Submit a [`TypedTask`], handling serialization automatically. + /// + /// Uses the priority from [`TypedTask::priority()`]. 
+ pub async fn submit_typed(&self, task: &T) -> Result { + let sub = TaskSubmission::from_typed(task)?; + self.submit(&sub).await + } + + /// Submit a [`TypedTask`] with an explicit priority override. + /// + /// The provided `priority` replaces whatever [`TypedTask::priority()`] + /// would return, keeping priority out of the serialized payload. + pub async fn submit_typed_at( + &self, + task: &T, + priority: Priority, + ) -> Result { + let mut sub = TaskSubmission::from_typed(task)?; + sub.priority = priority; + self.submit(&sub).await + } + + /// Look up a task by the same inputs used during submission. + /// + /// Computes the dedup key from `task_type` and `dedup_input` (the + /// explicit key string or payload bytes — whichever was used when + /// submitting), then checks the active queue and history in one call. + /// + /// # Examples + /// + /// ```ignore + /// // Using an explicit key (same as TaskSubmission.key = Some("my-file.jpg")) + /// let result = scheduler.task_lookup("thumbnail", Some(b"my-file.jpg")).await?; + /// + /// // Using payload-based dedup (same as TaskSubmission.key = None, payload = ...) + /// let result = scheduler.task_lookup("ingest", Some(&payload_bytes)).await?; + /// ``` + pub async fn task_lookup( + &self, + task_type: &str, + dedup_input: Option<&[u8]>, + ) -> Result { + let key = generate_dedup_key(task_type, dedup_input); + self.inner.store.task_lookup(&key).await + } + + /// Look up a [`TypedTask`] by value, using its serialized form as the + /// dedup input. + /// + /// This mirrors [`submit_typed`](Self::submit_typed) — pass the same + /// struct you would submit and get back its current status. + pub async fn lookup_typed(&self, task: &T) -> Result { + let payload = serde_json::to_vec(task)?; + let key = generate_dedup_key(T::TASK_TYPE, Some(&payload)); + self.inner.store.task_lookup(&key).await + } + + /// Cancel a task by id. 
+ /// + /// If the task is currently running, its cancellation token is triggered + /// and it is removed from the active map. If it is pending or paused, + /// it is deleted from the store. Returns `true` if the task was found + /// and cancelled. + pub async fn cancel(&self, task_id: i64) -> Result { + // Check if it's an active (running) task first. + if let Some(at) = self.inner.active.remove(task_id).await { + at.token.cancel(); + self.inner.store.delete(task_id).await?; + let _ = self.inner.event_tx.send(SchedulerEvent::Cancelled { + task_id, + task_type: at.record.task_type.clone(), + key: at.record.key.clone(), + }); + return Ok(true); + } + + // Not active — try to delete from the queue (pending/paused). + let deleted = self.inner.store.delete(task_id).await?; + Ok(deleted) + } + + /// Try to pop and execute the next task. + /// + /// Returns `true` if a task was dispatched, `false` if no work was available + /// (empty queue, concurrency limit, IO budget exhausted, or throttled). + pub async fn try_dispatch(&self) -> Result { + // Check concurrency limit. + let active_count = self.inner.active.count().await; + let max = self.inner.max_concurrency.load(AtomicOrdering::Relaxed); + if active_count >= max { + return Ok(false); + } + + // Peek at the next candidate without changing its status. + let Some(candidate) = self.inner.store.peek_next().await? else { + return Ok(false); + }; + + // Build gate context from current state. + let reader_guard = self.inner.resource_reader.lock().await; + let gate_ctx = GateContext { + store: &self.inner.store, + resource_reader: reader_guard.as_ref(), + }; + + // Admission check while the task is still pending — no running + // window if the gate rejects. + if !self.inner.gate.admit(&candidate, &gate_ctx).await? { + drop(reader_guard); + return Ok(false); + } + drop(reader_guard); + + // Atomically claim the task. Returns None if another dispatcher + // claimed it (or it was cancelled) between peek and now. 
+ let Some(task) = self.inner.store.pop_by_id(candidate.id).await? else { + return Ok(false); + }; + + // Look up executor. + let Some(executor) = self.inner.registry.get(&task.task_type) else { + tracing::error!( + task_type = task.task_type, + "no executor registered — failing task" + ); + self.inner + .store + .fail( + task.id, + &format!("no executor registered for type '{}'", task.task_type), + false, + 0, + 0, + 0, + ) + .await?; + return Ok(true); + }; + let executor = Arc::clone(executor); + + // Spawn the task — this inserts into the active map, builds the + // context, emits Dispatched, and wires up completion handling. + dispatch::spawn_task( + task, + executor, + self.inner.store.clone(), + self.inner.active.clone(), + self.inner.event_tx.clone(), + self.inner.max_retries, + self.inner.app_state.snapshot().await, + ) + .await; + + Ok(true) + } + + /// Run the scheduler loop until the cancellation token is triggered. + /// + /// This is the main entry point. The loop wakes on three conditions: + /// 1. Cancellation — triggers shutdown. + /// 2. Notification — a task was submitted or the scheduler was resumed. + /// 3. Poll interval — periodic housekeeping (e.g. resuming paused tasks). + /// + /// On mobile targets (iOS/Android), the notify-based wake avoids the + /// constant 500ms polling that would otherwise prevent the CPU from sleeping. + pub async fn run(&self, token: CancellationToken) { + tracing::info!( + max_concurrency = self.inner.max_concurrency.load(AtomicOrdering::Relaxed), + "taskmill scheduler started" + ); + + loop { + tokio::select! { + _ = token.cancelled() => { + tracing::info!("taskmill scheduler shutting down"); + self.shutdown().await; + break; + } + _ = self.inner.work_notify.notified() => { + self.poll_and_dispatch().await; + } + _ = tokio::time::sleep(self.inner.poll_interval) => { + self.poll_and_dispatch().await; + } + } + } + } + + /// Resume paused tasks and dispatch pending work. 
+ async fn poll_and_dispatch(&self) { + if self.is_paused() { + return; + } + + // Resume paused tasks only if no active preemptors exist. + if let Ok(paused) = self.inner.store.paused_tasks().await { + for task in paused { + if !self + .inner + .active + .has_preemptors_for(task.priority, self.inner.preempt_priority) + .await + { + let _ = self.inner.store.resume(task.id).await; + } + } + } + + // Try to dispatch tasks until we can't. + loop { + match self.try_dispatch().await { + Ok(true) => continue, + Ok(false) => break, + Err(e) => { + tracing::error!(error = %e, "scheduler dispatch error"); + break; + } + } + } + } + + /// Perform shutdown according to the configured `ShutdownMode`. + async fn shutdown(&self) { + // Stop the resource sampler. + self.inner.sampler_token.cancel(); + + match self.inner.shutdown_mode { + ShutdownMode::Hard => { + self.inner.active.cancel_all().await; + } + ShutdownMode::Graceful(timeout) => { + tracing::info!( + timeout_ms = timeout.as_millis() as u64, + "graceful shutdown — waiting for running tasks" + ); + + let deadline = tokio::time::Instant::now() + timeout; + loop { + let count = self.inner.active.count().await; + if count == 0 { + tracing::info!("all tasks completed during graceful shutdown"); + break; + } + if tokio::time::Instant::now() >= deadline { + tracing::warn!( + remaining = count, + "graceful shutdown timeout — cancelling remaining tasks" + ); + self.inner.active.cancel_all().await; + break; + } + tokio::time::sleep(Duration::from_millis(50)).await; + } + } + } + + // Flush WAL and close the database. + self.inner.store.close().await; + } + + /// Snapshot of currently active (in-memory) tasks. + pub async fn active_tasks(&self) -> Vec { + self.inner.active.records().await + } + + /// Get estimated progress for all running tasks. + /// + /// Combines executor-reported progress with throughput-based extrapolation + /// using historical average duration for each task type. 
+ pub async fn estimated_progress(&self) -> Vec { + let snapshots: Vec<_> = self.inner.active.progress_snapshots().await; + let mut results = Vec::with_capacity(snapshots.len()); + for (record, reported, reported_at) in snapshots { + results.push( + progress::extrapolate(&record, reported, reported_at, &self.inner.store).await, + ); + } + results + } + + /// Capture a single status snapshot for dashboard UIs. + /// + /// Gathers running tasks, queue depths, progress estimates, and + /// backpressure in one call — exactly what a Tauri command would + /// return to the frontend. + pub async fn snapshot(&self) -> Result { + let running = self.inner.active.records().await; + let pending_count = self.inner.store.pending_count().await?; + let paused_count = self.inner.store.paused_count().await?; + let progress = self.estimated_progress().await; + let pressure = self.inner.gate.pressure().await; + let pressure_breakdown = self.inner.gate.pressure_breakdown().await; + let max_concurrency = self.max_concurrency(); + + Ok(SchedulerSnapshot { + running, + pending_count, + paused_count, + progress, + pressure, + pressure_breakdown, + max_concurrency, + is_paused: self.is_paused(), + }) + } + + /// Update max concurrency at runtime (e.g., from adaptive controller or + /// in response to battery/thermal state). + pub fn set_max_concurrency(&self, limit: usize) { + self.inner + .max_concurrency + .store(limit, AtomicOrdering::Relaxed); + tracing::info!(new_limit = limit, "concurrency limit updated"); + } + + /// Read current max concurrency setting. + pub fn max_concurrency(&self) -> usize { + self.inner.max_concurrency.load(AtomicOrdering::Relaxed) + } + + /// Pause the entire scheduler. + /// + /// Stops the run loop from dispatching new tasks and pauses all + /// currently running tasks (their cancellation tokens are triggered + /// and they are moved back to the `paused` state in the store so + /// they will be re-dispatched on resume). 
/// Resume the scheduler after a [`pause_all`](Self::pause_all).
///
/// Clears the pause flag and wakes the run loop, so dispatching restarts
/// without waiting for the next poll tick. Tasks that were paused in the
/// store are picked up by the run loop's regular resume pass. Finally a
/// [`SchedulerEvent::Resumed`] is emitted.
pub async fn resume_all(&self) {
    // Clear the flag before waking the loop: the woken loop checks
    // `is_paused()` first and would go back to sleep if it still saw `true`.
    self.inner.paused.store(false, AtomicOrdering::Release);
    self.inner.work_notify.notify_one();
    // The send result is deliberately ignored.
    let _ = self.inner.event_tx.send(SchedulerEvent::Resumed);
    tracing::info!("scheduler resumed");
}
+/// +/// # Example +/// +/// ```no_run +/// # async fn example() -> Result<(), Box> { +/// use std::sync::Arc; +/// use taskmill::{Scheduler, Priority}; +/// +/// let scheduler = Scheduler::builder() +/// .store_path("tasks.db") +/// // .executor("scan", Arc::new(my_scan_executor)) +/// .max_concurrency(8) +/// .with_resource_monitoring() +/// .build() +/// .await?; +/// # Ok(()) +/// # } +/// ``` +pub struct SchedulerBuilder { + store_path: Option, + store_config: StoreConfig, + store: Option, + executors: Vec<(String, Arc)>, + config: SchedulerConfig, + pressure_sources: Vec>, + policy: Option, + enable_resource_monitoring: bool, + custom_sampler: Option>, + sampler_config: SamplerConfig, + app_state_entries: Vec<(std::any::TypeId, Arc)>, +} + +impl SchedulerBuilder { + pub fn new() -> Self { + Self { + store_path: None, + store_config: StoreConfig::default(), + store: None, + executors: Vec::new(), + config: SchedulerConfig::default(), + pressure_sources: Vec::new(), + policy: None, + enable_resource_monitoring: false, + custom_sampler: None, + sampler_config: SamplerConfig::default(), + app_state_entries: Vec::new(), + } + } + + /// Set the SQLite database path. Either this or [`store`] must be called. + pub fn store_path(mut self, path: &str) -> Self { + self.store_path = Some(path.to_string()); + self + } + + /// Configure the SQLite connection pool. + pub fn store_config(mut self, config: StoreConfig) -> Self { + self.store_config = config; + self + } + + /// Use a pre-opened [`TaskStore`] instead of opening one from a path. + pub fn store(mut self, store: TaskStore) -> Self { + self.store = Some(store); + self + } + + /// Register a task executor for a named type. + pub fn executor(mut self, name: &str, executor: Arc) -> Self { + self.executors.push(( + name.to_string(), + executor as Arc, + )); + self + } + + /// Register an executor using the task type name from a [`TypedTask`]. + /// + /// Equivalent to `.executor(T::TASK_TYPE, executor)`. 
+ pub fn typed_executor(self, executor: Arc) -> Self { + self.executor(T::TASK_TYPE, executor) + } + + /// Set maximum concurrent tasks. Default: 4. + pub fn max_concurrency(mut self, limit: usize) -> Self { + self.config.max_concurrency = limit; + self + } + + /// Set maximum retries before permanent failure. Default: 3. + pub fn max_retries(mut self, retries: i32) -> Self { + self.config.max_retries = retries; + self + } + + /// Set the priority threshold for preemption. Default: REALTIME. + pub fn preempt_priority(mut self, priority: Priority) -> Self { + self.config.preempt_priority = priority; + self + } + + /// Set the poll interval. Default: 500ms. + pub fn poll_interval(mut self, interval: Duration) -> Self { + self.config.poll_interval = interval; + self + } + + /// Set the shutdown mode. Default: Hard. + pub fn shutdown_mode(mut self, mode: ShutdownMode) -> Self { + self.config.shutdown_mode = mode; + self + } + + /// Add a backpressure source (used by the default gate). + pub fn pressure_source( + mut self, + source: Box, + ) -> Self { + self.pressure_sources.push(source); + self + } + + /// Set a custom throttle policy (used by the default gate). Default: three-tier. + pub fn throttle_policy(mut self, policy: ThrottlePolicy) -> Self { + self.policy = Some(policy); + self + } + + /// Enable platform resource monitoring (CPU, disk IO) using `sysinfo`. + /// + /// This starts a background sampler task that feeds IO data to the + /// scheduler for budget-based dispatch decisions. The sampler is + /// automatically stopped when the scheduler shuts down. + pub fn with_resource_monitoring(mut self) -> Self { + self.enable_resource_monitoring = true; + self + } + + /// Provide a custom [`ResourceSampler`] instead of the default platform one. + pub fn resource_sampler(mut self, sampler: Box) -> Self { + self.custom_sampler = Some(sampler); + self.enable_resource_monitoring = true; + self + } + + /// Configure the resource sampler loop. 
+ pub fn sampler_config(mut self, config: SamplerConfig) -> Self { + self.sampler_config = config; + self + } + + /// Register shared application state accessible from every executor via + /// [`TaskContext::state`]. + /// + /// Multiple types can be registered — each is keyed by its concrete + /// `TypeId`. Calling this twice with the same `T` overwrites the + /// previous value. + /// + /// The state is stored as `Arc` internally, so it is shared (not + /// cloned) across all running tasks. This mirrors how Axum, Actix, and + /// Tauri handle shared application state. + /// + /// # Example + /// + /// ```ignore + /// struct AppServices { http: reqwest::Client, db: DatabasePool } + /// + /// let services = AppServices { /* ... */ }; + /// Scheduler::builder() + /// .app_state(services) + /// .build() + /// .await?; + /// ``` + pub fn app_state(self, state: T) -> Self { + self.app_state_arc(Arc::new(state)) + } + + /// Register shared application state from a pre-existing `Arc`. + /// + /// Use this instead of [`app_state`](Self::app_state) when you already + /// have an `Arc` and need to retain a handle for use outside the + /// scheduler (e.g. to populate `OnceLock` fields after build). Avoids + /// double-wrapping (`Arc>`), which would cause + /// [`TaskContext::state`] downcasts to fail. + /// + /// Multiple types can be registered — each is keyed by its concrete + /// `TypeId`. + pub fn app_state_arc(mut self, state: Arc) -> Self { + self.app_state_entries + .push((std::any::TypeId::of::(), state)); + self + } + + /// Build the scheduler. Opens the database and wires all components. + /// + /// If resource monitoring is enabled, the sampler background loop is + /// started and will be stopped automatically when the scheduler shuts + /// down (via the token passed to [`Scheduler::run`]). + pub async fn build(self) -> Result { + // Open or use provided store. 
+ let store = if let Some(store) = self.store { + store + } else if let Some(path) = &self.store_path { + TaskStore::open_with_config(path, self.store_config).await? + } else { + return Err(StoreError::Database( + "SchedulerBuilder requires either store_path() or store()".into(), + )); + }; + + // Build registry. + let mut registry = TaskTypeRegistry::new(); + for (name, executor) in self.executors { + if registry.get(&name).is_some() { + panic!("task type '{name}' already registered"); + } + registry.register_erased(&name, executor); + } + + // Build gate from pressure sources + policy. + let mut pressure = CompositePressure::new(); + for source in self.pressure_sources { + pressure.add_source(source); + } + let policy = self + .policy + .unwrap_or_else(ThrottlePolicy::default_three_tier); + let gate = Box::new(DefaultDispatchGate::new(pressure, policy)); + + let app_state = Arc::new(crate::registry::StateMap::from_entries( + self.app_state_entries, + )); + + let scheduler = + Scheduler::with_gate(store, self.config, Arc::new(registry), gate, app_state); + + // Set up resource monitoring. + if self.enable_resource_monitoring { + #[cfg(feature = "sysinfo-monitor")] + let sampler: Box = self + .custom_sampler + .unwrap_or_else(|| crate::resource::platform_sampler()); + + #[cfg(not(feature = "sysinfo-monitor"))] + let sampler: Box = self + .custom_sampler + .expect("resource monitoring enabled but no custom sampler provided and sysinfo-monitor feature is disabled"); + + let reader = SmoothedReader::new(); + scheduler + .set_resource_reader(Arc::new(reader.clone())) + .await; + + // Spawn sampler loop — it will stop when the scheduler's sampler_token is cancelled. 
+ let sampler_arc = Arc::new(tokio::sync::Mutex::new(sampler)); + let sampler_config = self.sampler_config; + let sampler_token = scheduler.inner.sampler_token.clone(); + tokio::spawn(crate::resource::sampler::run_sampler( + sampler_arc, + reader, + sampler_config, + sampler_token, + )); + } + + Ok(scheduler) + } +} + +impl Default for SchedulerBuilder { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::registry::{TaskContext, TaskExecutor}; + use crate::task::{TaskError, TaskResult}; + + struct InstantExecutor; + + impl TaskExecutor for InstantExecutor { + async fn execute<'a>(&'a self, _ctx: &'a TaskContext) -> Result { + Ok(TaskResult { + actual_read_bytes: 100, + actual_write_bytes: 50, + }) + } + } + + struct SlowExecutor; + + impl TaskExecutor for SlowExecutor { + async fn execute<'a>(&'a self, ctx: &'a TaskContext) -> Result { + tokio::select! { + _ = ctx.token.cancelled() => { + Err(TaskError { + message: "cancelled".into(), + retryable: false, + actual_read_bytes: 0, + actual_write_bytes: 0, + }) + } + _ = tokio::time::sleep(Duration::from_secs(60)) => { + Ok(TaskResult { + actual_read_bytes: 100, + actual_write_bytes: 50, + }) + } + } + } + } + + #[allow(dead_code)] + struct FailingExecutor; + + impl TaskExecutor for FailingExecutor { + async fn execute<'a>(&'a self, _ctx: &'a TaskContext) -> Result { + Err(TaskError { + message: "boom".into(), + retryable: true, + actual_read_bytes: 0, + actual_write_bytes: 0, + }) + } + } + + async fn setup(executor: Arc) -> Scheduler { + let store = TaskStore::open_memory().await.unwrap(); + let mut registry = TaskTypeRegistry::new(); + registry.register_erased("test", executor); + + Scheduler::new( + store, + SchedulerConfig::default(), + Arc::new(registry), + CompositePressure::new(), + ThrottlePolicy::default_three_tier(), + ) + } + + fn arc_erased(e: E) -> Arc { + Arc::new(e) as Arc + } + + #[tokio::test] + async fn dispatch_executes_task() { + let sched = 
setup(arc_erased(InstantExecutor)).await; + + sched + .submit(&TaskSubmission { + task_type: "test".into(), + key: Some("k1".into()), + priority: Priority::NORMAL, + payload: None, + expected_read_bytes: 0, + expected_write_bytes: 0, + }) + .await + .unwrap(); + + let dispatched = sched.try_dispatch().await.unwrap(); + assert!(dispatched); + + // Give spawned task time to complete. + tokio::time::sleep(Duration::from_millis(50)).await; + + // Task should be completed and in history. + let k1 = crate::task::generate_dedup_key("test", Some(b"k1")); + assert!(sched.store().task_by_key(&k1).await.unwrap().is_none()); + let hist = sched.store().history_by_key(&k1).await.unwrap(); + assert_eq!(hist.len(), 1); + } + + #[tokio::test] + async fn dispatch_returns_false_when_empty() { + let sched = setup(arc_erased(InstantExecutor)).await; + let dispatched = sched.try_dispatch().await.unwrap(); + assert!(!dispatched); + } + + #[tokio::test] + async fn unregistered_type_fails_task() { + let store = TaskStore::open_memory().await.unwrap(); + let registry = TaskTypeRegistry::new(); // empty — no executors + + let sched = Scheduler::new( + store, + SchedulerConfig::default(), + Arc::new(registry), + CompositePressure::new(), + ThrottlePolicy::default_three_tier(), + ); + + sched + .submit(&TaskSubmission { + task_type: "unknown".into(), + key: Some("k".into()), + priority: Priority::NORMAL, + payload: None, + expected_read_bytes: 0, + expected_write_bytes: 0, + }) + .await + .unwrap(); + + sched.try_dispatch().await.unwrap(); + tokio::time::sleep(Duration::from_millis(50)).await; + + let failed = sched.store().failed_tasks(10).await.unwrap(); + assert_eq!(failed.len(), 1); + } + + #[tokio::test] + async fn dedup_via_scheduler() { + let sched = setup(arc_erased(InstantExecutor)).await; + + let sub = TaskSubmission { + task_type: "test".into(), + key: Some("dup".into()), + priority: Priority::NORMAL, + payload: None, + expected_read_bytes: 0, + expected_write_bytes: 0, + }; + + let 
first = sched.submit(&sub).await.unwrap(); + let second = sched.submit(&sub).await.unwrap(); + assert!(first.is_inserted()); + assert_eq!(second, SubmitOutcome::Duplicate); + } + + #[tokio::test] + async fn set_max_concurrency_works() { + let sched = setup(arc_erased(InstantExecutor)).await; + assert_eq!(sched.max_concurrency(), 4); + sched.set_max_concurrency(8); + assert_eq!(sched.max_concurrency(), 8); + } + + #[tokio::test] + async fn cancel_pending_task() { + let sched = setup(arc_erased(InstantExecutor)).await; + + let id = sched + .submit(&TaskSubmission { + task_type: "test".into(), + key: Some("cancel-me".into()), + priority: Priority::NORMAL, + payload: None, + expected_read_bytes: 0, + expected_write_bytes: 0, + }) + .await + .unwrap() + .id() + .unwrap(); + + let cancelled = sched.cancel(id).await.unwrap(); + assert!(cancelled); + + // Task should be gone. + let cancel_key = crate::task::generate_dedup_key("test", Some(b"cancel-me")); + assert!(sched + .store() + .task_by_key(&cancel_key) + .await + .unwrap() + .is_none()); + } + + #[tokio::test] + async fn cancel_running_task() { + let sched = setup(arc_erased(SlowExecutor)).await; + + let id = sched + .submit(&TaskSubmission { + task_type: "test".into(), + key: Some("cancel-running".into()), + priority: Priority::NORMAL, + payload: None, + expected_read_bytes: 0, + expected_write_bytes: 0, + }) + .await + .unwrap() + .id() + .unwrap(); + + // Dispatch it so it's running. 
+ sched.try_dispatch().await.unwrap(); + tokio::time::sleep(Duration::from_millis(10)).await; + + let cancelled = sched.cancel(id).await.unwrap(); + assert!(cancelled); + } + + #[tokio::test] + async fn event_emitted_on_complete() { + let sched = setup(arc_erased(InstantExecutor)).await; + let mut rx = sched.subscribe(); + + sched + .submit(&TaskSubmission { + task_type: "test".into(), + key: Some("evt".into()), + priority: Priority::NORMAL, + payload: None, + expected_read_bytes: 0, + expected_write_bytes: 0, + }) + .await + .unwrap(); + + sched.try_dispatch().await.unwrap(); + + // Should get Dispatched event. + let evt = rx.recv().await.unwrap(); + assert!(matches!(evt, SchedulerEvent::Dispatched { .. })); + + // Wait for completion. + tokio::time::sleep(Duration::from_millis(50)).await; + + let evt = rx.recv().await.unwrap(); + assert!(matches!(evt, SchedulerEvent::Completed { .. })); + } + + #[tokio::test] + async fn scheduler_is_clone() { + let sched = setup(arc_erased(InstantExecutor)).await; + let sched2 = sched.clone(); + + // Both should share the same store. + sched + .submit(&TaskSubmission { + task_type: "test".into(), + key: Some("shared".into()), + priority: Priority::NORMAL, + payload: None, + expected_read_bytes: 0, + expected_write_bytes: 0, + }) + .await + .unwrap(); + + // The clone can see the task. 
+ let shared_key = crate::task::generate_dedup_key("test", Some(b"shared")); + let task = sched2.store().task_by_key(&shared_key).await.unwrap(); + assert!(task.is_some()); + } + + #[tokio::test] + async fn submit_typed_enqueues_task() { + use serde::{Deserialize as De, Serialize as Ser}; + + #[derive(Ser, De, Debug, PartialEq)] + struct Thumb { + path: String, + } + + impl crate::task::TypedTask for Thumb { + const TASK_TYPE: &'static str = "test"; + + fn expected_read_bytes(&self) -> i64 { + 4096 + } + + fn expected_write_bytes(&self) -> i64 { + 512 + } + } + + let sched = setup(arc_erased(InstantExecutor)).await; + + let task = Thumb { + path: "/a.jpg".into(), + }; + let outcome = sched.submit_typed(&task).await.unwrap(); + assert!(outcome.is_inserted()); + + // Verify the stored record has correct metadata. + let record = sched + .store() + .task_by_id(outcome.id().unwrap()) + .await + .unwrap() + .expect("task should exist"); + assert_eq!(record.task_type, "test"); + assert_eq!(record.expected_read_bytes, 4096); + assert_eq!(record.expected_write_bytes, 512); + + // Payload round-trips. + let recovered: Thumb = record.deserialize_payload().unwrap().unwrap(); + assert_eq!(recovered, task); + } + + #[tokio::test] + async fn snapshot_returns_dashboard_state() { + let sched = setup(arc_erased(SlowExecutor)).await; + + // Submit two tasks. + for key in &["snap-a", "snap-b"] { + sched + .submit(&TaskSubmission { + task_type: "test".into(), + key: Some(key.to_string()), + priority: Priority::NORMAL, + payload: None, + expected_read_bytes: 0, + expected_write_bytes: 0, + }) + .await + .unwrap(); + } + + // Dispatch one so it becomes running. 
+ sched.try_dispatch().await.unwrap(); + tokio::time::sleep(Duration::from_millis(10)).await; + + let snap = sched.snapshot().await.unwrap(); + + assert_eq!(snap.running.len(), 1); + assert_eq!(snap.pending_count, 1); + assert_eq!(snap.paused_count, 0); + assert_eq!(snap.progress.len(), 1); + assert_eq!(snap.pressure, 0.0); // no pressure sources + assert!(snap.pressure_breakdown.is_empty()); + assert_eq!(snap.max_concurrency, 4); + } + + #[tokio::test] + async fn pause_all_stops_dispatching() { + let sched = setup(arc_erased(SlowExecutor)).await; + + // Submit two tasks. + for key in &["pa-1", "pa-2"] { + sched + .submit(&TaskSubmission { + task_type: "test".into(), + key: Some(key.to_string()), + priority: Priority::NORMAL, + payload: None, + expected_read_bytes: 0, + expected_write_bytes: 0, + }) + .await + .unwrap(); + } + + // Dispatch one so it's running. + sched.try_dispatch().await.unwrap(); + tokio::time::sleep(Duration::from_millis(10)).await; + assert_eq!(sched.active_tasks().await.len(), 1); + + // Pause — running task should be cancelled and moved to paused in store. + sched.pause_all().await; + assert!(sched.is_paused()); + assert_eq!(sched.active_tasks().await.len(), 0); + + // try_dispatch should still work at the store level (it doesn't check + // the pause flag itself — the run loop does), but we can verify that + // the snapshot shows is_paused. + let snap = sched.snapshot().await.unwrap(); + assert!(snap.is_paused); + + // Resume — flag should clear. 
+ sched.resume_all().await; + assert!(!sched.is_paused()); + let snap = sched.snapshot().await.unwrap(); + assert!(!snap.is_paused); + } + + #[tokio::test] + async fn pause_resume_events_emitted() { + let sched = setup(arc_erased(InstantExecutor)).await; + let mut rx = sched.subscribe(); + + sched.pause_all().await; + let evt = rx.recv().await.unwrap(); + assert!(matches!(evt, SchedulerEvent::Paused)); + + sched.resume_all().await; + let evt = rx.recv().await.unwrap(); + assert!(matches!(evt, SchedulerEvent::Resumed)); + } + + #[tokio::test] + async fn app_state_accessible_from_executor() { + use std::sync::atomic::{AtomicBool, Ordering}; + + struct MyState { + flag: Arc, + } + + struct StateCheckExecutor; + + impl TaskExecutor for StateCheckExecutor { + async fn execute<'a>(&'a self, ctx: &'a TaskContext) -> Result { + let state = ctx.state::().expect("state should be set"); + state.flag.store(true, Ordering::SeqCst); + Ok(TaskResult { + actual_read_bytes: 0, + actual_write_bytes: 0, + }) + } + } + + let flag = Arc::new(AtomicBool::new(false)); + + let sched = Scheduler::builder() + .store(TaskStore::open_memory().await.unwrap()) + .executor("test", Arc::new(StateCheckExecutor)) + .app_state(MyState { flag: flag.clone() }) + .build() + .await + .unwrap(); + + sched + .submit(&TaskSubmission { + task_type: "test".into(), + key: Some("state-test".into()), + priority: Priority::NORMAL, + payload: None, + expected_read_bytes: 0, + expected_write_bytes: 0, + }) + .await + .unwrap(); + + sched.try_dispatch().await.unwrap(); + tokio::time::sleep(Duration::from_millis(50)).await; + + assert!(flag.load(Ordering::SeqCst)); + } + + #[tokio::test] + async fn task_lookup_pending() { + let sched = setup(arc_erased(InstantExecutor)).await; + + sched + .submit(&TaskSubmission { + task_type: "test".into(), + key: Some("lookup-1".into()), + priority: Priority::NORMAL, + payload: None, + expected_read_bytes: 0, + expected_write_bytes: 0, + }) + .await + .unwrap(); + + let result = 
sched.task_lookup("test", Some(b"lookup-1")).await.unwrap(); + assert!(matches!( + result, + crate::task::TaskLookup::Active(ref r) if r.status == crate::task::TaskStatus::Pending + )); + } + + #[tokio::test] + async fn task_lookup_completed() { + let sched = setup(arc_erased(InstantExecutor)).await; + + sched + .submit(&TaskSubmission { + task_type: "test".into(), + key: Some("lookup-done".into()), + priority: Priority::NORMAL, + payload: None, + expected_read_bytes: 0, + expected_write_bytes: 0, + }) + .await + .unwrap(); + + sched.try_dispatch().await.unwrap(); + tokio::time::sleep(Duration::from_millis(50)).await; + + let result = sched + .task_lookup("test", Some(b"lookup-done")) + .await + .unwrap(); + assert!(matches!(result, crate::task::TaskLookup::History(_))); + } + + #[tokio::test] + async fn task_lookup_not_found() { + let sched = setup(arc_erased(InstantExecutor)).await; + let result = sched + .task_lookup("test", Some(b"does-not-exist")) + .await + .unwrap(); + assert!(matches!(result, crate::task::TaskLookup::NotFound)); + } + + #[tokio::test] + async fn lookup_typed_works() { + use serde::{Deserialize as De, Serialize as Ser}; + + #[derive(Ser, De, Debug, PartialEq)] + struct Thumb { + path: String, + } + + impl crate::task::TypedTask for Thumb { + const TASK_TYPE: &'static str = "test"; + } + + let sched = setup(arc_erased(InstantExecutor)).await; + + let task = Thumb { + path: "/a.jpg".into(), + }; + sched.submit_typed(&task).await.unwrap(); + + let result = sched.lookup_typed(&task).await.unwrap(); + assert!(matches!(result, crate::task::TaskLookup::Active(_))); + } +} diff --git a/src/scheduler/progress.rs b/src/scheduler/progress.rs new file mode 100644 index 0000000..bee4b13 --- /dev/null +++ b/src/scheduler/progress.rs @@ -0,0 +1,132 @@ +use serde::{Deserialize, Serialize}; + +use crate::store::TaskStore; +use crate::task::TaskRecord; + +use super::SchedulerEvent; + +// ── Progress Reporter ────────────────────────────────────────────── + 
+/// Handle passed to executors for reporting progress back to the scheduler.
+///
+/// Progress reports are emitted as `SchedulerEvent::Progress` events,
+/// making them available to the UI via the same broadcast channel.
+#[derive(Clone)]
+pub struct ProgressReporter {
+    task_id: i64,
+    task_type: String,
+    key: String,
+    event_tx: tokio::sync::broadcast::Sender<SchedulerEvent>,
+}
+
+impl ProgressReporter {
+    /// Create a reporter bound to one task and to the channel its progress
+    /// events are sent on.
+    pub(crate) fn new(
+        task_id: i64,
+        task_type: String,
+        key: String,
+        event_tx: tokio::sync::broadcast::Sender<SchedulerEvent>,
+    ) -> Self {
+        Self {
+            task_id,
+            task_type,
+            key,
+            event_tx,
+        }
+    }
+
+    /// Report progress as a percentage (0.0 to 1.0) with an optional message.
+    ///
+    /// Values outside the range are clamped; NaN is reported as 0.0.
+    pub fn report(&self, percent: f32, message: Option<String>) {
+        // `f32::clamp` returns NaN for a NaN input, so handle it explicitly
+        // instead of broadcasting a meaningless percentage.
+        let percent = if percent.is_nan() {
+            0.0
+        } else {
+            percent.clamp(0.0, 1.0)
+        };
+        // Progress is best-effort: the send result is deliberately ignored.
+        let _ = self.event_tx.send(SchedulerEvent::Progress {
+            task_id: self.task_id,
+            task_type: self.task_type.clone(),
+            key: self.key.clone(),
+            percent,
+            message,
+        });
+    }
+
+    /// Report progress as a fraction (completed / total) with an optional message.
+    ///
+    /// A `total` of zero is reported as fully complete (1.0).
+    pub fn report_fraction(&self, completed: u64, total: u64, message: Option<String>) {
+        let percent = if total == 0 {
+            1.0
+        } else {
+            // Divide in f64 so large byte counts keep their precision before
+            // the result is narrowed to f32.
+            (completed as f64 / total as f64) as f32
+        };
+        self.report(percent, message);
+    }
+}
+
+// ── Estimated Progress ─────────────────────────────────────────────
+
+/// Estimated progress for a running task, combining executor-reported progress
+/// with throughput-based extrapolation.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct EstimatedProgress {
+    pub task_id: i64,
+    pub task_type: String,
+    pub key: String,
+    /// Executor-reported progress (0.0 to 1.0), if available.
+    pub reported_percent: Option<f32>,
+    /// Throughput-extrapolated progress (0.0 to 1.0), if history data exists.
+    pub extrapolated_percent: Option<f32>,
+    /// Best available progress estimate.
+    pub percent: f32,
+}
+
+/// Extrapolate progress for a single active task using historical throughput.
+/// +/// Blends executor-reported progress with time-based extrapolation from +/// `store.history_stats()`. This is a pure query — no side effects. +pub(crate) async fn extrapolate( + record: &TaskRecord, + reported_progress: Option, + reported_at: Option>, + store: &TaskStore, +) -> EstimatedProgress { + let reported = reported_progress; + + let extrapolated = if let Some(started) = record.started_at { + let now = chrono::Utc::now(); + if let Ok(stats) = store.history_stats(&record.task_type).await { + if stats.avg_duration_ms > 0.0 { + // Historical throughput: fraction of work completed per ms. + let hist_throughput = 1.0 / stats.avg_duration_ms; + + match (reported, reported_at) { + // We have a progress anchor — blend throughputs and + // extrapolate from the last report. + (Some(rp), Some(rat)) => { + let elapsed_to_report = (rat - started).num_milliseconds().max(1) as f64; + let current_throughput = rp as f64 / elapsed_to_report; + let blended = (hist_throughput + current_throughput) / 2.0; + let since_report = (now - rat).num_milliseconds().max(0) as f64; + Some((rp as f64 + blended * since_report).min(0.99) as f32) + } + // No report yet — pure time-based extrapolation. + _ => { + let elapsed_ms = (now - started).num_milliseconds() as f64; + Some((elapsed_ms * hist_throughput).min(0.99) as f32) + } + } + } else { + None + } + } else { + None + } + } else { + None + }; + + // Best estimate: prefer reported, fall back to extrapolated, then 0. 
+ let percent = reported.or(extrapolated).unwrap_or(0.0); + + EstimatedProgress { + task_id: record.id, + task_type: record.task_type.clone(), + key: record.key.clone(), + reported_percent: reported, + extrapolated_percent: extrapolated, + percent, + } +} diff --git a/src/store.rs b/src/store.rs new file mode 100644 index 0000000..12dd188 --- /dev/null +++ b/src/store.rs @@ -0,0 +1,1732 @@ +use std::sync::atomic::{AtomicU64, Ordering}; + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use sqlx::sqlite::{SqliteConnectOptions, SqliteJournalMode, SqlitePoolOptions, SqliteSynchronous}; +use sqlx::{Row, SqlitePool}; + +use crate::priority::Priority; +use crate::task::{ + HistoryStatus, SubmitOutcome, TaskHistoryRecord, TaskLookup, TaskRecord, TaskResult, + TaskStatus, TaskSubmission, TypeStats, MAX_PAYLOAD_BYTES, +}; + +/// Serde-friendly error type for Tauri IPC and API boundaries. +/// +/// Wraps the internal `sqlx::Error` into a serializable form so that +/// callers do not need manual conversion at every call site. +#[derive(Debug, Clone, Serialize, Deserialize, thiserror::Error)] +pub enum StoreError { + #[error("payload exceeds maximum size of {MAX_PAYLOAD_BYTES} bytes")] + PayloadTooLarge, + #[error("serialization error: {0}")] + Serialization(String), + #[error("database error: {0}")] + Database(String), +} + +impl From for StoreError { + fn from(e: sqlx::Error) -> Self { + StoreError::Database(e.to_string()) + } +} + +impl From for StoreError { + fn from(e: serde_json::Error) -> Self { + StoreError::Serialization(e.to_string()) + } +} + +/// History retention policy for automatic pruning of old records. +/// +/// Applied during `complete()` and `fail()` to keep the `task_history` +/// table bounded. Set to `None` to disable auto-pruning. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum RetentionPolicy { + /// Keep at most this many history records (oldest pruned first). + MaxCount(i64), + /// Keep records from the last N days. 
+ MaxAgeDays(i64), +} + +/// Configuration for the SQLite connection pool. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct StoreConfig { + /// Maximum number of connections in the pool. + /// + /// Higher values reduce contention when multiple Tauri commands and + /// background tasks access the store concurrently. Setting this too + /// high on a single SQLite file provides diminishing returns since + /// SQLite serializes writes. + /// + /// Default: 16. + pub max_connections: u32, + + /// Optional retention policy for automatic history pruning. + /// + /// When set, completed/failed tasks are pruned during `complete()` and + /// `fail()` to keep the history table bounded. + pub retention_policy: Option, + + /// How many completions between automatic prune runs. + /// + /// Pruning runs once every `prune_interval` calls to `complete()` or + /// `fail()` instead of on every call. Default: 100. + pub prune_interval: u64, +} + +impl Default for StoreConfig { + fn default() -> Self { + Self { + max_connections: 16, + retention_policy: None, + prune_interval: 100, + } + } +} + +/// SQLite-backed persistence layer for the task queue and history. +#[derive(Clone)] +pub struct TaskStore { + pool: SqlitePool, + retention_policy: Option, + prune_interval: u64, + completion_count: std::sync::Arc, +} + +impl TaskStore { + /// Open (or create) a taskmill database at the given path with default config. + pub async fn open(path: &str) -> Result { + Self::open_with_config(path, StoreConfig::default()).await + } + + /// Open (or create) a taskmill database at the given path with custom config. 
+    pub async fn open_with_config(path: &str, config: StoreConfig) -> Result<Self, StoreError> {
+        let opts = SqliteConnectOptions::new()
+            .filename(path)
+            .create_if_missing(true)
+            .journal_mode(SqliteJournalMode::Wal)
+            .synchronous(SqliteSynchronous::Normal)
+            .busy_timeout(std::time::Duration::from_secs(5));
+
+        let pool = SqlitePoolOptions::new()
+            .max_connections(config.max_connections)
+            .connect_with(opts)
+            .await?;
+
+        let store = Self {
+            pool,
+            retention_policy: config.retention_policy,
+            prune_interval: config.prune_interval,
+            completion_count: std::sync::Arc::new(AtomicU64::new(0)),
+        };
+        // Schema first, then put tasks left `running` by a previous process
+        // back into the queue.
+        store.migrate().await?;
+        store.recover_running().await?;
+        Ok(store)
+    }
+
+    /// Open an in-memory database (for testing).
+    ///
+    /// Uses a single pooled connection, no retention policy and the default
+    /// prune interval of 100. Unlike [`open_with_config`](Self::open_with_config)
+    /// it does not run restart recovery.
+    pub async fn open_memory() -> Result<Self, StoreError> {
+        let opts = SqliteConnectOptions::new()
+            .filename(":memory:")
+            .journal_mode(SqliteJournalMode::Wal)
+            .synchronous(SqliteSynchronous::Normal)
+            .busy_timeout(std::time::Duration::from_secs(5));
+
+        // NOTE(review): presumably limited to one connection because each
+        // SQLite connection to `:memory:` opens its own private database —
+        // confirm before raising this.
+        let pool = SqlitePoolOptions::new()
+            .max_connections(1)
+            .connect_with(opts)
+            .await?;
+
+        let store = Self {
+            pool,
+            retention_policy: None,
+            prune_interval: 100,
+            completion_count: std::sync::Arc::new(AtomicU64::new(0)),
+        };
+        store.migrate().await?;
+        Ok(store)
+    }
+
+    /// Run the migration SQL.
+    ///
+    /// The script is embedded at compile time from `migrations/001_tasks.sql`
+    /// and executed on every open.
+    async fn migrate(&self) -> Result<(), StoreError> {
+        sqlx::raw_sql(include_str!("../migrations/001_tasks.sql"))
+            .execute(&self.pool)
+            .await?;
+        Ok(())
+    }
+
+    /// Restart recovery: reset any `running` tasks back to `pending`.
+    ///
+    /// Also clears `started_at`, and logs how many tasks were recovered when
+    /// there were any.
+    async fn recover_running(&self) -> Result<(), StoreError> {
+        let result = sqlx::query(
+            "UPDATE tasks SET status = 'pending', started_at = NULL WHERE status = 'running'",
+        )
+        .execute(&self.pool)
+        .await?;
+        let count = result.rows_affected();
+        if count > 0 {
+            tracing::info!(count, "recovered interrupted tasks back to pending");
+        }
+        Ok(())
+    }
+
+    /// Get a reference to the underlying connection pool.
+    pub fn pool(&self) -> &SqlitePool {
+        &self.pool
+    }
+
+    /// Begin an IMMEDIATE transaction for write operations.
+    ///
+    /// Unlike `pool.begin()` which uses `BEGIN DEFERRED`, this acquires the
+    /// write lock upfront. This prevents deadlocks when multiple transactions
+    /// read-then-write concurrently — the busy_timeout is properly honored
+    /// instead of SQLite returning SQLITE_BUSY immediately.
+    ///
+    /// The returned connection auto-rollbacks on drop (sqlx resets pooled
+    /// connections with open transactions).
+    ///
+    /// NOTE(review): the transaction is started with a raw `BEGIN IMMEDIATE`
+    /// statement rather than through sqlx's transaction API — confirm that the
+    /// pool really rolls such a transaction back when the connection is
+    /// dropped. Until verified, callers should end the transaction explicitly
+    /// (`COMMIT` or `ROLLBACK`) on every path.
+    async fn begin_write(&self) -> Result<sqlx::pool::PoolConnection<sqlx::Sqlite>, StoreError> {
+        let mut conn = self.pool.acquire().await?;
+        sqlx::query("BEGIN IMMEDIATE").execute(&mut *conn).await?;
+        Ok(conn)
+    }
+
+    // ── Submit ──────────────────────────────────────────────────────
+
+    /// Submit a new task.
+    ///
+    /// Returns [`SubmitOutcome::Inserted`] if the task was enqueued,
+    /// [`SubmitOutcome::Upgraded`] if a duplicate existed but its priority
+    /// was upgraded, or [`SubmitOutcome::Duplicate`] if a duplicate existed
+    /// with equal or higher priority.
+    ///
+    /// When `sub.key` is `None`, the dedup key is auto-generated by hashing
+    /// the task type and payload.
+ pub async fn submit(&self, sub: &TaskSubmission) -> Result { + if let Some(ref p) = sub.payload { + if p.len() > MAX_PAYLOAD_BYTES { + return Err(StoreError::PayloadTooLarge); + } + } + + let key = sub.effective_key(); + let priority = sub.priority.value() as i32; + tracing::debug!(task_type = %sub.task_type, "store.submit: INSERT start"); + let result = sqlx::query( + "INSERT OR IGNORE INTO tasks (task_type, key, priority, payload, expected_read_bytes, expected_write_bytes) + VALUES (?, ?, ?, ?, ?, ?)", + ) + .bind(&sub.task_type) + .bind(&key) + .bind(priority) + .bind(&sub.payload) + .bind(sub.expected_read_bytes) + .bind(sub.expected_write_bytes) + .execute(&self.pool) + .await?; + tracing::debug!(task_type = %sub.task_type, "store.submit: INSERT end"); + + if result.rows_affected() > 0 { + return Ok(SubmitOutcome::Inserted(result.last_insert_rowid())); + } + + // Dedup hit — try to upgrade priority on pending/paused tasks. + // Lower numeric value = higher priority, so `priority > ?` means + // the existing task has lower importance than the new submission. + let row = sqlx::query( + "UPDATE tasks SET priority = ? + WHERE key = ? AND status IN ('pending', 'paused') AND priority > ? + RETURNING id", + ) + .bind(priority) + .bind(&key) + .bind(priority) + .fetch_optional(&self.pool) + .await?; + + if let Some(r) = row { + return Ok(SubmitOutcome::Upgraded(r.get("id"))); + } + + // Dedup hit on running/paused task — mark for re-queue so the task + // runs again after the current execution completes. + let row = sqlx::query( + "UPDATE tasks SET requeue = 1, requeue_priority = ? + WHERE key = ? AND status IN ('running', 'paused') + AND (requeue = 0 OR requeue_priority > ?) + RETURNING id", + ) + .bind(priority) + .bind(&key) + .bind(priority) + .fetch_optional(&self.pool) + .await?; + + match row { + Some(r) => Ok(SubmitOutcome::Requeued(r.get("id"))), + None => Ok(SubmitOutcome::Duplicate), + } + } + + /// Submit multiple tasks in a single transaction. 
Returns a `Vec` with one + /// [`SubmitOutcome`] per input. + /// + /// This is significantly faster than calling [`submit`](Self::submit) in a + /// loop because all inserts share a single SQLite transaction (one + /// `BEGIN`/`COMMIT` pair instead of N implicit transactions). + pub async fn submit_batch( + &self, + submissions: &[TaskSubmission], + ) -> Result, StoreError> { + // Pre-validate all payloads before starting the transaction + // to avoid partial inserts on validation errors. + for sub in submissions { + if let Some(ref p) = sub.payload { + if p.len() > MAX_PAYLOAD_BYTES { + return Err(StoreError::PayloadTooLarge); + } + } + } + + let mut results = Vec::with_capacity(submissions.len()); + + let mut conn = self.begin_write().await?; + + for sub in submissions { + let key = sub.effective_key(); + let priority = sub.priority.value() as i32; + let result = sqlx::query( + "INSERT OR IGNORE INTO tasks (task_type, key, priority, payload, expected_read_bytes, expected_write_bytes) + VALUES (?, ?, ?, ?, ?, ?)", + ) + .bind(&sub.task_type) + .bind(&key) + .bind(priority) + .bind(&sub.payload) + .bind(sub.expected_read_bytes) + .bind(sub.expected_write_bytes) + .execute(&mut *conn) + .await?; + + if result.rows_affected() > 0 { + results.push(SubmitOutcome::Inserted(result.last_insert_rowid())); + } else { + // Dedup hit — try to upgrade priority on pending/paused tasks. + let row = sqlx::query( + "UPDATE tasks SET priority = ? + WHERE key = ? AND status IN ('pending', 'paused') AND priority > ? + RETURNING id", + ) + .bind(priority) + .bind(&key) + .bind(priority) + .fetch_optional(&mut *conn) + .await?; + + if let Some(r) = row { + results.push(SubmitOutcome::Upgraded(r.get("id"))); + } else { + // Try requeue on running/paused tasks. + let row = sqlx::query( + "UPDATE tasks SET requeue = 1, requeue_priority = ? + WHERE key = ? AND status IN ('running', 'paused') + AND (requeue = 0 OR requeue_priority > ?) 
+ RETURNING id", + ) + .bind(priority) + .bind(&key) + .bind(priority) + .fetch_optional(&mut *conn) + .await?; + + match row { + Some(r) => results.push(SubmitOutcome::Requeued(r.get("id"))), + None => results.push(SubmitOutcome::Duplicate), + } + } + } + } + + sqlx::query("COMMIT").execute(&mut *conn).await?; + Ok(results) + } + + // ── Pop / lifecycle ───────────────────────────────────────────── + + /// Peek at the highest-priority pending task without modifying it. + /// Returns `None` if the queue is empty. + pub async fn peek_next(&self) -> Result, StoreError> { + let row = sqlx::query( + "SELECT * FROM tasks + WHERE status = 'pending' + ORDER BY priority ASC, id ASC + LIMIT 1", + ) + .fetch_optional(&self.pool) + .await?; + + Ok(row.as_ref().map(row_to_task_record)) + } + + /// Atomically claim a specific pending task by id, setting it to running. + /// Returns `None` if the task is no longer pending (e.g. claimed by another + /// dispatcher or cancelled). + pub async fn pop_by_id(&self, id: i64) -> Result, StoreError> { + tracing::debug!(task_id = id, "store.pop_by_id: UPDATE start"); + let row = sqlx::query( + "UPDATE tasks SET status = 'running', started_at = datetime('now') + WHERE id = ? AND status = 'pending' + RETURNING *", + ) + .bind(id) + .fetch_optional(&self.pool) + .await?; + tracing::debug!(task_id = id, "store.pop_by_id: UPDATE end"); + + Ok(row.as_ref().map(row_to_task_record)) + } + + /// Pop the highest-priority pending task and mark it as running. + /// Returns `None` if the queue is empty. + pub async fn pop_next(&self) -> Result, StoreError> { + // Single atomic statement: find + update + return. 
+ let row = sqlx::query( + "UPDATE tasks SET status = 'running', started_at = datetime('now') + WHERE id = ( + SELECT id FROM tasks + WHERE status = 'pending' + ORDER BY priority ASC, id ASC + LIMIT 1 + ) + RETURNING *", + ) + .fetch_optional(&self.pool) + .await?; + + Ok(row.map(|r| row_to_task_record(&r))) + } + + /// Atomically requeue a running task back to pending. + /// + /// Used when a task is popped but then rejected by backpressure or IO + /// budget checks. Unlike pause+resume, this is a single atomic operation + /// that never puts the task in an intermediate state visible to queries. + pub async fn requeue(&self, id: i64) -> Result<(), StoreError> { + sqlx::query( + "UPDATE tasks SET status = 'pending', started_at = NULL WHERE id = ? AND status = 'running'", + ) + .bind(id) + .execute(&self.pool) + .await?; + Ok(()) + } + + /// Mark a task as completed and move it to history. + pub async fn complete(&self, id: i64, result: &TaskResult) -> Result<(), StoreError> { + tracing::debug!(task_id = id, "store.complete: BEGIN tx"); + let mut conn = self.begin_write().await?; + + // Fetch the task to move. + let row = sqlx::query("SELECT * FROM tasks WHERE id = ?") + .bind(id) + .fetch_optional(&mut *conn) + .await?; + + let Some(row) = row else { return Ok(()) }; + let task = row_to_task_record(&row); + + // Compute duration. + let duration_ms: Option = if task.started_at.is_some() { + sqlx::query_scalar( + "SELECT CAST((julianday('now') - julianday(?)) * 86400000 AS INTEGER)", + ) + .bind( + task.started_at + .map(|dt| dt.format("%Y-%m-%d %H:%M:%S").to_string()), + ) + .fetch_one(&mut *conn) + .await? + } else { + None + }; + + // Insert into history. 
+ sqlx::query( + "INSERT INTO task_history (task_type, key, priority, status, payload, + expected_read_bytes, expected_write_bytes, actual_read_bytes, actual_write_bytes, + retry_count, last_error, created_at, started_at, duration_ms) + VALUES (?, ?, ?, 'completed', ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + ) + .bind(&task.task_type) + .bind(&task.key) + .bind(task.priority.value() as i32) + .bind(&task.payload) + .bind(task.expected_read_bytes) + .bind(task.expected_write_bytes) + .bind(result.actual_read_bytes) + .bind(result.actual_write_bytes) + .bind(task.retry_count) + .bind(&task.last_error) + .bind(task.created_at.format("%Y-%m-%d %H:%M:%S").to_string()) + .bind( + task.started_at + .map(|dt| dt.format("%Y-%m-%d %H:%M:%S").to_string()), + ) + .bind(duration_ms) + .execute(&mut *conn) + .await?; + + if task.requeue { + // Requeue flag set — reset to pending with requeue_priority + // instead of removing from the active queue. + let requeue_priority = task + .requeue_priority + .map(|p| p.value() as i32) + .unwrap_or(task.priority.value() as i32); + sqlx::query( + "UPDATE tasks SET status = 'pending', priority = ?, + started_at = NULL, retry_count = 0, last_error = NULL, + requeue = 0, requeue_priority = NULL + WHERE id = ?", + ) + .bind(requeue_priority) + .bind(id) + .execute(&mut *conn) + .await?; + } else { + // Remove from active queue. + sqlx::query("DELETE FROM tasks WHERE id = ?") + .bind(id) + .execute(&mut *conn) + .await?; + } + + sqlx::query("COMMIT").execute(&mut *conn).await?; + tracing::debug!(task_id = id, "store.complete: COMMIT ok"); + + self.maybe_prune().await; + + Ok(()) + } + + /// Mark a task as failed. If `retryable` and under max retries, requeue + /// it as pending with the same priority. Otherwise move to history as failed. 
+ pub async fn fail( + &self, + id: i64, + error: &str, + retryable: bool, + max_retries: i32, + actual_read_bytes: i64, + actual_write_bytes: i64, + ) -> Result<(), StoreError> { + tracing::debug!(task_id = id, "store.fail: BEGIN tx"); + let mut conn = self.begin_write().await?; + tracing::debug!(task_id = id, "store.fail: BEGIN acquired"); + + let row = sqlx::query("SELECT * FROM tasks WHERE id = ?") + .bind(id) + .fetch_optional(&mut *conn) + .await?; + + let Some(row) = row else { return Ok(()) }; + let task = row_to_task_record(&row); + + if retryable && task.retry_count < max_retries { + // Requeue with incremented retry count, same priority. + sqlx::query( + "UPDATE tasks SET status = 'pending', started_at = NULL, + retry_count = retry_count + 1, last_error = ? + WHERE id = ?", + ) + .bind(error) + .bind(id) + .execute(&mut *conn) + .await?; + } else { + // Permanent failure — move to history. + let duration_ms: Option = if task.started_at.is_some() { + sqlx::query_scalar( + "SELECT CAST((julianday('now') - julianday(?)) * 86400000 AS INTEGER)", + ) + .bind( + task.started_at + .map(|dt| dt.format("%Y-%m-%d %H:%M:%S").to_string()), + ) + .fetch_one(&mut *conn) + .await? 
+ } else { + None + }; + + sqlx::query( + "INSERT INTO task_history (task_type, key, priority, status, payload, + expected_read_bytes, expected_write_bytes, actual_read_bytes, actual_write_bytes, + retry_count, last_error, created_at, started_at, duration_ms) + VALUES (?, ?, ?, 'failed', ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + ) + .bind(&task.task_type) + .bind(&task.key) + .bind(task.priority.value() as i32) + .bind(&task.payload) + .bind(task.expected_read_bytes) + .bind(task.expected_write_bytes) + .bind(actual_read_bytes) + .bind(actual_write_bytes) + .bind(task.retry_count + 1) + .bind(error) + .bind(task.created_at.format("%Y-%m-%d %H:%M:%S").to_string()) + .bind(task.started_at.map(|dt| dt.format("%Y-%m-%d %H:%M:%S").to_string())) + .bind(duration_ms) + .execute(&mut *conn) + .await?; + + sqlx::query("DELETE FROM tasks WHERE id = ?") + .bind(id) + .execute(&mut *conn) + .await?; + } + + sqlx::query("COMMIT").execute(&mut *conn).await?; + tracing::debug!(task_id = id, "store.fail: COMMIT ok"); + + self.maybe_prune().await; + + Ok(()) + } + + /// Pause a running task (for preemption). Sets status to paused. + pub async fn pause(&self, id: i64) -> Result<(), StoreError> { + sqlx::query("UPDATE tasks SET status = 'paused', started_at = NULL WHERE id = ?") + .bind(id) + .execute(&self.pool) + .await?; + Ok(()) + } + + /// Resume a paused task back to pending. + pub async fn resume(&self, id: i64) -> Result<(), StoreError> { + sqlx::query("UPDATE tasks SET status = 'pending' WHERE id = ? AND status = 'paused'") + .bind(id) + .execute(&self.pool) + .await?; + Ok(()) + } + + // ── Query: active queue ───────────────────────────────────────── + + /// All currently running tasks. + pub async fn running_tasks(&self) -> Result, StoreError> { + let rows = sqlx::query( + "SELECT * FROM tasks WHERE status = 'running' ORDER BY priority ASC, id ASC", + ) + .fetch_all(&self.pool) + .await?; + Ok(rows.iter().map(row_to_task_record).collect()) + } + + /// Count of running tasks. 
+    pub async fn running_count(&self) -> Result<i64, StoreError> {
+        let count: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM tasks WHERE status = 'running'")
+            .fetch_one(&self.pool)
+            .await?;
+        Ok(count.0)
+    }
+
+    /// Pending tasks, ordered by priority then age. Limit controls page size.
+    pub async fn pending_tasks(&self, limit: i32) -> Result<Vec<TaskRecord>, StoreError> {
+        let rows = sqlx::query(
+            "SELECT * FROM tasks WHERE status = 'pending' ORDER BY priority ASC, id ASC LIMIT ?",
+        )
+        .bind(limit)
+        .fetch_all(&self.pool)
+        .await?;
+        Ok(rows.iter().map(row_to_task_record).collect())
+    }
+
+    /// Count of pending tasks.
+    pub async fn pending_count(&self) -> Result<i64, StoreError> {
+        let count: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM tasks WHERE status = 'pending'")
+            .fetch_one(&self.pool)
+            .await?;
+        Ok(count.0)
+    }
+
+    /// Pending tasks filtered by type, ordered by priority then id.
+    pub async fn pending_by_type(&self, task_type: &str) -> Result<Vec<TaskRecord>, StoreError> {
+        let rows = sqlx::query(
+            "SELECT * FROM tasks WHERE status = 'pending' AND task_type = ? ORDER BY priority ASC, id ASC",
+        )
+        .bind(task_type)
+        .fetch_all(&self.pool)
+        .await?;
+        Ok(rows.iter().map(row_to_task_record).collect())
+    }
+
+    /// Count of paused tasks.
+    pub async fn paused_count(&self) -> Result<i64, StoreError> {
+        let count: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM tasks WHERE status = 'paused'")
+            .fetch_one(&self.pool)
+            .await?;
+        Ok(count.0)
+    }
+
+    /// Paused tasks, ordered by priority then id.
+    pub async fn paused_tasks(&self) -> Result<Vec<TaskRecord>, StoreError> {
+        let rows = sqlx::query(
+            "SELECT * FROM tasks WHERE status = 'paused' ORDER BY priority ASC, id ASC",
+        )
+        .fetch_all(&self.pool)
+        .await?;
+        Ok(rows.iter().map(row_to_task_record).collect())
+    }
+
+    /// Look up an active task by its row id. Returns `None` if no active
+    /// task with that id exists.
+ pub async fn task_by_id(&self, id: i64) -> Result, StoreError> { + let row = sqlx::query("SELECT * FROM tasks WHERE id = ?") + .bind(id) + .fetch_optional(&self.pool) + .await?; + Ok(row.as_ref().map(row_to_task_record)) + } + + /// Look up an active task by its dedup key. Returns `None` if no active + /// task with that key exists. + pub async fn task_by_key(&self, key: &str) -> Result, StoreError> { + let row = sqlx::query("SELECT * FROM tasks WHERE key = ?") + .bind(key) + .fetch_optional(&self.pool) + .await?; + Ok(row.as_ref().map(row_to_task_record)) + } + + /// Sum of expected read/write bytes for all running tasks. + pub async fn running_io_totals(&self) -> Result<(i64, i64), StoreError> { + let row: (i64, i64) = sqlx::query_as( + "SELECT COALESCE(SUM(expected_read_bytes), 0), COALESCE(SUM(expected_write_bytes), 0) + FROM tasks WHERE status = 'running'", + ) + .fetch_one(&self.pool) + .await?; + Ok(row) + } + + // ── Query: history ────────────────────────────────────────────── + + /// Look up a history record by its row id. + pub async fn history_by_id(&self, id: i64) -> Result, StoreError> { + let row = sqlx::query("SELECT * FROM task_history WHERE id = ?") + .bind(id) + .fetch_optional(&self.pool) + .await?; + Ok(row.as_ref().map(row_to_history_record)) + } + + /// Recent history entries, newest first. + pub async fn history( + &self, + limit: i32, + offset: i32, + ) -> Result, StoreError> { + let rows = + sqlx::query("SELECT * FROM task_history ORDER BY completed_at DESC LIMIT ? OFFSET ?") + .bind(limit) + .bind(offset) + .fetch_all(&self.pool) + .await?; + Ok(rows.iter().map(row_to_history_record).collect()) + } + + /// History filtered by task type. + pub async fn history_by_type( + &self, + task_type: &str, + limit: i32, + ) -> Result, StoreError> { + let rows = sqlx::query( + "SELECT * FROM task_history WHERE task_type = ? 
ORDER BY completed_at DESC LIMIT ?", + ) + .bind(task_type) + .bind(limit) + .fetch_all(&self.pool) + .await?; + Ok(rows.iter().map(row_to_history_record).collect()) + } + + /// History for a specific key (all past runs of that key). + pub async fn history_by_key(&self, key: &str) -> Result, StoreError> { + let rows = + sqlx::query("SELECT * FROM task_history WHERE key = ? ORDER BY completed_at DESC") + .bind(key) + .fetch_all(&self.pool) + .await?; + Ok(rows.iter().map(row_to_history_record).collect()) + } + + /// Failed tasks from history. + pub async fn failed_tasks(&self, limit: i32) -> Result, StoreError> { + let rows = sqlx::query( + "SELECT * FROM task_history WHERE status = 'failed' ORDER BY completed_at DESC LIMIT ?", + ) + .bind(limit) + .fetch_all(&self.pool) + .await?; + Ok(rows.iter().map(row_to_history_record).collect()) + } + + /// Aggregate stats for a task type from completed history. + pub async fn history_stats(&self, task_type: &str) -> Result { + let row = sqlx::query( + "SELECT + COUNT(*) as total, + COALESCE(AVG(CASE WHEN status = 'completed' THEN duration_ms END), 0.0) as avg_dur, + COALESCE(AVG(CASE WHEN status = 'completed' THEN actual_read_bytes END), 0.0) as avg_read, + COALESCE(AVG(CASE WHEN status = 'completed' THEN actual_write_bytes END), 0.0) as avg_write, + CAST(SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) AS REAL) / MAX(COUNT(*), 1) as fail_rate + FROM task_history WHERE task_type = ?", + ) + .bind(task_type) + .fetch_one(&self.pool) + .await?; + + Ok(TypeStats { + count: row.get::("total"), + avg_duration_ms: row.get::("avg_dur"), + avg_read_bytes: row.get::("avg_read"), + avg_write_bytes: row.get::("avg_write"), + failure_rate: row.get::("fail_rate"), + }) + } + + /// Average IO throughput (bytes/sec) for recently completed tasks of a type. + /// Used by the scheduler for IO budget estimation. 
+    pub async fn avg_throughput(
+        &self,
+        task_type: &str,
+        recent_limit: i32,
+    ) -> Result<(f64, f64), StoreError> {
+        let row: (f64, f64) = sqlx::query_as(
+            "SELECT
+                COALESCE(AVG(CASE WHEN duration_ms > 0 THEN actual_read_bytes * 1000.0 / duration_ms END), 0),
+                COALESCE(AVG(CASE WHEN duration_ms > 0 THEN actual_write_bytes * 1000.0 / duration_ms END), 0)
+             FROM (
+                SELECT actual_read_bytes, actual_write_bytes, duration_ms
+                FROM task_history
+                WHERE task_type = ? AND status = 'completed' AND duration_ms > 0
+                ORDER BY completed_at DESC
+                LIMIT ?
+             )",
+        )
+        .bind(task_type)
+        .bind(recent_limit)
+        .fetch_one(&self.pool)
+        .await?;
+        Ok(row)
+    }
+
+    // ── Unified lookup ──────────────────────────────────────────────
+
+    /// Look up a task by its dedup key, checking the active queue first
+    /// and falling back to history.
+    ///
+    /// This is the low-level building block for [`Scheduler::task_lookup`].
+    /// The `key` parameter is the pre-computed SHA-256 dedup key (as
+    /// returned by [`generate_dedup_key`](crate::task::generate_dedup_key)
+    /// or [`TaskSubmission::effective_key`]).
+    pub async fn task_lookup(&self, key: &str) -> Result<TaskLookup, StoreError> {
+        // Check active queue first (pending / running / paused).
+        if let Some(record) = self.task_by_key(key).await? {
+            return Ok(TaskLookup::Active(record));
+        }
+
+        // Fall back to the most recent history entry.
+        let row = sqlx::query(
+            "SELECT * FROM task_history WHERE key = ? ORDER BY completed_at DESC LIMIT 1",
+        )
+        .bind(key)
+        .fetch_optional(&self.pool)
+        .await?;
+
+        match row {
+            Some(r) => Ok(TaskLookup::History(row_to_history_record(&r))),
+            None => Ok(TaskLookup::NotFound),
+        }
+    }
+
+    // ── Pruning ─────────────────────────────────────────────────────
+
+    /// Prune history records older than `max_age_days` days.
+    /// Returns the number of records deleted.
+    pub async fn prune_history_by_age(&self, max_age_days: i64) -> Result<u64, StoreError> {
+        let result =
+            sqlx::query("DELETE FROM task_history WHERE completed_at < datetime('now', ?)")
+                .bind(format!("-{max_age_days} days"))
+                .execute(&self.pool)
+                .await?;
+        Ok(result.rows_affected())
+    }
+
+    /// Prune history to keep at most `keep_latest` records.
+    /// Returns the number of records deleted.
+    pub async fn prune_history_by_count(&self, keep_latest: i64) -> Result<u64, StoreError> {
+        let result = sqlx::query(
+            "DELETE FROM task_history WHERE id NOT IN (
+                SELECT id FROM task_history ORDER BY completed_at DESC LIMIT ?
+            )",
+        )
+        .bind(keep_latest)
+        .execute(&self.pool)
+        .await?;
+        Ok(result.rows_affected())
+    }
+
+    /// Increment the completion counter and prune every `prune_interval` completions.
+    /// Errors are logged rather than propagated since the task itself already committed.
+    async fn maybe_prune(&self) {
+        if self.retention_policy.is_none() {
+            return;
+        }
+        let count = self.completion_count.fetch_add(1, Ordering::Relaxed);
+        if count % self.prune_interval != 0 {
+            return;
+        }
+        if let Err(e) = self.auto_prune().await {
+            tracing::warn!("history prune failed: {e}");
+        }
+    }
+
+    /// Apply the configured retention policy, if any.
+    async fn auto_prune(&self) -> Result<(), StoreError> {
+        match &self.retention_policy {
+            Some(RetentionPolicy::MaxCount(n)) => {
+                self.prune_history_by_count(*n).await?;
+            }
+            Some(RetentionPolicy::MaxAgeDays(days)) => {
+                self.prune_history_by_age(*days).await?;
+            }
+            None => {}
+        }
+        Ok(())
+    }
+
+    /// Close the store and flush WAL.
+    pub async fn close(&self) {
+        // Consolidate the WAL file into the main database before closing.
+        if let Err(e) = sqlx::raw_sql("PRAGMA wal_checkpoint(TRUNCATE)")
+            .execute(&self.pool)
+            .await
+        {
+            tracing::warn!(error = %e, "WAL checkpoint failed during close");
+        }
+        self.pool.close().await;
+    }
+
+    /// Delete a task from the active queue by id. Returns true if a row was deleted.
+    pub async fn delete(&self, id: i64) -> Result<bool, StoreError> {
+        let result = sqlx::query("DELETE FROM tasks WHERE id = ?")
+            .bind(id)
+            .execute(&self.pool)
+            .await?;
+        Ok(result.rows_affected() > 0)
+    }
+}
+
+// ── Row mapping helpers ─────────────────────────────────────────────
+
+fn parse_datetime(s: &str) -> DateTime<Utc> {
+    // SQLite stores as "YYYY-MM-DD HH:MM:SS". Parse with chrono.
+    chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S")
+        .map(|ndt| ndt.and_utc())
+        .unwrap_or_default()
+}
+
+fn row_to_task_record(row: &sqlx::sqlite::SqliteRow) -> TaskRecord {
+    let priority_val: i32 = row.get("priority");
+    let status_str: String = row.get("status");
+    let created_at_str: String = row.get("created_at");
+    let started_at_str: Option<String> = row.get("started_at");
+
+    let requeue_val: i32 = row.get("requeue");
+    let requeue_priority_val: Option<i32> = row.get("requeue_priority");
+
+    TaskRecord {
+        id: row.get("id"),
+        task_type: row.get("task_type"),
+        key: row.get("key"),
+        priority: Priority::new(priority_val as u8),
+        status: status_str.parse().unwrap_or(TaskStatus::Pending),
+        payload: row.get("payload"),
+        expected_read_bytes: row.get("expected_read_bytes"),
+        expected_write_bytes: row.get("expected_write_bytes"),
+        retry_count: row.get("retry_count"),
+        last_error: row.get("last_error"),
+        created_at: parse_datetime(&created_at_str),
+        started_at: started_at_str.map(|s| parse_datetime(&s)),
+        requeue: requeue_val != 0,
+        requeue_priority: requeue_priority_val.map(|p| Priority::new(p as u8)),
+    }
+}
+
+fn row_to_history_record(row: &sqlx::sqlite::SqliteRow) -> TaskHistoryRecord {
+    let priority_val: i32 = row.get("priority");
+    let status_str: String = row.get("status");
+    let created_at_str: String = row.get("created_at");
+    let started_at_str: Option<String> = row.get("started_at");
+    let completed_at_str: String = row.get("completed_at");
+
+    TaskHistoryRecord {
+        id: row.get("id"),
+        task_type: row.get("task_type"),
+        key: row.get("key"),
+        priority: Priority::new(priority_val as u8),
+        status: status_str.parse().unwrap_or(HistoryStatus::Failed),
+        payload: row.get("payload"),
+        expected_read_bytes: row.get("expected_read_bytes"),
+        expected_write_bytes: row.get("expected_write_bytes"),
+        actual_read_bytes: row.get("actual_read_bytes"),
+        actual_write_bytes: row.get("actual_write_bytes"),
+        retry_count: row.get("retry_count"),
+        last_error: row.get("last_error"),
+        created_at: parse_datetime(&created_at_str),
+        started_at: started_at_str.map(|s| parse_datetime(&s)),
+        completed_at: parse_datetime(&completed_at_str),
+        duration_ms: row.get("duration_ms"),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    async fn test_store() -> TaskStore {
+        TaskStore::open_memory().await.unwrap()
+    }
+
+    fn make_submission(key: &str, priority: Priority) -> TaskSubmission {
+        TaskSubmission {
+            task_type: "test".into(),
+            key: Some(key.into()),
+            priority,
+            payload: Some(b"hello".to_vec()),
+            expected_read_bytes: 1000,
+            expected_write_bytes: 500,
+        }
+    }
+
+    #[tokio::test]
+    async fn submit_and_pop() {
+        let store = test_store().await;
+        let sub = make_submission("job-1", Priority::NORMAL);
+        let expected_key = sub.effective_key();
+
+        let outcome = store.submit(&sub).await.unwrap();
+        assert!(outcome.is_inserted());
+
+        let task = store.pop_next().await.unwrap().unwrap();
+        assert_eq!(task.key, expected_key);
+        assert_eq!(task.status, TaskStatus::Running);
+        assert!(task.started_at.is_some());
+    }
+
+    #[tokio::test]
+    async fn dedup_prevents_duplicate_key() {
+        let store = test_store().await;
+        let sub = make_submission("dup-key", Priority::NORMAL);
+
+        let first = store.submit(&sub).await.unwrap();
+        assert!(first.is_inserted());
+
+        let second = store.submit(&sub).await.unwrap();
+        assert_eq!(second, SubmitOutcome::Duplicate); // same priority → no upgrade
+    }
+
+    #[tokio::test]
+    async fn dedup_upgrades_priority() {
+        let store = test_store().await;
+
+        // Submit at NORMAL priority.
+ let sub_normal = make_submission("upgrade-me", Priority::NORMAL); + let first = store.submit(&sub_normal).await.unwrap(); + assert!(first.is_inserted()); + + // Submit same key at HIGH priority — should upgrade. + let sub_high = make_submission("upgrade-me", Priority::HIGH); + let second = store.submit(&sub_high).await.unwrap(); + assert!(matches!(second, SubmitOutcome::Upgraded(_))); + + // Verify the stored priority was upgraded. + let key = sub_normal.effective_key(); + let task = store.task_by_key(&key).await.unwrap().unwrap(); + assert_eq!(task.priority, Priority::HIGH); + + // Submit at BACKGROUND (lower importance) — should not upgrade. + let sub_bg = make_submission("upgrade-me", Priority::BACKGROUND); + let third = store.submit(&sub_bg).await.unwrap(); + assert_eq!(third, SubmitOutcome::Duplicate); + + // Priority should still be HIGH. + let task = store.task_by_key(&key).await.unwrap().unwrap(); + assert_eq!(task.priority, Priority::HIGH); + } + + #[tokio::test] + async fn dedup_requeues_when_running() { + let store = test_store().await; + + // Submit and pop (transitions to running). + let sub = make_submission("running-task", Priority::NORMAL); + store.submit(&sub).await.unwrap(); + let task = store.pop_next().await.unwrap().unwrap(); + + // Submit same key at HIGH priority — should be Requeued since task is running. + let sub_high = make_submission("running-task", Priority::HIGH); + let outcome = store.submit(&sub_high).await.unwrap(); + assert!(matches!(outcome, SubmitOutcome::Requeued(_))); + + // Verify the requeue flag is set on the running task. + let key = sub.effective_key(); + let running = store.task_by_key(&key).await.unwrap().unwrap(); + assert!(running.requeue); + assert_eq!(running.requeue_priority, Some(Priority::HIGH)); + + // Complete the running task — should reset to pending with requeue_priority. 
+ store + .complete( + task.id, + &TaskResult { + actual_read_bytes: 0, + actual_write_bytes: 0, + }, + ) + .await + .unwrap(); + + // Task should now be pending at HIGH priority. + let requeued = store.task_by_key(&key).await.unwrap().unwrap(); + assert_eq!(requeued.status, TaskStatus::Pending); + assert_eq!(requeued.priority, Priority::HIGH); + assert!(!requeued.requeue); + assert_eq!(requeued.requeue_priority, None); + + // Pop should return it. + let popped = store.pop_next().await.unwrap().unwrap(); + assert_eq!(popped.id, task.id); + } + + #[tokio::test] + async fn dedup_requeue_already_requeued_same_priority() { + let store = test_store().await; + + let sub = make_submission("rq-dup", Priority::NORMAL); + store.submit(&sub).await.unwrap(); + store.pop_next().await.unwrap(); + + // First requeue at HIGH. + let sub_high = make_submission("rq-dup", Priority::HIGH); + let outcome = store.submit(&sub_high).await.unwrap(); + assert!(matches!(outcome, SubmitOutcome::Requeued(_))); + + // Second requeue at same priority — should be Duplicate. + let outcome2 = store.submit(&sub_high).await.unwrap(); + assert_eq!(outcome2, SubmitOutcome::Duplicate); + } + + #[tokio::test] + async fn dedup_requeue_upgrades_priority() { + let store = test_store().await; + + let sub = make_submission("rq-upgrade", Priority::BACKGROUND); + store.submit(&sub).await.unwrap(); + store.pop_next().await.unwrap(); + + // First requeue at NORMAL. + let sub_normal = make_submission("rq-upgrade", Priority::NORMAL); + let outcome = store.submit(&sub_normal).await.unwrap(); + assert!(matches!(outcome, SubmitOutcome::Requeued(_))); + + // Second requeue at HIGH — should upgrade requeue_priority. 
+ let sub_high = make_submission("rq-upgrade", Priority::HIGH); + let outcome2 = store.submit(&sub_high).await.unwrap(); + assert!(matches!(outcome2, SubmitOutcome::Requeued(_))); + + let key = sub.effective_key(); + let task = store.task_by_key(&key).await.unwrap().unwrap(); + assert_eq!(task.requeue_priority, Some(Priority::HIGH)); + } + + #[tokio::test] + async fn permanent_failure_drops_requeue() { + let store = test_store().await; + + let sub = make_submission("fail-rq", Priority::NORMAL); + store.submit(&sub).await.unwrap(); + let task = store.pop_next().await.unwrap().unwrap(); + + // Mark for requeue. + let sub_high = make_submission("fail-rq", Priority::HIGH); + store.submit(&sub_high).await.unwrap(); + + // Permanent failure — requeue flag is dropped. + store.fail(task.id, "boom", false, 0, 0, 0).await.unwrap(); + + // Key should be free for reuse. + let outcome = store.submit(&sub).await.unwrap(); + assert!(outcome.is_inserted()); + } + + #[tokio::test] + async fn dedup_allows_same_key_different_types() { + let store = test_store().await; + + let sub_a = TaskSubmission { + task_type: "type_a".into(), + key: Some("shared-key".into()), + priority: Priority::NORMAL, + payload: None, + expected_read_bytes: 0, + expected_write_bytes: 0, + }; + let sub_b = TaskSubmission { + task_type: "type_b".into(), + key: Some("shared-key".into()), + priority: Priority::NORMAL, + payload: None, + expected_read_bytes: 0, + expected_write_bytes: 0, + }; + + let first = store.submit(&sub_a).await.unwrap(); + assert!(first.is_inserted()); + + // Same logical key, different task type — should NOT dedup. 
+ let second = store.submit(&sub_b).await.unwrap(); + assert!(second.is_inserted()); + } + + #[tokio::test] + async fn dedup_by_payload_when_no_key() { + let store = test_store().await; + + let sub = TaskSubmission { + task_type: "ingest".into(), + key: None, + priority: Priority::NORMAL, + payload: Some(b"same-data".to_vec()), + expected_read_bytes: 0, + expected_write_bytes: 0, + }; + + let first = store.submit(&sub).await.unwrap(); + assert!(first.is_inserted()); + + // Same type + payload → dedup. + let second = store.submit(&sub).await.unwrap(); + assert_eq!(second, SubmitOutcome::Duplicate); + + // Different payload → no dedup. + let sub2 = TaskSubmission { + payload: Some(b"different-data".to_vec()), + ..sub.clone() + }; + let third = store.submit(&sub2).await.unwrap(); + assert!(third.is_inserted()); + } + + #[tokio::test] + async fn priority_ordering() { + let store = test_store().await; + + let bg = make_submission("bg", Priority::BACKGROUND); + let rt = make_submission("rt", Priority::REALTIME); + let normal = make_submission("normal", Priority::NORMAL); + + let bg_key = bg.effective_key(); + let rt_key = rt.effective_key(); + let normal_key = normal.effective_key(); + + store.submit(&bg).await.unwrap(); + store.submit(&rt).await.unwrap(); + store.submit(&normal).await.unwrap(); + + let first = store.pop_next().await.unwrap().unwrap(); + assert_eq!(first.key, rt_key); + + let second = store.pop_next().await.unwrap().unwrap(); + assert_eq!(second.key, normal_key); + + let third = store.pop_next().await.unwrap().unwrap(); + assert_eq!(third.key, bg_key); + } + + #[tokio::test] + async fn complete_moves_to_history() { + let store = test_store().await; + let sub = make_submission("done", Priority::NORMAL); + let key = sub.effective_key(); + store.submit(&sub).await.unwrap(); + let task = store.pop_next().await.unwrap().unwrap(); + + store + .complete( + task.id, + &TaskResult { + actual_read_bytes: 2000, + actual_write_bytes: 1000, + }, + ) + .await + 
.unwrap(); + + // Should be gone from active queue. + assert!(store.task_by_key(&key).await.unwrap().is_none()); + + // Should be in history. + let hist = store.history_by_key(&key).await.unwrap(); + assert_eq!(hist.len(), 1); + assert_eq!(hist[0].status, HistoryStatus::Completed); + assert_eq!(hist[0].actual_read_bytes, Some(2000)); + } + + #[tokio::test] + async fn fail_retryable_requeues() { + let store = test_store().await; + let sub = make_submission("retry-me", Priority::HIGH); + let key = sub.effective_key(); + store.submit(&sub).await.unwrap(); + let task = store.pop_next().await.unwrap().unwrap(); + + store + .fail(task.id, "transient error", true, 3, 0, 0) + .await + .unwrap(); + + // Should still be in active queue as pending with retry_count=1. + let requeued = store.task_by_key(&key).await.unwrap().unwrap(); + assert_eq!(requeued.status, TaskStatus::Pending); + assert_eq!(requeued.retry_count, 1); + assert_eq!(requeued.last_error.as_deref(), Some("transient error")); + } + + #[tokio::test] + async fn fail_exhausted_retries_moves_to_history() { + let store = test_store().await; + let sub = make_submission("permanent", Priority::NORMAL); + let key = sub.effective_key(); + store.submit(&sub).await.unwrap(); + let task = store.pop_next().await.unwrap().unwrap(); + + // First fail: retry_count 0 < 1, requeued with retry_count=1. + store.fail(task.id, "err1", true, 1, 0, 0).await.unwrap(); + let task = store.pop_next().await.unwrap().unwrap(); + assert_eq!(task.retry_count, 1); + // Second fail: retry_count 1 >= max_retries 1, moves to history. + store.fail(task.id, "err2", true, 1, 100, 50).await.unwrap(); + + // Should be in history now. 
+ assert!(store.task_by_key(&key).await.unwrap().is_none()); + let hist = store.failed_tasks(10).await.unwrap(); + assert_eq!(hist.len(), 1); + assert_eq!(hist[0].status, HistoryStatus::Failed); + } + + #[tokio::test] + async fn payload_size_limit() { + let store = test_store().await; + let mut sub = make_submission("big", Priority::NORMAL); + sub.payload = Some(vec![0u8; MAX_PAYLOAD_BYTES + 1]); + + let err = store.submit(&sub).await.unwrap_err(); + assert!(matches!(err, StoreError::PayloadTooLarge)); + } + + #[tokio::test] + async fn running_io_totals() { + let store = test_store().await; + + let mut sub = make_submission("io-1", Priority::NORMAL); + sub.expected_read_bytes = 5000; + sub.expected_write_bytes = 2000; + store.submit(&sub).await.unwrap(); + + let mut sub2 = make_submission("io-2", Priority::NORMAL); + sub2.expected_read_bytes = 3000; + sub2.expected_write_bytes = 1000; + store.submit(&sub2).await.unwrap(); + + // Pop both so they're running. + store.pop_next().await.unwrap(); + store.pop_next().await.unwrap(); + + let (read, write) = store.running_io_totals().await.unwrap(); + assert_eq!(read, 8000); + assert_eq!(write, 3000); + } + + #[tokio::test] + async fn key_freed_after_completion() { + let store = test_store().await; + let sub = make_submission("reuse", Priority::NORMAL); + store.submit(&sub).await.unwrap(); + let task = store.pop_next().await.unwrap().unwrap(); + store + .complete( + task.id, + &TaskResult { + actual_read_bytes: 0, + actual_write_bytes: 0, + }, + ) + .await + .unwrap(); + + // Key should be free for reuse. + let outcome = store.submit(&sub).await.unwrap(); + assert!(outcome.is_inserted()); + } + + #[tokio::test] + async fn history_stats_computation() { + let store = test_store().await; + + // Complete a few tasks. 
+ for i in 0..3 { + let sub = make_submission(&format!("stat-{i}"), Priority::NORMAL); + store.submit(&sub).await.unwrap(); + let task = store.pop_next().await.unwrap().unwrap(); + store + .complete( + task.id, + &TaskResult { + actual_read_bytes: 1000, + actual_write_bytes: 500, + }, + ) + .await + .unwrap(); + } + + let stats = store.history_stats("test").await.unwrap(); + assert_eq!(stats.count, 3); + assert!(stats.failure_rate == 0.0); + } + + #[tokio::test] + async fn pause_and_resume() { + let store = test_store().await; + store + .submit(&make_submission("pausable", Priority::NORMAL)) + .await + .unwrap(); + let task = store.pop_next().await.unwrap().unwrap(); + + store.pause(task.id).await.unwrap(); + let paused = store.paused_tasks().await.unwrap(); + assert_eq!(paused.len(), 1); + assert_eq!(paused[0].status, TaskStatus::Paused); + + store.resume(task.id).await.unwrap(); + let pending = store.pending_tasks(10).await.unwrap(); + assert_eq!(pending.len(), 1); + assert_eq!(pending[0].status, TaskStatus::Pending); + } + + #[tokio::test] + async fn open_with_custom_config() { + let store = TaskStore::open_memory().await.unwrap(); + // Basic smoke test — store is usable. + let count = store.pending_count().await.unwrap(); + assert_eq!(count, 0); + } + + #[tokio::test] + async fn delete_task() { + let store = test_store().await; + let sub = make_submission("del-me", Priority::NORMAL); + let key = sub.effective_key(); + store.submit(&sub).await.unwrap(); + + let task = store.task_by_key(&key).await.unwrap().unwrap(); + assert!(store.delete(task.id).await.unwrap()); + assert!(store.task_by_key(&key).await.unwrap().is_none()); + + // Deleting again returns false. 
+ assert!(!store.delete(task.id).await.unwrap()); + } + + #[tokio::test] + async fn task_by_id_lookup() { + let store = test_store().await; + let sub = make_submission("by-id", Priority::NORMAL); + let id = store.submit(&sub).await.unwrap().id().unwrap(); + + let task = store.task_by_id(id).await.unwrap().unwrap(); + assert_eq!(task.id, id); + assert_eq!(task.key, sub.effective_key()); + + // Non-existent id returns None. + assert!(store.task_by_id(9999).await.unwrap().is_none()); + } + + #[tokio::test] + async fn history_by_id_lookup() { + let store = test_store().await; + let sub = make_submission("hist-id", Priority::NORMAL); + store.submit(&sub).await.unwrap(); + let task = store.pop_next().await.unwrap().unwrap(); + + store + .complete( + task.id, + &TaskResult { + actual_read_bytes: 100, + actual_write_bytes: 50, + }, + ) + .await + .unwrap(); + + // Fetch from history by key to get the history id. + let hist = store.history_by_key(&sub.effective_key()).await.unwrap(); + assert_eq!(hist.len(), 1); + let hist_id = hist[0].id; + + let record = store.history_by_id(hist_id).await.unwrap().unwrap(); + assert_eq!(record.key, sub.effective_key()); + assert_eq!(record.actual_read_bytes, Some(100)); + + // Non-existent id returns None. 
+ assert!(store.history_by_id(9999).await.unwrap().is_none()); + } + + #[tokio::test] + async fn requeue_running_task() { + let store = test_store().await; + let sub = make_submission("rq", Priority::NORMAL); + let key = sub.effective_key(); + store.submit(&sub).await.unwrap(); + let task = store.pop_next().await.unwrap().unwrap(); + assert_eq!(task.status, TaskStatus::Running); + + store.requeue(task.id).await.unwrap(); + let t = store.task_by_key(&key).await.unwrap().unwrap(); + assert_eq!(t.status, TaskStatus::Pending); + assert!(t.started_at.is_none()); + } + + #[tokio::test] + async fn peek_next_does_not_modify_status() { + let store = test_store().await; + let sub = make_submission("peek-me", Priority::NORMAL); + let key = sub.effective_key(); + store.submit(&sub).await.unwrap(); + + // Peek should return the task but leave it pending. + let peeked = store.peek_next().await.unwrap().unwrap(); + assert_eq!(peeked.key, key); + assert_eq!(peeked.status, TaskStatus::Pending); + + // Verify it's still pending in the store. + let t = store.task_by_key(&key).await.unwrap().unwrap(); + assert_eq!(t.status, TaskStatus::Pending); + assert!(t.started_at.is_none()); + + // Peeking again returns the same task. 
+ let peeked2 = store.peek_next().await.unwrap().unwrap(); + assert_eq!(peeked2.id, peeked.id); + } + + #[tokio::test] + async fn peek_next_empty_queue() { + let store = test_store().await; + assert!(store.peek_next().await.unwrap().is_none()); + } + + #[tokio::test] + async fn pop_by_id_claims_pending_task() { + let store = test_store().await; + let sub = make_submission("claim-me", Priority::NORMAL); + let key = sub.effective_key(); + let id = store.submit(&sub).await.unwrap().id().unwrap(); + + let task = store.pop_by_id(id).await.unwrap().unwrap(); + assert_eq!(task.key, key); + assert_eq!(task.status, TaskStatus::Running); + assert!(task.started_at.is_some()); + } + + #[tokio::test] + async fn pop_by_id_returns_none_if_already_running() { + let store = test_store().await; + let sub = make_submission("already-taken", Priority::NORMAL); + store.submit(&sub).await.unwrap(); + + // Pop via pop_next first. + let task = store.pop_next().await.unwrap().unwrap(); + + // pop_by_id on the same task should return None (already running). + assert!(store.pop_by_id(task.id).await.unwrap().is_none()); + } + + #[tokio::test] + async fn pop_by_id_returns_none_for_nonexistent() { + let store = test_store().await; + assert!(store.pop_by_id(9999).await.unwrap().is_none()); + } + + #[tokio::test] + async fn peek_then_pop_by_id_workflow() { + let store = test_store().await; + let sub = make_submission("peek-pop", Priority::NORMAL); + let key = sub.effective_key(); + store.submit(&sub).await.unwrap(); + + // Peek, then claim. + let peeked = store.peek_next().await.unwrap().unwrap(); + let claimed = store.pop_by_id(peeked.id).await.unwrap().unwrap(); + assert_eq!(claimed.key, key); + assert_eq!(claimed.status, TaskStatus::Running); + + // Queue should now be empty for peek. + assert!(store.peek_next().await.unwrap().is_none()); + } + + #[tokio::test] + async fn prune_by_count() { + let store = test_store().await; + + // Complete 5 tasks. 
+ for i in 0..5 { + let sub = make_submission(&format!("prune-{i}"), Priority::NORMAL); + store.submit(&sub).await.unwrap(); + let task = store.pop_next().await.unwrap().unwrap(); + store + .complete( + task.id, + &TaskResult { + actual_read_bytes: 0, + actual_write_bytes: 0, + }, + ) + .await + .unwrap(); + } + + let hist = store.history(100, 0).await.unwrap(); + assert_eq!(hist.len(), 5); + + let deleted = store.prune_history_by_count(3).await.unwrap(); + assert_eq!(deleted, 2); + + let hist = store.history(100, 0).await.unwrap(); + assert_eq!(hist.len(), 3); + } + + #[tokio::test] + async fn submit_batch_inserts_all() { + let store = test_store().await; + let subs: Vec<_> = (0..5) + .map(|i| make_submission(&format!("batch-{i}"), Priority::NORMAL)) + .collect(); + + let results = store.submit_batch(&subs).await.unwrap(); + assert_eq!(results.len(), 5); + assert!(results.iter().all(|r| r.is_inserted())); + + let count = store.pending_count().await.unwrap(); + assert_eq!(count, 5); + } + + #[tokio::test] + async fn submit_batch_dedup() { + let store = test_store().await; + let sub = make_submission("dup", Priority::NORMAL); + + let results = store + .submit_batch(&[sub.clone(), sub.clone()]) + .await + .unwrap(); + assert!(results[0].is_inserted()); + assert_eq!(results[1], SubmitOutcome::Duplicate); // dedup within same batch + + // Submitting again should also dedup. 
+ let results = store.submit_batch(&[sub]).await.unwrap(); + assert_eq!(results[0], SubmitOutcome::Duplicate); + } + + #[tokio::test] + async fn submit_batch_empty() { + let store = test_store().await; + let results = store.submit_batch(&[]).await.unwrap(); + assert!(results.is_empty()); + } + + #[tokio::test] + async fn task_lookup_active() { + let store = test_store().await; + let sub = make_submission("lookup-active", Priority::NORMAL); + let key = sub.effective_key(); + store.submit(&sub).await.unwrap(); + + let result = store.task_lookup(&key).await.unwrap(); + assert!(matches!(result, TaskLookup::Active(ref r) if r.status == TaskStatus::Pending)); + + // Pop so it's running. + store.pop_next().await.unwrap(); + let result = store.task_lookup(&key).await.unwrap(); + assert!(matches!(result, TaskLookup::Active(ref r) if r.status == TaskStatus::Running)); + } + + #[tokio::test] + async fn task_lookup_history() { + let store = test_store().await; + let sub = make_submission("lookup-hist", Priority::NORMAL); + let key = sub.effective_key(); + store.submit(&sub).await.unwrap(); + let task = store.pop_next().await.unwrap().unwrap(); + store + .complete( + task.id, + &TaskResult { + actual_read_bytes: 0, + actual_write_bytes: 0, + }, + ) + .await + .unwrap(); + + let result = store.task_lookup(&key).await.unwrap(); + assert!( + matches!(result, TaskLookup::History(ref r) if r.status == HistoryStatus::Completed) + ); + } + + #[tokio::test] + async fn task_lookup_not_found() { + let store = test_store().await; + let key = crate::task::generate_dedup_key("nope", Some(b"nope")); + let result = store.task_lookup(&key).await.unwrap(); + assert!(matches!(result, TaskLookup::NotFound)); + } + + #[tokio::test] + async fn submit_batch_rejects_oversized_payload() { + let store = test_store().await; + let sub = make_submission("ok", Priority::NORMAL); + let big = TaskSubmission { + task_type: "test".into(), + key: Some("big".into()), + priority: Priority::NORMAL, + payload: 
Some(vec![0u8; MAX_PAYLOAD_BYTES + 1]),
+            expected_read_bytes: 0,
+            expected_write_bytes: 0,
+        };
+
+        // The oversized payload should fail the entire batch — no partial inserts.
+        let err = store.submit_batch(&[sub.clone(), big]).await.unwrap_err();
+        assert!(matches!(err, StoreError::PayloadTooLarge));
+
+        // The first task should NOT have been committed (transaction rolled back).
+        let count = store.pending_count().await.unwrap();
+        assert_eq!(count, 0);
+    }
+}
diff --git a/src/task.rs b/src/task.rs
new file mode 100644
index 0000000..c75d13c
--- /dev/null
+++ b/src/task.rs
@@ -0,0 +1,403 @@
+use chrono::{DateTime, Utc};
+use serde::de::DeserializeOwned;
+use serde::{Deserialize, Serialize};
+use sha2::{Digest, Sha256};
+
+use crate::priority::Priority;
+
+/// Maximum payload size in bytes (1 MiB).
+pub const MAX_PAYLOAD_BYTES: usize = 1_048_576;
+
+/// Lifecycle state of a task in the active queue.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum TaskStatus {
+    Pending,
+    Running,
+    Paused,
+}
+
+impl TaskStatus {
+    pub fn as_str(self) -> &'static str {
+        match self {
+            Self::Pending => "pending",
+            Self::Running => "running",
+            Self::Paused => "paused",
+        }
+    }
+}
+
+impl std::str::FromStr for TaskStatus {
+    type Err = String;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "pending" => Ok(Self::Pending),
+            "running" => Ok(Self::Running),
+            "paused" => Ok(Self::Paused),
+            other => Err(format!("unknown TaskStatus: {other}")),
+        }
+    }
+}
+
+/// Terminal state of a task in history.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum HistoryStatus {
+    Completed,
+    Failed,
+}
+
+impl HistoryStatus {
+    pub fn as_str(self) -> &'static str {
+        match self {
+            Self::Completed => "completed",
+            Self::Failed => "failed",
+        }
+    }
+}
+
+impl std::str::FromStr for HistoryStatus {
+    type Err = String;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "completed" => Ok(Self::Completed),
+            "failed" => Ok(Self::Failed),
+            other => Err(format!("unknown HistoryStatus: {other}")),
+        }
+    }
+}
+
+/// A task in the active queue (pending, running, or paused).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TaskRecord {
+    pub id: i64,
+    pub task_type: String,
+    pub key: String,
+    pub priority: Priority,
+    pub status: TaskStatus,
+    pub payload: Option<Vec<u8>>,
+    pub expected_read_bytes: i64,
+    pub expected_write_bytes: i64,
+    pub retry_count: i32,
+    pub last_error: Option<String>,
+    pub created_at: DateTime<Utc>,
+    pub started_at: Option<DateTime<Utc>>,
+    pub requeue: bool,
+    pub requeue_priority: Option<Priority>,
+}
+
+impl TaskRecord {
+    /// Deserialize the payload blob into a typed value.
+    ///
+    /// Returns `Ok(None)` if the payload is absent, or an error if deserialization fails.
+    pub fn deserialize_payload<T: DeserializeOwned>(
+        &self,
+    ) -> Result<Option<T>, serde_json::Error> {
+        match &self.payload {
+            Some(bytes) => serde_json::from_slice(bytes).map(Some),
+            None => Ok(None),
+        }
+    }
+}
+
+/// A task that has completed or permanently failed.
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TaskHistoryRecord { + pub id: i64, + pub task_type: String, + pub key: String, + pub priority: Priority, + pub status: HistoryStatus, + pub payload: Option>, + pub expected_read_bytes: i64, + pub expected_write_bytes: i64, + pub actual_read_bytes: Option, + pub actual_write_bytes: Option, + pub retry_count: i32, + pub last_error: Option, + pub created_at: DateTime, + pub started_at: Option>, + pub completed_at: DateTime, + pub duration_ms: Option, +} + +/// Reported by the executor on successful completion. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TaskResult { + pub actual_read_bytes: i64, + pub actual_write_bytes: i64, +} + +/// Reported by the executor on failure. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TaskError { + pub message: String, + pub retryable: bool, + pub actual_read_bytes: i64, + pub actual_write_bytes: i64, +} + +// Displays only `message`; the retry flag and byte counts are not printed. +impl std::fmt::Display for TaskError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.message) + } +} + +impl std::error::Error for TaskError {} + +/// Result of a task submission attempt. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum SubmitOutcome { + /// Task was inserted as new. + Inserted(i64), + /// Duplicate key existed; its priority was upgraded (pending/paused tasks only). + Upgraded(i64), + // NOTE(review): "paused" is listed under both `Upgraded` and `Requeued`; + // confirm which outcome a paused duplicate actually produces. + /// Duplicate key existed and is running/paused; marked for re-queue after completion. + Requeued(i64), + /// Duplicate key existed; no changes were made. + Duplicate, +} + +impl SubmitOutcome { + /// Returns the task ID carried by `Inserted`, `Upgraded`, or `Requeued`; + /// `Duplicate` carries no ID and yields `None`. + pub fn id(&self) -> Option { + match self { + Self::Inserted(id) | Self::Upgraded(id) | Self::Requeued(id) => Some(*id), + Self::Duplicate => None, + } + } + + /// Returns `true` only for the `Inserted` variant, i.e. when a new task was inserted.
+ pub fn is_inserted(&self) -> bool { + matches!(self, Self::Inserted(_)) + } +} + +/// Generate a dedup key by hashing the task type and payload. +/// +/// Produces a lowercase hex-encoded SHA-256 digest of the `task_type` bytes, +/// a single `:` separator byte, and then the payload bytes. When there is no +/// payload nothing is hashed after the separator, so `None` and an empty +/// payload produce the same key. +pub fn generate_dedup_key(task_type: &str, payload: Option<&[u8]>) -> String { + let mut hasher = Sha256::new(); + hasher.update(task_type.as_bytes()); + // Separator between the type name and the payload. It is not escaped, so + // ("a:b", "c") and ("a", "b:c") feed identical bytes to the hasher. + hasher.update(b":"); + if let Some(p) = payload { + hasher.update(p); + } + format!("{:x}", hasher.finalize()) +} + +/// Parameters for submitting a new task. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TaskSubmission { + pub task_type: String, + /// Optional dedup key. When `None`, the key is auto-generated by hashing + /// `task_type` and `payload`, so two submissions with the same type and + /// payload are deduplicated automatically. + pub key: Option, + pub priority: Priority, + pub payload: Option>, + pub expected_read_bytes: i64, + pub expected_write_bytes: i64, +} + +impl TaskSubmission { + /// Resolve the effective dedup key. Always incorporates the task type + /// so different task types do not collide on the same logical key. The `:` + /// separator is not escaped, so this holds as long as type names contain no `:`. + /// + /// - Explicit key: `hash(task_type + ":" + key)` + /// - No key: `hash(task_type + ":" + payload)` + pub fn effective_key(&self) -> String { + match &self.key { + Some(k) => generate_dedup_key(&self.task_type, Some(k.as_bytes())), + None => generate_dedup_key(&self.task_type, self.payload.as_deref()), + } + } + + /// Create a submission with a typed payload serialized to JSON bytes. + /// + /// The dedup key is auto-generated from the task type and serialized payload. + /// Use `TaskRecord::deserialize_payload()` on the executor side to recover the type. + /// + /// Returns the `serde_json` error if `data` cannot be serialized.
+ pub fn with_payload( + task_type: &str, + priority: Priority, + data: &T, + expected_read_bytes: i64, + expected_write_bytes: i64, + ) -> Result { + let payload = serde_json::to_vec(data)?; + Ok(Self { + task_type: task_type.to_string(), + key: None, + priority, + payload: Some(payload), + expected_read_bytes, + expected_write_bytes, + }) + } +} + +/// A strongly-typed task that bundles serialization, task type name, priority, +/// and default IO estimates. +/// +/// Implementing this trait collapses the 6 fields of [`TaskSubmission`] into a +/// derive-friendly pattern. Use [`Scheduler::submit_typed`] to submit and +/// [`TaskContext::deserialize_typed`] on the executor side. +/// +/// # Example +/// +/// ```ignore +/// use serde::{Serialize, Deserialize}; +/// use taskmill::TypedTask; +/// +/// #[derive(Serialize, Deserialize)] +/// struct Thumbnail { path: String, size: u32 } +/// +/// impl TypedTask for Thumbnail { +/// const TASK_TYPE: &'static str = "thumbnail"; +/// fn expected_read_bytes(&self) -> i64 { 4096 } +/// fn expected_write_bytes(&self) -> i64 { 1024 } +/// } +/// ``` +pub trait TypedTask: Serialize + DeserializeOwned + Send + 'static { + /// Unique name used to register and look up the executor. + const TASK_TYPE: &'static str; + + /// Estimated bytes this task will read. Default: 0. + fn expected_read_bytes(&self) -> i64 { + 0 + } + + /// Estimated bytes this task will write. Default: 0. + fn expected_write_bytes(&self) -> i64 { + 0 + } + + /// Scheduling priority. Default: [`Priority::NORMAL`]. + fn priority(&self) -> Priority { + Priority::NORMAL + } +} + +impl TaskSubmission { + /// Create a submission from a [`TypedTask`], serializing the payload and + /// pulling task type, priority, and IO estimates from the trait. + /// + /// `key` is left as `None`, so the dedup key is derived from the task type + /// and the serialized payload. Returns the `serde_json` error if the task + /// cannot be serialized.
+ pub fn from_typed(task: &T) -> Result { + let payload = serde_json::to_vec(task)?; + Ok(Self { + task_type: T::TASK_TYPE.to_string(), + key: None, + priority: task.priority(), + payload: Some(payload), + expected_read_bytes: task.expected_read_bytes(), + expected_write_bytes: task.expected_write_bytes(), + }) + } +} + +/// Unified lookup result for querying a task by its dedup inputs. +/// +/// Returned by [`TaskStore::task_lookup`] and [`Scheduler::task_lookup`]. +/// Tells the caller whether a task is currently active (pending, running, +/// or paused) or has finished (completed or failed), without requiring +/// them to manually compute the dedup key or query two tables. +/// +/// Serialized as an adjacently tagged enum: the variant name is stored under +/// `location` and the contained record under `record`. +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(tag = "location", content = "record")] +pub enum TaskLookup { + /// Task is in the active queue (pending, running, or paused). + Active(TaskRecord), + /// Task has finished and is in the history table. + /// Contains the most recent history entry for that key. + History(TaskHistoryRecord), + /// No task with this key exists in either table. + NotFound, +} + +/// Aggregate statistics for a task type from history. +/// +/// The derived `Default` yields a zero count and zero averages.
+#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct TypeStats { + pub count: i64, + pub avg_duration_ms: f64, + pub avg_read_bytes: f64, + pub avg_write_bytes: f64, + pub failure_rate: f64, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[derive(Serialize, Deserialize, Debug, PartialEq)] + struct Thumbnail { + path: String, + size: u32, + } + + impl TypedTask for Thumbnail { + const TASK_TYPE: &'static str = "thumbnail"; + + fn expected_read_bytes(&self) -> i64 { + 4096 + } + + fn expected_write_bytes(&self) -> i64 { + 1024 + } + } + + #[test] + fn typed_task_to_submission() { + let task = Thumbnail { + path: "/photos/a.jpg".into(), + size: 256, + }; + let sub = TaskSubmission::from_typed(&task).unwrap(); + + assert_eq!(sub.task_type, "thumbnail"); + assert_eq!(sub.priority, Priority::NORMAL); + assert_eq!(sub.expected_read_bytes, 4096); + assert_eq!(sub.expected_write_bytes, 1024); + assert!(sub.key.is_none()); + + // The JSON payload must deserialize back into a `Thumbnail` equal to the input. + let recovered: Thumbnail = serde_json::from_slice(sub.payload.as_ref().unwrap()).unwrap(); + assert_eq!(recovered, task); + } + + // Overriding `priority()` on the trait must be reflected in the submission. + #[test] + fn typed_task_custom_priority() { + #[derive(Serialize, Deserialize)] + struct Urgent { + id: u64, + } + + impl TypedTask for Urgent { + const TASK_TYPE: &'static str = "urgent"; + + fn priority(&self) -> Priority { + Priority::HIGH + } + } + + let sub = TaskSubmission::from_typed(&Urgent { id: 42 }).unwrap(); + assert_eq!(sub.priority, Priority::HIGH); + assert_eq!(sub.task_type, "urgent"); + } + + // A task that overrides nothing gets zero IO estimates and `Priority::NORMAL`. + #[test] + fn typed_task_defaults() { + #[derive(Serialize, Deserialize)] + struct Minimal; + + impl TypedTask for Minimal { + const TASK_TYPE: &'static str = "minimal"; + } + + let sub = TaskSubmission::from_typed(&Minimal).unwrap(); + assert_eq!(sub.expected_read_bytes, 0); + assert_eq!(sub.expected_write_bytes, 0); + assert_eq!(sub.priority, Priority::NORMAL); + } +}