diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
new file mode 100644
index 0000000..b353dd9
--- /dev/null
+++ b/.github/workflows/docs.yml
@@ -0,0 +1,84 @@
+name: Docs
+
+on:
+ push:
+ branches: [main]
+ paths:
+ - 'book/**'
+ - 'docs/**'
+ - '.github/workflows/docs.yml'
+ - 'CHANGELOG.md'
+ - 'CONTRIBUTING.md'
+ - 'README.md'
+ pull_request:
+ paths:
+ - 'book/**'
+ - 'docs/**'
+ - '.github/workflows/docs.yml'
+ - 'CHANGELOG.md'
+ - 'CONTRIBUTING.md'
+ - 'README.md'
+ workflow_dispatch:
+
+permissions:
+ contents: read
+ pages: write
+ id-token: write
+
+concurrency:
+ group: docs-${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - uses: dtolnay/rust-toolchain@stable
+
+ # Key hashes this workflow file, so the cache auto-invalidates when any
+ # tool version in the install step changes; bump "v2" only to force a refresh
+ - name: Cache mdbook binaries
+ id: cache-mdbook
+ uses: actions/cache@v5
+ with:
+ path: ~/.cargo/bin/mdbook*
+ key: mdbook-v2-${{ hashFiles('.github/workflows/docs.yml') }}
+
+ - name: Install mdbook and plugins
+ if: steps.cache-mdbook.outputs.cache-hit != 'true'
+ run: |
+ cargo install \
+ mdbook@0.4.40 \
+ mdbook-linkcheck@0.7.7 \
+ mdbook-toc@0.14.2 \
+ mdbook-admonish@1.18.0 \
+ mdbook-mermaid@0.14.1
+
+ - name: Build book
+ run: mdbook build book
+
+ - name: Upload Pages artifact
+ if: github.event_name == 'push' || github.event_name == 'workflow_dispatch'
+ uses: actions/upload-pages-artifact@v4
+ with:
+ path: target/book/html
+
+ - name: Verify build (PR)
+ if: github.event_name == 'pull_request'
+ run: |
+ test -f target/book/html/index.html
+ test -s target/book/html/index.html
+ echo "Build OK"
+
+ deploy:
+ needs: build
+ if: github.event_name == 'push' || github.event_name == 'workflow_dispatch'
+ runs-on: ubuntu-latest
+ environment:
+ name: github-pages
+ url: ${{ steps.deployment.outputs.page_url }}
+ steps:
+ - name: Deploy to GitHub Pages
+ id: deployment
+ uses: actions/deploy-pages@v4
diff --git a/.gitignore b/.gitignore
index 416c1dd..4aa5334 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,9 @@ result-*
# Ruff cache (leftover from Python tooling)
.ruff_cache/
+# mdBook build output
+/target/book/
+
# integration test artifacts
tests/integration/level1/fixtures/*.flow
tests/integration/level1/out/
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ea1c55c..2d855a6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,3 @@
-# Changelog
-
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 81fa319..d50a310 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,5 +1,3 @@
-# Contributing — Local Testing Guide
-
This document covers how to run the three test tracks locally.
## Prerequisites
diff --git a/Cargo.toml b/Cargo.toml
index 7f3d875..045a156 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,6 +17,7 @@ exclude = [
"docs/demo.mp4", ".github/**", "scripts/**",
"flake.nix", "flake.lock", ".envrc", ".direnv/**",
".sisyphus/**", ".ruff_cache/**",
+ "book/**", "docs/**",
]
[[bin]]
diff --git a/README.md b/README.md
index ab7bffe..1ac5ef1 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@ A Rust rewrite of [mitmproxy2swagger](https://github.com/alufers/mitmproxy2swagg
[](https://crates.io/crates/mitm2openapi)
[](https://crates.io/crates/mitm2openapi)
[](https://docs.rs/mitm2openapi)
+[](https://arkptz.github.io/mitm2openapi/)
[](LICENSE)
@@ -39,17 +40,13 @@ Credit to [@alufers](https://github.com/alufers) for the original tool that pion
## Installation
-### From binary releases
-
-Download a pre-built binary from [GitHub Releases](https://github.com/Arkptz/mitm2openapi/releases).
-
-### From source
-
```bash
-cargo install --git https://github.com/Arkptz/mitm2openapi
+cargo install mitm2openapi
```
-## Quick Start
+Or download a pre-built binary from [GitHub Releases](https://github.com/Arkptz/mitm2openapi/releases).
+
+## Quick start
```bash
# 1. Capture traffic with mitmproxy
@@ -64,206 +61,13 @@ mitm2openapi discover -i capture.flow -o templates.yaml -p "https://api.example.
mitm2openapi generate -i capture.flow -t templates.yaml -o openapi.yaml -p "https://api.example.com"
```
-### Skip the manual edit
-
-If you know which paths you care about up front, use `--exclude-patterns`
-and `--include-patterns` to let `discover` do the curation:
-
-```bash
-mitm2openapi discover \
- -i capture.flow -o templates.yaml -p "https://api.example.com" \
- --exclude-patterns '/static/**,/images/**,*.css,*.js,*.svg' \
- --include-patterns '/api/**,/v2/**'
-
-mitm2openapi generate \
- -i capture.flow -t templates.yaml -o openapi.yaml -p "https://api.example.com"
-```
-
-Paths matching `--include-patterns` are auto-activated (emitted without
-the `ignore:` prefix). Paths matching `--exclude-patterns` are dropped
-entirely. Everything else still gets `ignore:` for manual review.
-
-
-CLI Reference (click to expand)
-
-### `discover`
-
-Scan captured traffic and produce a templates file listing all observed endpoints.
-
-```
-mitm2openapi discover [OPTIONS] -i -o
-
-## Resource Limits
-
-To prevent denial-of-service when processing untrusted captures, `mitm2openapi`
-enforces several configurable limits:
-
-| Flag | Default | Purpose |
-|------|---------|---------|
-| `--max-input-size` | 2 GiB | Reject files larger than this before reading |
-| `--max-payload-size` | 256 MiB | Cap on individual tnetstring payload allocation |
-| `--max-depth` | 256 | Recursion depth limit for nested tnetstring structures |
-| `--max-body-size` | 64 MiB | Maximum request/response body considered during schema inference |
-| `--allow-symlinks` | off | By default, symlinked inputs are rejected to prevent path-traversal on shared CI runners |
-
-In addition to the configurable limits above, the following per-field caps are
-applied unconditionally to prevent data corruption:
-
-| Field | Cap | Behaviour |
-|-------|-----|-----------|
-| Header name | 8 KiB | Dropped (other headers still processed) |
-| Header value | 64 KiB | Truncated to cap |
-| Form fields per request | 1 000 | Excess fields ignored |
-| URL scheme | `http` / `https` only | Non-HTTP flows silently skipped |
-| Port number | 1–65 535 | Out-of-range port drops the request |
-| HTTP status code | 100–599 | Invalid codes treated as no response |
-
-Identity fields (scheme, host, path, method, header names) require valid UTF-8.
-Flows with non-UTF-8 identity bytes are skipped to prevent data aliasing through
-replacement-character collisions. Control characters in paths are stripped
-automatically.
-
-Increase `--max-input-size` if you work with captures larger than 2 GiB (e.g.
-`--max-input-size 8GiB`). The other limits rarely need tuning.
-
-Both mitmproxy flow files and HAR files are processed incrementally — memory usage
-stays bounded regardless of input size.
-
-## Diagnostics
-
-When the tnetstring parser encounters corruption in a mitmproxy flow file, it
-halts and emits a warn-level log with the byte offset, number of successfully
-parsed entries, and an error classification. No resync is attempted — binary
-payloads can contain bytes that mimic valid tnetstring length prefixes, so
-scanning forward would produce phantom flows.
-
-### Structured report (`--report`)
-
-Pass `--report ` to either `discover` or `generate` to write a JSON
-processing summary. This is useful for CI pipelines that need structured data
-instead of log scraping.
-
-```json
-{
- "report_version": 1,
- "tool_version": "0.2.3",
- "input": {
- "path": "capture.flow",
- "format": "Auto",
- "size_bytes": 102400
- },
- "result": {
- "flows_read": 150,
- "flows_emitted": 148,
- "paths_in_spec": 12
- },
- "events": {
- "parse_error": {
- "TNetString parse error at byte 98304: unexpected end of input": 1
- }
- }
-}
-```
-
-### Strict mode
+## Documentation
-Pass `--strict` to either `discover` or `generate` to treat any warning-level
-event as a hard failure. The process exits with code 2 if any resource cap
-fired, a flow was rejected, or a parse error was encountered.
-
-This is designed for CI gates where silent degradation is unacceptable:
-
-```bash
-mitm2openapi discover -i capture.flow -o templates.yaml -p https://api.example.com --strict \
- || echo "FAIL: corrupt or over-limit flows detected"
-```
-
-Without `--strict`, the same conditions are logged at warn level and processing
-continues (exit code 0).
-
-## Supported Formats
-
-| Format | Versions | Extension |
-|--------|----------|-----------|
-| mitmproxy flow dumps | v19, v20, v21 | `.flow` |
-| HAR (HTTP Archive) | 1.2 (incrementally parsed) | `.har` |
-
-Format is auto-detected from file content. Use `--format` to override.
-
-## Migration from Python mitmproxy2swagger
-
-| Python (`mitmproxy2swagger`) | Rust (`mitm2openapi`) |
-|-----|-----|
-| `pip install mitmproxy2swagger` | Single binary, no runtime |
-| `mitmproxy2swagger -i -o -p ` | Two-step: `discover` then `generate` |
-| Edits spec file in-place | Separate templates file for curation |
-| Requires Python 3.x + mitmproxy | Standalone binary |
-| Supports mitmproxy only | Supports mitmproxy flow dumps + HAR |
-
-### Key differences
-
-- **Two-step workflow**: `discover` produces a templates file; you curate it; `generate` produces the final spec. This separates endpoint selection from spec generation.
-- **Templates file**: Discovered endpoints are prefixed with `ignore:`. Remove the prefix to include an endpoint. This replaces editing the output spec directly.
-- **No Python dependency**: Ships as a single static binary for Linux, macOS, and Windows.
-- **HAR support**: Process HAR exports from browser DevTools or other HTTP tools.
+Full documentation at **[arkptz.github.io/mitm2openapi](https://arkptz.github.io/mitm2openapi/)** — covers installation, traffic capture setup, the full discover → curate → generate pipeline, CLI reference, resource limits, filtering, strict mode, format details, benchmarks, and security model.
## Benchmarks
-Automated CI benchmark runs weekly against the Python original
-([`mitmproxy2swagger`](https://github.com/alufers/mitmproxy2swagger)). See
-[docs/benchmarks.md](docs/benchmarks.md) for the latest timing and memory
-comparison on a ~80 MB synthetic capture, or
-trigger a fresh run via
-[Actions → Benchmark](../../actions/workflows/bench.yml).
-
-Reproduce locally with the commands documented in the workflow file.
+Automated CI benchmarks run weekly against the Python original. See [docs/benchmarks.md](docs/benchmarks.md) for the latest comparison on a ~80 MB synthetic capture.
## Contributing
diff --git a/book/book.toml b/book/book.toml
new file mode 100644
index 0000000..5567b70
--- /dev/null
+++ b/book/book.toml
@@ -0,0 +1,58 @@
+[book]
+title = "mitm2openapi"
+authors = ["Arkptz"]
+description = "Convert mitmproxy flow dumps and HAR files to OpenAPI 3.0 specs"
+src = "src"
+language = "en"
+
+[build]
+build-dir = "../target/book"
+create-missing = false
+
+[output.html]
+git-repository-url = "https://github.com/Arkptz/mitm2openapi"
+edit-url-template = "https://github.com/Arkptz/mitm2openapi/edit/main/book/{path}"
+default-theme = "ayu"
+preferred-dark-theme = "ayu"
+site-url = "/mitm2openapi/"
+additional-css = ["./mdbook-admonish.css"]
+
+[output.html.fold]
+enable = true
+level = 1
+
+[output.html.search]
+enable = true
+limit-results = 20
+teaser-word-count = 30
+use-boolean-and = true
+boost-title = 2
+boost-hierarchy = 1
+boost-paragraph = 1
+expand = true
+heading-split-level = 3
+
+[output.linkcheck]
+warning-policy = "error"
+follow-web-links = false
+exclude = [
+ # CHANGELOG: [Unreleased], [skip ci], [0.1.0]
+ '^Unreleased$',
+ '^skip ci$',
+ '^\d+\.\d+(\.\d+)?$',
+ # Benchmark table units: Mean [s], Min [s], Max [s]
+ '^s$',
+ # Benchmark workflow path inside included docs/benchmarks.md
+ '\.github/workflows/bench\.yml',
+]
+
+[preprocessor.toc]
+command = "mdbook-toc"
+renderer = ["html"]
+
+[preprocessor.admonish]
+command = "mdbook-admonish"
+assets_version = "3.0.2" # do not edit: managed by `mdbook-admonish install`
+
+[preprocessor.mermaid]
+command = "mdbook-mermaid"
diff --git a/book/mdbook-admonish.css b/book/mdbook-admonish.css
new file mode 100644
index 0000000..45aeff0
--- /dev/null
+++ b/book/mdbook-admonish.css
@@ -0,0 +1,348 @@
+@charset "UTF-8";
+:is(.admonition) {
+ display: flow-root;
+ margin: 1.5625em 0;
+ padding: 0 1.2rem;
+ color: var(--fg);
+ page-break-inside: avoid;
+ background-color: var(--bg);
+ border: 0 solid black;
+ border-inline-start-width: 0.4rem;
+ border-radius: 0.2rem;
+ box-shadow: 0 0.2rem 1rem rgba(0, 0, 0, 0.05), 0 0 0.1rem rgba(0, 0, 0, 0.1);
+}
+@media print {
+ :is(.admonition) {
+ box-shadow: none;
+ }
+}
+:is(.admonition) > * {
+ box-sizing: border-box;
+}
+:is(.admonition) :is(.admonition) {
+ margin-top: 1em;
+ margin-bottom: 1em;
+}
+:is(.admonition) > .tabbed-set:only-child {
+ margin-top: 0;
+}
+html :is(.admonition) > :last-child {
+ margin-bottom: 1.2rem;
+}
+
+a.admonition-anchor-link {
+ display: none;
+ position: absolute;
+ left: -1.2rem;
+ padding-right: 1rem;
+}
+a.admonition-anchor-link:link, a.admonition-anchor-link:visited {
+ color: var(--fg);
+}
+a.admonition-anchor-link:link:hover, a.admonition-anchor-link:visited:hover {
+ text-decoration: none;
+}
+a.admonition-anchor-link::before {
+ content: "§";
+}
+
+:is(.admonition-title, summary.admonition-title) {
+ position: relative;
+ min-height: 4rem;
+ margin-block: 0;
+ margin-inline: -1.6rem -1.2rem;
+ padding-block: 0.8rem;
+ padding-inline: 4.4rem 1.2rem;
+ font-weight: 700;
+ background-color: rgba(68, 138, 255, 0.1);
+ print-color-adjust: exact;
+ -webkit-print-color-adjust: exact;
+ display: flex;
+}
+:is(.admonition-title, summary.admonition-title) p {
+ margin: 0;
+}
+html :is(.admonition-title, summary.admonition-title):last-child {
+ margin-bottom: 0;
+}
+:is(.admonition-title, summary.admonition-title)::before {
+ position: absolute;
+ top: 0.625em;
+ inset-inline-start: 1.6rem;
+ width: 2rem;
+ height: 2rem;
+ background-color: #448aff;
+ print-color-adjust: exact;
+ -webkit-print-color-adjust: exact;
+ mask-image: url('data:image/svg+xml;charset=utf-8,');
+ -webkit-mask-image: url('data:image/svg+xml;charset=utf-8,');
+ mask-repeat: no-repeat;
+ -webkit-mask-repeat: no-repeat;
+ mask-size: contain;
+ -webkit-mask-size: contain;
+ content: "";
+}
+:is(.admonition-title, summary.admonition-title):hover a.admonition-anchor-link {
+ display: initial;
+}
+
+details.admonition > summary.admonition-title::after {
+ position: absolute;
+ top: 0.625em;
+ inset-inline-end: 1.6rem;
+ height: 2rem;
+ width: 2rem;
+ background-color: currentcolor;
+ mask-image: var(--md-details-icon);
+ -webkit-mask-image: var(--md-details-icon);
+ mask-repeat: no-repeat;
+ -webkit-mask-repeat: no-repeat;
+ mask-size: contain;
+ -webkit-mask-size: contain;
+ content: "";
+ transform: rotate(0deg);
+ transition: transform 0.25s;
+}
+details[open].admonition > summary.admonition-title::after {
+ transform: rotate(90deg);
+}
+
+:root {
+ --md-details-icon: url("data:image/svg+xml;charset=utf-8,");
+}
+
+:root {
+ --md-admonition-icon--admonish-note: url("data:image/svg+xml;charset=utf-8,");
+ --md-admonition-icon--admonish-abstract: url("data:image/svg+xml;charset=utf-8,");
+ --md-admonition-icon--admonish-info: url("data:image/svg+xml;charset=utf-8,");
+ --md-admonition-icon--admonish-tip: url("data:image/svg+xml;charset=utf-8,");
+ --md-admonition-icon--admonish-success: url("data:image/svg+xml;charset=utf-8,");
+ --md-admonition-icon--admonish-question: url("data:image/svg+xml;charset=utf-8,");
+ --md-admonition-icon--admonish-warning: url("data:image/svg+xml;charset=utf-8,");
+ --md-admonition-icon--admonish-failure: url("data:image/svg+xml;charset=utf-8,");
+ --md-admonition-icon--admonish-danger: url("data:image/svg+xml;charset=utf-8,");
+ --md-admonition-icon--admonish-bug: url("data:image/svg+xml;charset=utf-8,");
+ --md-admonition-icon--admonish-example: url("data:image/svg+xml;charset=utf-8,");
+ --md-admonition-icon--admonish-quote: url("data:image/svg+xml;charset=utf-8,");
+}
+
+:is(.admonition):is(.admonish-note) {
+ border-color: #448aff;
+}
+
+:is(.admonish-note) > :is(.admonition-title, summary.admonition-title) {
+ background-color: rgba(68, 138, 255, 0.1);
+}
+:is(.admonish-note) > :is(.admonition-title, summary.admonition-title)::before {
+ background-color: #448aff;
+ mask-image: var(--md-admonition-icon--admonish-note);
+ -webkit-mask-image: var(--md-admonition-icon--admonish-note);
+ mask-repeat: no-repeat;
+ -webkit-mask-repeat: no-repeat;
+ mask-size: contain;
+ -webkit-mask-size: contain;
+}
+
+:is(.admonition):is(.admonish-abstract, .admonish-summary, .admonish-tldr) {
+ border-color: #00b0ff;
+}
+
+:is(.admonish-abstract, .admonish-summary, .admonish-tldr) > :is(.admonition-title, summary.admonition-title) {
+ background-color: rgba(0, 176, 255, 0.1);
+}
+:is(.admonish-abstract, .admonish-summary, .admonish-tldr) > :is(.admonition-title, summary.admonition-title)::before {
+ background-color: #00b0ff;
+ mask-image: var(--md-admonition-icon--admonish-abstract);
+ -webkit-mask-image: var(--md-admonition-icon--admonish-abstract);
+ mask-repeat: no-repeat;
+ -webkit-mask-repeat: no-repeat;
+ mask-size: contain;
+ -webkit-mask-size: contain;
+}
+
+:is(.admonition):is(.admonish-info, .admonish-todo) {
+ border-color: #00b8d4;
+}
+
+:is(.admonish-info, .admonish-todo) > :is(.admonition-title, summary.admonition-title) {
+ background-color: rgba(0, 184, 212, 0.1);
+}
+:is(.admonish-info, .admonish-todo) > :is(.admonition-title, summary.admonition-title)::before {
+ background-color: #00b8d4;
+ mask-image: var(--md-admonition-icon--admonish-info);
+ -webkit-mask-image: var(--md-admonition-icon--admonish-info);
+ mask-repeat: no-repeat;
+ -webkit-mask-repeat: no-repeat;
+ mask-size: contain;
+ -webkit-mask-size: contain;
+}
+
+:is(.admonition):is(.admonish-tip, .admonish-hint, .admonish-important) {
+ border-color: #00bfa5;
+}
+
+:is(.admonish-tip, .admonish-hint, .admonish-important) > :is(.admonition-title, summary.admonition-title) {
+ background-color: rgba(0, 191, 165, 0.1);
+}
+:is(.admonish-tip, .admonish-hint, .admonish-important) > :is(.admonition-title, summary.admonition-title)::before {
+ background-color: #00bfa5;
+ mask-image: var(--md-admonition-icon--admonish-tip);
+ -webkit-mask-image: var(--md-admonition-icon--admonish-tip);
+ mask-repeat: no-repeat;
+ -webkit-mask-repeat: no-repeat;
+ mask-size: contain;
+ -webkit-mask-size: contain;
+}
+
+:is(.admonition):is(.admonish-success, .admonish-check, .admonish-done) {
+ border-color: #00c853;
+}
+
+:is(.admonish-success, .admonish-check, .admonish-done) > :is(.admonition-title, summary.admonition-title) {
+ background-color: rgba(0, 200, 83, 0.1);
+}
+:is(.admonish-success, .admonish-check, .admonish-done) > :is(.admonition-title, summary.admonition-title)::before {
+ background-color: #00c853;
+ mask-image: var(--md-admonition-icon--admonish-success);
+ -webkit-mask-image: var(--md-admonition-icon--admonish-success);
+ mask-repeat: no-repeat;
+ -webkit-mask-repeat: no-repeat;
+ mask-size: contain;
+ -webkit-mask-size: contain;
+}
+
+:is(.admonition):is(.admonish-question, .admonish-help, .admonish-faq) {
+ border-color: #64dd17;
+}
+
+:is(.admonish-question, .admonish-help, .admonish-faq) > :is(.admonition-title, summary.admonition-title) {
+ background-color: rgba(100, 221, 23, 0.1);
+}
+:is(.admonish-question, .admonish-help, .admonish-faq) > :is(.admonition-title, summary.admonition-title)::before {
+ background-color: #64dd17;
+ mask-image: var(--md-admonition-icon--admonish-question);
+ -webkit-mask-image: var(--md-admonition-icon--admonish-question);
+ mask-repeat: no-repeat;
+ -webkit-mask-repeat: no-repeat;
+ mask-size: contain;
+ -webkit-mask-size: contain;
+}
+
+:is(.admonition):is(.admonish-warning, .admonish-caution, .admonish-attention) {
+ border-color: #ff9100;
+}
+
+:is(.admonish-warning, .admonish-caution, .admonish-attention) > :is(.admonition-title, summary.admonition-title) {
+ background-color: rgba(255, 145, 0, 0.1);
+}
+:is(.admonish-warning, .admonish-caution, .admonish-attention) > :is(.admonition-title, summary.admonition-title)::before {
+ background-color: #ff9100;
+ mask-image: var(--md-admonition-icon--admonish-warning);
+ -webkit-mask-image: var(--md-admonition-icon--admonish-warning);
+ mask-repeat: no-repeat;
+ -webkit-mask-repeat: no-repeat;
+ mask-size: contain;
+ -webkit-mask-size: contain;
+}
+
+:is(.admonition):is(.admonish-failure, .admonish-fail, .admonish-missing) {
+ border-color: #ff5252;
+}
+
+:is(.admonish-failure, .admonish-fail, .admonish-missing) > :is(.admonition-title, summary.admonition-title) {
+ background-color: rgba(255, 82, 82, 0.1);
+}
+:is(.admonish-failure, .admonish-fail, .admonish-missing) > :is(.admonition-title, summary.admonition-title)::before {
+ background-color: #ff5252;
+ mask-image: var(--md-admonition-icon--admonish-failure);
+ -webkit-mask-image: var(--md-admonition-icon--admonish-failure);
+ mask-repeat: no-repeat;
+ -webkit-mask-repeat: no-repeat;
+ mask-size: contain;
+ -webkit-mask-size: contain;
+}
+
+:is(.admonition):is(.admonish-danger, .admonish-error) {
+ border-color: #ff1744;
+}
+
+:is(.admonish-danger, .admonish-error) > :is(.admonition-title, summary.admonition-title) {
+ background-color: rgba(255, 23, 68, 0.1);
+}
+:is(.admonish-danger, .admonish-error) > :is(.admonition-title, summary.admonition-title)::before {
+ background-color: #ff1744;
+ mask-image: var(--md-admonition-icon--admonish-danger);
+ -webkit-mask-image: var(--md-admonition-icon--admonish-danger);
+ mask-repeat: no-repeat;
+ -webkit-mask-repeat: no-repeat;
+ mask-size: contain;
+ -webkit-mask-size: contain;
+}
+
+:is(.admonition):is(.admonish-bug) {
+ border-color: #f50057;
+}
+
+:is(.admonish-bug) > :is(.admonition-title, summary.admonition-title) {
+ background-color: rgba(245, 0, 87, 0.1);
+}
+:is(.admonish-bug) > :is(.admonition-title, summary.admonition-title)::before {
+ background-color: #f50057;
+ mask-image: var(--md-admonition-icon--admonish-bug);
+ -webkit-mask-image: var(--md-admonition-icon--admonish-bug);
+ mask-repeat: no-repeat;
+ -webkit-mask-repeat: no-repeat;
+ mask-size: contain;
+ -webkit-mask-size: contain;
+}
+
+:is(.admonition):is(.admonish-example) {
+ border-color: #7c4dff;
+}
+
+:is(.admonish-example) > :is(.admonition-title, summary.admonition-title) {
+ background-color: rgba(124, 77, 255, 0.1);
+}
+:is(.admonish-example) > :is(.admonition-title, summary.admonition-title)::before {
+ background-color: #7c4dff;
+ mask-image: var(--md-admonition-icon--admonish-example);
+ -webkit-mask-image: var(--md-admonition-icon--admonish-example);
+ mask-repeat: no-repeat;
+ -webkit-mask-repeat: no-repeat;
+ mask-size: contain;
+ -webkit-mask-size: contain;
+}
+
+:is(.admonition):is(.admonish-quote, .admonish-cite) {
+ border-color: #9e9e9e;
+}
+
+:is(.admonish-quote, .admonish-cite) > :is(.admonition-title, summary.admonition-title) {
+ background-color: rgba(158, 158, 158, 0.1);
+}
+:is(.admonish-quote, .admonish-cite) > :is(.admonition-title, summary.admonition-title)::before {
+ background-color: #9e9e9e;
+ mask-image: var(--md-admonition-icon--admonish-quote);
+ -webkit-mask-image: var(--md-admonition-icon--admonish-quote);
+ mask-repeat: no-repeat;
+ -webkit-mask-repeat: no-repeat;
+ mask-size: contain;
+ -webkit-mask-size: contain;
+}
+
+.navy :is(.admonition) {
+ background-color: var(--sidebar-bg);
+}
+
+.ayu :is(.admonition),
+.coal :is(.admonition) {
+ background-color: var(--theme-hover);
+}
+
+.rust :is(.admonition) {
+ background-color: var(--sidebar-bg);
+ color: var(--sidebar-fg);
+}
+.rust .admonition-anchor-link:link, .rust .admonition-anchor-link:visited {
+ color: var(--sidebar-fg);
+}
diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md
new file mode 100644
index 0000000..4acbded
--- /dev/null
+++ b/book/src/SUMMARY.md
@@ -0,0 +1,30 @@
+# Summary
+
+[Introduction](./introduction.md)
+
+# Getting started
+- [Installation](./getting-started/installation.md)
+- [Quick start](./getting-started/quick-start.md)
+- [Capturing traffic](./getting-started/capturing.md)
+
+# Usage
+- [Discover, curate, generate](./usage/pipeline.md)
+- [Filtering endpoints](./usage/filtering.md)
+- [Resource limits](./usage/resource-limits.md)
+- [Strict mode](./usage/strict-mode.md)
+- [Processing reports](./usage/reports.md)
+- [CLI reference](./usage/cli-reference.md)
+
+# Formats
+- [mitmproxy flow dumps](./formats/mitmproxy.md)
+- [HAR files](./formats/har.md)
+
+# Reference
+- [Performance & benchmarks](./reference/benchmarks.md)
+- [Security model](./reference/security.md)
+- [Diagnostics](./reference/diagnostics.md)
+
+---
+
+[Changelog](./changelog.md)
+[Contributing](./contributing.md)
diff --git a/book/src/changelog.md b/book/src/changelog.md
new file mode 100644
index 0000000..9b9ae8b
--- /dev/null
+++ b/book/src/changelog.md
@@ -0,0 +1,7 @@
+# Changelog
+
+{{#include ../../CHANGELOG.md}}
+
+[Unreleased]: https://github.com/Arkptz/mitm2openapi/commits/main
+[skip ci]: #changelog
+[0.1.0]: https://github.com/Arkptz/mitm2openapi/releases/tag/v0.1.0
diff --git a/book/src/contributing.md b/book/src/contributing.md
new file mode 100644
index 0000000..2913f2f
--- /dev/null
+++ b/book/src/contributing.md
@@ -0,0 +1,3 @@
+# Contributing
+
+{{#include ../../CONTRIBUTING.md}}
diff --git a/book/src/formats/har.md b/book/src/formats/har.md
new file mode 100644
index 0000000..8d2ddce
--- /dev/null
+++ b/book/src/formats/har.md
@@ -0,0 +1,77 @@
+# HAR files
+
+`mitm2openapi` reads [HAR (HTTP Archive)](https://w3c.github.io/web-performance/specs/HAR/Overview.html)
+files — the standard format for exporting browser network traffic. HAR version 1.2 is supported.
+
+## Producing HAR files
+
+### Browser DevTools
+
+All modern browsers export HAR from their Network tab:
+
+- **Chrome/Chromium**: DevTools → Network → right-click → "Save all as HAR with content"
+- **Firefox**: DevTools → Network → gear icon → "Save All As HAR"
+- **Safari**: Web Inspector → Network → Export button
+
+### HTTP proxies
+
+Several proxy tools export HAR:
+
+- [Charles Proxy](https://www.charlesproxy.com/) — File → Export Session → HAR
+- [Fiddler](https://www.telerik.com/fiddler) — File → Export Sessions → HTTPArchive
+- [Proxyman](https://proxyman.io/) — Export as HAR
+
+### Programmatic generation
+
+Libraries like [`puppeteer`](https://pptr.dev/) and [`playwright`](https://playwright.dev/)
+can produce HAR files from automated browser sessions:
+
+```javascript
+// Playwright example
+const context = await browser.newContext({
+ recordHar: { path: 'capture.har' }
+});
+// ... run your test
+await context.close(); // HAR is written on close
+```
+
+## Usage
+
+```bash
+mitm2openapi discover \
+ -i capture.har \
+ -o templates.yaml \
+ -p "https://api.example.com"
+```
+
+Format is auto-detected. Use `--format har` to force HAR parsing if auto-detection fails.
+
+## HAR vs mitmproxy flows
+
+| Aspect | mitmproxy flow | HAR |
+|--------|---------------|-----|
+| Source | mitmproxy proxy | Browser DevTools, HTTP proxies |
+| Format | Binary (tnetstring) | JSON |
+| Response bodies | Always present | Sometimes base64-encoded |
+| HTTPS | Decrypted by proxy | Decrypted by browser |
+| File size | Compact binary | Larger (JSON overhead) |
+| Streaming | Native | Incremental JSON parsing |
+
+Both formats produce equivalent OpenAPI specs. Choose based on your capture workflow:
+
+- **mitmproxy flows** for server-side proxying, CI pipelines, and automated captures
+- **HAR files** for browser-based testing, manual exploration, and when you already have DevTools open
+
+## Incremental parsing
+
+HAR files are parsed incrementally — the entire JSON is not loaded into memory at once.
+This means memory usage stays bounded even for large HAR exports (hundreds of megabytes).
+
+## Known limitations
+
+- **Base64-encoded bodies** — some HAR exporters base64-encode response bodies. Decode
+ failures are logged as warnings and the body is skipped (not silently dropped).
+- **Compressed content** — if the HAR exporter did not decompress response bodies,
+ `mitm2openapi` sees the compressed bytes. Most browser DevTools decompress automatically.
+- **Timing data** — HAR timing information (DNS, connect, TLS) is ignored; only request and
+ response data is used for spec generation.
diff --git a/book/src/formats/mitmproxy.md b/book/src/formats/mitmproxy.md
new file mode 100644
index 0000000..46a2b65
--- /dev/null
+++ b/book/src/formats/mitmproxy.md
@@ -0,0 +1,58 @@
+# mitmproxy flow dumps
+
+`mitm2openapi` reads mitmproxy's native binary flow format. This is the recommended input
+format — it captures the richest data and is produced directly by `mitmdump` and `mitmweb`.
+
+## Supported versions
+
+| Flow format version | mitmproxy version | Status |
+|---|---|---|
+| v19 | mitmproxy 8.x | Supported |
+| v20 | mitmproxy 9.x | Supported |
+| v21 | mitmproxy 10.x | Supported |
+
+The flow format is auto-detected from file content. No version flag is needed.
+
+## How flow files work
+
+Flow files use the [tnetstring](https://tnetstrings.info/) serialization format. Each flow
+is a sequence of key-value pairs representing a complete HTTP request-response cycle.
+
+A typical flow contains:
+
+- **Request**: method, URL (scheme, host, port, path), headers, body
+- **Response**: status code, headers, body
+- **Metadata**: timestamps, flow ID, client/server addresses
+
+`mitm2openapi` extracts the request and response data relevant to OpenAPI spec generation
+and discards metadata.
+
+## Capturing flow files
+
+```bash
+# Record all traffic through the proxy
+mitmdump -w capture.flow
+
+# Record only traffic to a specific host
+mitmdump -w capture.flow --set flow_detail=0 \
+ --set save_stream_filter='~d api.example.com'
+```
+
+See [capturing traffic](../getting-started/capturing.md) for full setup instructions.
+
+## Directory input
+
+If you pass a directory path to `-i`, `mitm2openapi` reads all `.flow` files in that
+directory (non-recursive). This is useful when you have traffic split across multiple
+capture sessions.
+
+## Known limitations
+
+- **No WebSocket frames** — WebSocket upgrade requests are captured, but frame-level data
+ is not used for spec generation
+- **No gRPC** — binary protocol buffers inside HTTP/2 frames are not decoded
+- **Corrupt files** — when the tnetstring parser hits corruption, it stops and reports the
+ byte offset. No resync is attempted because binary payloads can contain bytes that mimic
+ valid tnetstring length prefixes. See [diagnostics](../reference/diagnostics.md) for details.
+- **Large payloads** — individual tnetstring payloads are capped at 256 MiB by default
+ (adjustable via `--max-payload-size`)
diff --git a/book/src/getting-started/capturing.md b/book/src/getting-started/capturing.md
new file mode 100644
index 0000000..999ac39
--- /dev/null
+++ b/book/src/getting-started/capturing.md
@@ -0,0 +1,121 @@
+# Capturing traffic
+
+Before you can generate an OpenAPI spec, you need a captured traffic file. This chapter
+covers the most common ways to capture HTTP traffic.
+
+## Option 1: mitmproxy (recommended)
+
+[mitmproxy](https://mitmproxy.org/) is a free, open-source HTTPS proxy. It captures traffic
+in its own binary flow format that `mitm2openapi` reads natively.
+
+### Install mitmproxy
+
+```bash
+# macOS
+brew install mitmproxy
+
+# Linux (pip)
+pip install mitmproxy
+
+# Or download from https://mitmproxy.org/
+```
+
+See the [mitmproxy installation docs](https://docs.mitmproxy.org/stable/overview-installation/)
+for platform-specific instructions.
+
+### Capture with mitmdump
+
+`mitmdump` is the non-interactive version of mitmproxy, ideal for scripted captures:
+
+```bash
+# Start the proxy and write all traffic to a flow file
+mitmdump -w capture.flow
+
+# In another terminal, route your HTTP client through the proxy:
+curl --proxy http://localhost:8080 https://api.example.com/users
+```
+
+The default proxy port is 8080. Use `-p` to change it:
+
+```bash
+mitmdump -w capture.flow -p 9090
+```
+
+### Capture with mitmweb
+
+`mitmweb` provides a browser-based UI for inspecting traffic in real time:
+
+```bash
+mitmweb -w capture.flow
+# Open http://localhost:8081 in your browser to inspect traffic
+```
+
+### HTTPS traffic
+
+For HTTPS, you need to install the mitmproxy CA certificate on the client machine.
+After starting mitmproxy, navigate to `http://mitm.it` from the proxied client to
+download and install the certificate.
+
+See the [mitmproxy certificate docs](https://docs.mitmproxy.org/stable/concepts-certificates/)
+for detailed instructions.
+
+### Tips
+
+- Use `mitmdump --set flow_detail=0` for minimal console output during long captures
+- Combine with `--set save_stream_filter` to capture only specific hosts
+- The flow format is versioned (v19/v20/v21) — `mitm2openapi` supports all three
+
+## Option 2: Browser DevTools (HAR export)
+
+All modern browsers can export captured network traffic as HAR (HTTP Archive) files.
+
+### Chrome / Chromium
+
+1. Open DevTools (`F12` or `Ctrl+Shift+I`)
+2. Switch to the **Network** tab
+3. Ensure recording is active (red circle icon)
+4. Perform the actions you want to capture
+5. Right-click in the request list → **Save all as HAR with content**
+
+### Firefox
+
+1. Open DevTools (`F12`)
+2. Switch to the **Network** tab
+3. Perform the actions you want to capture
+4. Click the gear icon → **Save All As HAR**
+
+### Safari
+
+1. Enable the Develop menu in Preferences → Advanced
+2. Open Web Inspector (`Cmd+Option+I`)
+3. Switch to the **Network** tab
+4. Perform the actions
+5. Click **Export** in the toolbar
+
+```admonish note
+HAR files from browser DevTools contain the full request and response bodies. Sensitive data
+(cookies, tokens, passwords) will be present in the export. Sanitize before sharing.
+```
+
+## Option 3: Other HTTP proxies
+
+Any tool that produces HAR 1.2 output works with `mitm2openapi`:
+
+- [Charles Proxy](https://www.charlesproxy.com/) — export sessions as HAR via File → Export
+- [Fiddler](https://www.telerik.com/fiddler) — File → Export Sessions → HTTPArchive
+- [Proxyman](https://proxyman.io/) — export as HAR from the session menu
+
+## What to capture
+
+For the best OpenAPI spec, capture diverse traffic:
+
+- **Multiple endpoints** — the more paths covered, the more complete the spec
+- **Different HTTP methods** — GET, POST, PUT, DELETE on the same resource
+- **Various response codes** — 200, 400, 404, 500 responses produce richer schemas
+- **Query parameters** — include requests with different query strings
+- **Request bodies** — POST/PUT with different payloads improve body schema inference
+
+## Next steps
+
+Once you have a capture file, proceed to the [quick start](./quick-start.md) or
+learn about the full [discover → curate → generate pipeline](../usage/pipeline.md).
diff --git a/book/src/getting-started/installation.md b/book/src/getting-started/installation.md
new file mode 100644
index 0000000..fc8a69c
--- /dev/null
+++ b/book/src/getting-started/installation.md
@@ -0,0 +1,42 @@
+# Installation
+
+## From binary releases
+
+Download a pre-built binary for your platform from
+[GitHub Releases](https://github.com/Arkptz/mitm2openapi/releases).
+
+Binaries are available for Linux (x86_64, aarch64), macOS (x86_64, aarch64), and
+Windows (x86_64).
+
+```bash
+# Example: Linux x86_64 — replace v0.5.1 with the release tag you want
+curl -L "https://github.com/Arkptz/mitm2openapi/releases/download/v0.5.1/mitm2openapi-v0.5.1-x86_64-unknown-linux-gnu.tar.gz" \
+ | tar xz
+sudo mv mitm2openapi /usr/local/bin/
+```
+
+## From source (via Cargo)
+
+If you have a Rust toolchain installed:
+
+```bash
+cargo install --git https://github.com/Arkptz/mitm2openapi
+```
+
+Or from [crates.io](https://crates.io/crates/mitm2openapi):
+
+```bash
+cargo install mitm2openapi
+```
+
+## Verify installation
+
+```bash
+mitm2openapi --version
+```
+
+## Shell completions
+
+`mitm2openapi` uses [clap](https://docs.rs/clap) for argument parsing. Shell completions
+are not yet bundled, but you can generate them for most shells via `clap_complete` if building
+from source.
diff --git a/book/src/getting-started/quick-start.md b/book/src/getting-started/quick-start.md
new file mode 100644
index 0000000..1f0db12
--- /dev/null
+++ b/book/src/getting-started/quick-start.md
@@ -0,0 +1,98 @@
+# Quick start
+
+This walkthrough takes you from a traffic capture to a complete OpenAPI spec in under a minute.
+
+## Prerequisites
+
+- `mitm2openapi` installed ([see installation](./installation.md))
+- A captured traffic file — either a mitmproxy `.flow` dump or a `.har` export from browser DevTools
+
+If you do not have a capture yet, see [capturing traffic](./capturing.md) for setup instructions.
+
+## Step 1: Discover endpoints
+
+```bash
+mitm2openapi discover \
+ -i capture.flow \
+ -o templates.yaml \
+ -p "https://api.example.com"
+```
+
+This scans every request in `capture.flow` that matches the prefix `https://api.example.com`
+and writes a templates file listing all observed URL paths.
+
+## Step 2: Curate the templates
+
+Open `templates.yaml`. Each path is prefixed with `ignore:` by default:
+
+```yaml
+x-path-templates:
+- ignore:/api/users
+- ignore:/api/users/{id}
+- ignore:/api/products
+- ignore:/static/bundle.js
+```
+
+Remove the `ignore:` prefix from paths you want in the final spec:
+
+```yaml
+x-path-templates:
+- /api/users
+- /api/users/{id}
+- /api/products
+- ignore:/static/bundle.js
+```
+
+Paths still prefixed with `ignore:` are excluded from the generated spec.
+
+## Step 3: Generate the OpenAPI spec
+
+```bash
+mitm2openapi generate \
+ -i capture.flow \
+ -t templates.yaml \
+ -o openapi.yaml \
+ -p "https://api.example.com"
+```
+
+The resulting `openapi.yaml` contains a valid OpenAPI 3.0 spec with paths, methods,
+parameters, request bodies, and response schemas inferred from the captured traffic.
+
+## Skip the manual edit
+
+If you already know which paths matter, use glob filters to automate curation:
+
+```bash
+mitm2openapi discover \
+ -i capture.flow \
+ -o templates.yaml \
+ -p "https://api.example.com" \
+ --exclude-patterns '/static/**,/images/**,*.css,*.js,*.svg' \
+ --include-patterns '/api/**,/v2/**'
+
+mitm2openapi generate \
+ -i capture.flow \
+ -t templates.yaml \
+ -o openapi.yaml \
+ -p "https://api.example.com"
+```
+
+Paths matching `--include-patterns` are auto-activated (no `ignore:` prefix). Paths matching
+`--exclude-patterns` are dropped entirely. Everything else still gets `ignore:` for manual
+review.
+
+See [filtering endpoints](../usage/filtering.md) for the full glob syntax reference.
+
+## HAR files
+
+The same workflow works with HAR files — just point `-i` at a `.har` file. The format is
+auto-detected:
+
+```bash
+mitm2openapi discover \
+ -i capture.har \
+ -o templates.yaml \
+ -p "https://api.example.com"
+```
+
+See [HAR files](../formats/har.md) for details on exporting HARs from browser DevTools.
diff --git a/book/src/introduction.md b/book/src/introduction.md
new file mode 100644
index 0000000..eb62e18
--- /dev/null
+++ b/book/src/introduction.md
@@ -0,0 +1,51 @@
+# Introduction
+
+**mitm2openapi** converts [mitmproxy](https://mitmproxy.org/) flow dumps and HAR files into
+[OpenAPI 3.0](https://spec.openapis.org/oas/v3.0.3) specifications. It ships as a single
+static binary — no Python, no virtual environment, no runtime dependencies.
+
+It is a Rust rewrite of [mitmproxy2swagger](https://github.com/alufers/mitmproxy2swagger) by
+[@alufers](https://github.com/alufers), who pioneered the "capture traffic, extract API spec"
+workflow. Credit to the original project for the idea and reference implementation.
+
+## Why?
+
+The Python original works well but requires Python, `pip`, and `mitmproxy` installed in the
+environment. For CI pipelines, slim Docker images, security audits, and one-off usage, that
+dependency chain is friction.
+
+`mitm2openapi` ships as a single ~5 MB static binary. Drop it into any environment and run.
+Same OpenAPI 3.0 output, plus first-class HAR support and glob-based filters for fully
+unattended pipelines.
+
+## Features
+
+- **Fast** — pure Rust, ~17× faster than the Python original ([benchmarks](./reference/benchmarks.md))
+- **Single static binary** — no Python, no venv, no pip, no runtime dependencies
+- **Two-format support** — mitmproxy flow dumps (v19/v20/v21) and HAR 1.2
+- **Two-step workflow** — `discover` finds endpoints, you curate, `generate` emits OpenAPI 3.0
+- **Glob filters** — `--exclude-patterns` and `--include-patterns` for automated pipelines
+- **Error recovery** — skips corrupt flows, continues processing
+- **Auto-detection** — heuristic format detection from file content
+- **Resource limits** — configurable caps prevent denial-of-service on untrusted input
+- **Strict mode** — treat warnings as errors for CI gates
+- **Structured reports** — `--report` outputs machine-readable JSON processing summaries
+- **Battle-tested** — integration tests against Swagger Petstore and OWASP crAPI
+- **Cross-platform** — Linux, macOS, Windows pre-built binaries
+
+## How it works
+
+The tool uses a three-step workflow:
+
+1. **Discover** — scan captured traffic and list all observed API endpoints
+2. **Curate** — review the list and select which endpoints to include
+3. **Generate** — produce a clean OpenAPI 3.0 spec from the selected endpoints
+
+This separates endpoint selection from spec generation, giving you full control over
+what ends up in the final spec.
+
+## Next steps
+
+- [Install mitm2openapi](./getting-started/installation.md)
+- [Run through the quick start](./getting-started/quick-start.md)
+- [Learn about the full pipeline](./usage/pipeline.md)
diff --git a/book/src/reference/benchmarks.md b/book/src/reference/benchmarks.md
new file mode 100644
index 0000000..427d527
--- /dev/null
+++ b/book/src/reference/benchmarks.md
@@ -0,0 +1,7 @@
+# Performance & Benchmarks
+
+Results are regenerated weekly by the [benchmark workflow](https://github.com/Arkptz/mitm2openapi/blob/main/.github/workflows/bench.yml). See the workflow for the reproducible methodology.
+
+{{#include ../../../docs/benchmarks.md}}
+
+[s]: #timing
diff --git a/book/src/reference/diagnostics.md b/book/src/reference/diagnostics.md
new file mode 100644
index 0000000..753f3c6
--- /dev/null
+++ b/book/src/reference/diagnostics.md
@@ -0,0 +1,129 @@
+# Diagnostics
+
+
+
+`mitm2openapi` uses structured logging to report issues during processing. This chapter
+covers how to interpret warnings, errors, and the structured report output.
+
+## Log levels
+
+Control verbosity with the `RUST_LOG` environment variable:
+
+```bash
+# Default: warnings only
+mitm2openapi discover -i capture.flow -o templates.yaml -p "https://api.example.com"
+
+# More detail
+RUST_LOG=info mitm2openapi discover -i capture.flow -o templates.yaml -p "https://api.example.com"
+
+# Full debug output
+RUST_LOG=debug mitm2openapi discover -i capture.flow -o templates.yaml -p "https://api.example.com"
+```
+
+## Common warnings
+
+### Parse errors (tnetstring)
+
+```
+WARN TNetString parse error at byte 98304: unexpected end of input (148 flows parsed successfully)
+```
+
+This means the mitmproxy flow file contains corrupt data starting at byte 98,304. The
+parser halts immediately and the remaining bytes in the file are **not** processed. The
+148 flows parsed before the corruption are still emitted.
+
+**No resync is attempted.** Binary payloads can contain bytes that mimic valid tnetstring
+length prefixes, so scanning forward would produce phantom flows with fabricated data.
+
+**What to do:**
+- If the file was truncated during transfer, re-capture or re-download
+- The 148 successfully parsed flows are still usable
+- Use `--report` to capture the exact byte offset for debugging
+
+### Cap-fired events
+
+```
+WARN body size 68157440 exceeds cap 67108864, truncating
+WARN header name exceeds 8192 bytes, dropping
+WARN form field count 1247 exceeds cap 1000, ignoring excess
+```
+
+These indicate that a specific field in a flow exceeded the built-in or configured limit.
+The affected field is truncated or dropped, but processing continues.
+
+**What to do:**
+- Usually safe to ignore — the caps exist to prevent abuse, not normal traffic
+- If you need the full data, increase the relevant `--max-*` flag
+- Use `--strict` to fail on these if you need guaranteed completeness
+
+### Flow rejection events
+
+```
+WARN skipping flow: scheme "javascript" not in whitelist [http, https]
+WARN skipping flow: invalid UTF-8 in host field
+WARN skipping flow: port 0 out of valid range 1-65535
+```
+
+These mean an entire flow was skipped because it failed validation.
+
+**What to do:**
+- Non-HTTP flows (WebSocket upgrades, CONNECT tunnels) are expected to be skipped
+- UTF-8 errors suggest the capture contains binary protocol data, not HTTP traffic
+- Invalid port/status usually indicates corrupt flow data
+
+## Structured reports
+
+For machine-readable diagnostics, use `--report`:
+
+```bash
+mitm2openapi discover \
+ -i capture.flow \
+ -o templates.yaml \
+ -p "https://api.example.com" \
+ --report report.json
+```
+
+See [processing reports](../usage/reports.md) for the full JSON schema.
+
+### Event categories in reports
+
+| Category | Examples |
+|----------|---------|
+| `parse_error` | Tnetstring corruption, HAR JSON syntax errors |
+| `cap_fired` | Body too large, depth exceeded, form field count exceeded |
+| `rejected` | Invalid scheme, non-UTF-8 identity fields, bad port/status |
+
+### Using reports in CI
+
+```bash
+# Fail if any parse errors occurred
+if jq -e '.events.parse_error | length > 0' report.json > /dev/null 2>&1; then
+ echo "Parse errors detected"
+ exit 1
+fi
+
+# Check flows-read vs flows-emitted ratio
+RATIO=$(jq '.result.flows_emitted / .result.flows_read' report.json)
+if (( $(echo "$RATIO < 0.9" | bc -l) )); then
+ echo "Warning: more than 10% of flows were dropped"
+fi
+```
+
+## Strict mode interaction
+
+With `--strict`, any warning-level event causes exit code 2. This converts the
+"informational" diagnostics above into hard failures:
+
+```bash
+mitm2openapi discover \
+ -i capture.flow \
+ -o templates.yaml \
+ -p "https://api.example.com" \
+ --strict \
+ --report report.json
+
+# Exit code 2 if ANY warning was emitted
+# report.json still written for post-mortem
+```
+
+See [strict mode](../usage/strict-mode.md) for details.
diff --git a/book/src/reference/security.md b/book/src/reference/security.md
new file mode 100644
index 0000000..7417ef4
--- /dev/null
+++ b/book/src/reference/security.md
@@ -0,0 +1,96 @@
+# Security model
+
+
+
+`mitm2openapi` processes untrusted binary input (traffic captures from unknown sources).
+The security model is designed to prevent denial-of-service, data corruption, and
+information leakage when handling adversarial input.
+
+## Threat model
+
+The primary threat is a **malicious capture file** — a `.flow` or `.har` file crafted to
+exploit the parser. Scenarios include:
+
+- CI pipelines processing captures from untrusted contributors
+- Shared analysis servers where multiple users submit captures
+- Automated pipelines where the capture source is not fully controlled
+
+## Input validation layers
+
+### File-level checks
+
+Before reading any content:
+
+1. **File type** — only regular files are accepted. Symlinks, FIFOs, device files, and
+ directories are rejected unless `--allow-symlinks` is explicitly set.
+2. **File size** — files exceeding `--max-input-size` (default 2 GiB) are rejected before
+ any bytes are read.
+3. **TOCTOU caveat** — file metadata is checked via the path before reading to reject
+ symlinks, non-regular files, and oversized inputs. There is a small TOCTOU window
+ between the metadata check and the file open; mitigation via fd-based recheck after
+ open is a future enhancement.
+
+### Parser-level caps
+
+During parsing:
+
+| Cap | Default | Purpose |
+|-----|---------|---------|
+| Payload size | 256 MiB | Prevents OOM from oversized tnetstring values |
+| Nesting depth | 256 | Prevents stack overflow from deeply nested structures |
+| JSON depth | 64 | Prevents stack overflow in schema inference |
+| Body size | 64 MiB | Limits memory for individual request/response bodies |
+
+These caps trigger `warn`-level events and skip the affected data. Use `--strict` to
+treat them as hard errors.
+
+### Field-level validation
+
+For every flow:
+
+- **Scheme whitelist** — only `http` and `https` are accepted. Other schemes (e.g.,
+ `javascript:`, `data:`) are silently skipped.
+- **UTF-8 strictness** — identity fields (method, scheme, host, path, header names) must be
+ valid UTF-8. Invalid bytes cause the flow to be skipped, preventing data aliasing through
+ replacement-character collisions.
+- **Port range** — port numbers must be 1–65,535. Out-of-range values drop the request.
+- **Status code range** — HTTP status codes must be 100–599.
+- **Control character stripping** — `0x00`–`0x1F` and `0x7F` in URL paths are removed.
+- **Header caps** — header names over 8 KiB are dropped; values over 64 KiB are truncated.
+- **Form field count** — at most 1,000 form fields per request are processed.
+
+### Output safety
+
+- **Atomic writes** — output files are written via a temporary file and renamed. If the write
+ fails (disk full, permission denied), the target path is left untouched.
+- **No resync on corruption** — when the tnetstring parser encounters corrupt data, it halts
+ immediately. It does not scan forward looking for the next valid frame, because binary
+ payloads can contain bytes that look like valid length prefixes.
+
+## Streaming architecture
+
+Both mitmproxy and HAR inputs are processed incrementally. At no point is the entire capture
+loaded into memory. This bounds peak RSS to the size of the largest single flow, regardless
+of total file size.
+
+## Glob pattern safety
+
+The `--exclude-patterns` and `--include-patterns` flags use the
+[globset](https://docs.rs/globset) crate, which compiles patterns into a DFA. This eliminates
+exponential backtracking that was possible with the original recursive glob matcher.
+
+## Recommendations
+
+For processing untrusted captures:
+
+1. Do not use `--allow-symlinks` unless you control the filesystem
+2. Keep `--max-input-size` at the default (2 GiB) or lower
+3. Run with `--strict` to fail fast on any anomaly
+4. Use `--report` to capture processing diagnostics for audit trails
+5. Run in a sandboxed environment (container, VM) when processing captures from unknown sources
+
+## Related
+
+- [Resource limits](../usage/resource-limits.md) — configuring the caps
+- [Strict mode](../usage/strict-mode.md) — CI enforcement
+- [Diagnostics](./diagnostics.md) — interpreting warnings and errors
diff --git a/book/src/usage/cli-reference.md b/book/src/usage/cli-reference.md
new file mode 100644
index 0000000..5b1ca06
--- /dev/null
+++ b/book/src/usage/cli-reference.md
@@ -0,0 +1,120 @@
+# CLI reference
+
+
+
+```admonish warning
+This reference was last synced with `mitm2openapi --help` at version 0.5.1.
+If you notice a flag missing from your local `--help` output, the tool may be ahead of these
+docs. [Open an issue](https://github.com/Arkptz/mitm2openapi/issues/new) to prompt an update.
+```
+
+## `mitm2openapi discover`
+
+Scan captured traffic and produce a templates file listing all observed endpoints.
+
+```
+mitm2openapi discover [OPTIONS] -i <INPUT> -o <OUTPUT> -p <PREFIX>
+```
+
+### Required arguments
+
+| Option | Description |
+|--------|-------------|
+| `-i, --input <FILE>` | Input file (flow dump or HAR) |
+| `-o, --output <FILE>` | Output YAML templates file |
+| `-p, --prefix <URL>` | API prefix URL to filter requests |
+
+### Optional arguments
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `--format <FORMAT>` | `auto` | Input format: `auto`, `har`, `mitmproxy` |
+| `--exclude-patterns <GLOBS>` | | Comma-separated globs; matching paths are dropped entirely |
+| `--include-patterns <GLOBS>` | | Comma-separated globs; matching paths are auto-activated |
+| `--max-input-size <SIZE>` | `2GiB` | Maximum input file size. Accepts `KiB`, `MiB`, `GiB` suffixes |
+| `--allow-symlinks` | off | Allow symlinked input files |
+| `--strict` | off | Treat warnings as errors (exit code 2) |
+| `--report <PATH>` | | Write structured JSON processing report |
+
+## `mitm2openapi generate`
+
+Generate an OpenAPI 3.0 spec from captured traffic using a curated templates file.
+
+```
+mitm2openapi generate [OPTIONS] -i <INPUT> -t <TEMPLATES> -o <OUTPUT> -p <PREFIX>
+```
+
+### Required arguments
+
+| Option | Description |
+|--------|-------------|
+| `-i, --input <FILE>` | Input file (flow dump or HAR) |
+| `-t, --templates <FILE>` | Templates YAML file (from `discover`) |
+| `-o, --output <FILE>` | Output OpenAPI YAML file |
+| `-p, --prefix <URL>` | API prefix URL |
+
+### Optional arguments
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `--format <FORMAT>` | `auto` | Input format: `auto`, `har`, `mitmproxy` |
+| `--openapi-title <TITLE>` | | Custom title for the spec |
+| `--openapi-version <VERSION>` | `1.0.0` | Custom spec version |
+| `--exclude-headers <NAMES>` | | Comma-separated headers to exclude from spec |
+| `--exclude-cookies <NAMES>` | | Comma-separated cookies to exclude from spec |
+| `--include-headers` | off | Include request headers in the spec |
+| `--ignore-images` | off | Ignore image content types |
+| `--suppress-params` | off | Suppress parameter suggestions |
+| `--tags-overrides <JSON>` | | JSON string for tag overrides |
+| `--max-input-size <SIZE>` | `2GiB` | Maximum input file size |
+| `--max-payload-size <SIZE>` | `256MiB` | Maximum tnetstring payload size |
+| `--max-depth <N>` | `256` | Maximum tnetstring nesting depth |
+| `--max-body-size <SIZE>` | `64MiB` | Maximum request/response body size |
+| `--allow-symlinks` | off | Allow symlinked input files |
+| `--strict` | off | Treat warnings as errors (exit code 2) |
+| `--report <PATH>` | | Write structured JSON processing report |
+
+## Common flag details
+
+### `--format`
+
+By default, the input format is auto-detected from a combination of file extension and
+content sniffing:
+- `.flow` extension or content starting with a tnetstring length prefix → mitmproxy format
+- `.har` extension or content starting with `{` → HAR format
+
+Use `--format mitmproxy` or `--format har` to override auto-detection.
+
+### `--prefix`
+
+The prefix URL filters which requests are processed. Only requests whose URL starts with
+the prefix are included. The prefix is stripped from paths in the generated spec.
+
+Example: with `--prefix https://api.example.com`, a request to
+`https://api.example.com/users/42` produces path `/users/42` in the spec.
+
+### `--strict`
+
+See [strict mode](./strict-mode.md) for details on exit codes and CI usage.
+
+### `--report`
+
+See [processing reports](./reports.md) for the JSON schema and CI integration examples.
+
+## Exit codes
+
+| Code | Meaning |
+|------|---------|
+| 0 | Success |
+| 1 | Fatal error (I/O failure, missing arguments, invalid input) |
+| 2 | Strict mode violation (warnings with `--strict` enabled) |
+
+## Environment variables
+
+| Variable | Description |
+|----------|-------------|
+| `RUST_LOG` | Controls log verbosity. Default: `warn`. Set to `info` or `debug` for more output. |
+
+```bash
+RUST_LOG=info mitm2openapi discover -i capture.flow -o templates.yaml -p "https://api.example.com"
+```
diff --git a/book/src/usage/filtering.md b/book/src/usage/filtering.md
new file mode 100644
index 0000000..266b76e
--- /dev/null
+++ b/book/src/usage/filtering.md
@@ -0,0 +1,96 @@
+# Filtering endpoints
+
+
+
+The `discover` command supports glob-based filters to automate endpoint curation.
+This is useful for CI pipelines or large captures where manual editing is impractical.
+
+## Glob syntax
+
+Filters use git-style glob patterns (powered by the [`globset`](https://docs.rs/globset) crate):
+
+| Pattern | Matches | Does not match |
+|---------|---------|----------------|
+| `*` | Single path segment | Segments with `/` |
+| `**` | Any number of path segments | (matches everything) |
+| `?` | Any single character | |
+| `[abc]` | Character class | |
+| `{a,b}` | Alternation | |
+
+## `--exclude-patterns`
+
+Paths matching any exclude glob are **dropped entirely** — they do not appear in the
+templates file at all.
+
+```bash
+mitm2openapi discover \
+ -i capture.flow \
+ -o templates.yaml \
+ -p "https://api.example.com" \
+ --exclude-patterns '/static/**,/images/**,*.css,*.js,*.svg,*.png'
+```
+
+Multiple patterns are comma-separated. A path is excluded if it matches **any** pattern.
+
+## `--include-patterns`
+
+Paths matching any include glob are emitted **without the `ignore:` prefix** — they are
+auto-activated for the `generate` step.
+
+```bash
+mitm2openapi discover \
+ -i capture.flow \
+ -o templates.yaml \
+ -p "https://api.example.com" \
+ --include-patterns '/api/**,/v2/**'
+```
+
+## Combining filters
+
+When both are specified:
+
+1. **Exclude runs first** — matching paths are dropped entirely
+2. **Include runs second** — matching paths among the survivors are auto-activated
+3. **Everything else** gets the `ignore:` prefix for manual review
+
+```bash
+mitm2openapi discover \
+ -i capture.flow \
+ -o templates.yaml \
+ -p "https://api.example.com" \
+ --exclude-patterns '/static/**,*.css,*.js' \
+ --include-patterns '/api/**'
+```
+
+Result:
+- `/static/bundle.js` — excluded (dropped)
+- `/api/users` — included (auto-activated)
+- `/dashboard` — neither matched (gets `ignore:` prefix)
+
+## Examples
+
+### API-only spec
+
+```bash
+--include-patterns '/api/**' \
+--exclude-patterns '/api/internal/**,/api/debug/**'
+```
+
+### Strip static assets
+
+```bash
+--exclude-patterns '/static/**,/assets/**,*.css,*.js,*.svg,*.png,*.jpg,*.gif,*.ico,*.woff,*.woff2'
+```
+
+### Multiple API versions
+
+```bash
+--include-patterns '/v1/**,/v2/**,/v3/**'
+```
+
+## Pattern tips
+
+- Patterns match against the **URL path only** (after the prefix is stripped), not the full URL
+- Leading `/` is recommended for clarity but not required
+- Patterns are case-sensitive
+- Use `**` sparingly — it matches everything, including deeply nested paths
diff --git a/book/src/usage/pipeline.md b/book/src/usage/pipeline.md
new file mode 100644
index 0000000..a50b0bd
--- /dev/null
+++ b/book/src/usage/pipeline.md
@@ -0,0 +1,200 @@
+# Discover, curate, generate
+
+
+
+`mitm2openapi` uses a three-step pipeline to convert captured HTTP traffic into an OpenAPI
+specification. This chapter explains each step in detail.
+
+## Overview
+
+```mermaid
+graph LR
+ A[Traffic capture] --> B[discover]
+ B --> C[Templates file]
+ C --> D[Curate]
+ D --> E[generate]
+ E --> F[OpenAPI 3.0 spec]
+```
+
+The pipeline separates **endpoint discovery** from **spec generation**, giving you an explicit
+curation step where you choose which endpoints appear in the final spec.
+
+## Step 1: Discover
+
+The `discover` command scans a traffic capture and extracts all unique URL paths that match
+a given prefix.
+
+```bash
+mitm2openapi discover \
+ -i capture.flow \
+ -o templates.yaml \
+ -p "https://api.example.com"
+```
+
+### What happens internally
+
+1. The input file is read incrementally (streaming — memory usage stays bounded)
+2. Each request's URL is checked against the `--prefix` filter
+3. Matching paths are collected and deduplicated
+4. Path segments that look like IDs (UUIDs, numeric strings) are replaced with
+ `{id}` placeholders (or `{id1}`, `{id2}`, ... when a path has multiple parameters)
+5. The result is written to the templates file
+
+### Templates file format
+
+The output is a YAML file with path templates under an `x-path-templates` key:
+
+```yaml
+x-path-templates:
+- ignore:/api/users
+- ignore:/api/users/{id}
+- ignore:/api/products
+- ignore:/api/products/{id}/reviews
+- ignore:/static/bundle.js
+```
+
+Every path is prefixed with `ignore:` by default. This is intentional — it forces you to
+explicitly opt in to each endpoint.
+
+### Automatic parameterization
+
+The discover step detects path segments that vary across requests and replaces them with
+named parameters:
+
+| Observed paths | Template |
+|---|---|
+| `/api/users/42`, `/api/users/99` | `/api/users/{id}` |
+| `/api/orders/abc-def-123` | `/api/orders/{id}` |
+
+UUID-like and numeric segments are detected automatically. More complex patterns require
+manual editing of the templates file.
+
+## Step 2: Curate
+
+Open the templates file in any text editor. For each path:
+
+- **Remove `ignore:`** to include the endpoint in the generated spec
+- **Leave `ignore:`** to exclude it
+- **Delete the line** to exclude it permanently
+
+```yaml
+# Before curation
+x-path-templates:
+- ignore:/api/users
+- ignore:/api/users/{id}
+- ignore:/static/bundle.js
+
+# After curation
+x-path-templates:
+- /api/users
+- /api/users/{id}
+- ignore:/static/bundle.js
+```
+
+You can also edit parameter names. The default `{id}` placeholder can be renamed to
+something more descriptive like `{userId}`:
+
+```yaml
+- /api/users/{userId}
+```
+
+### Automating curation with glob filters
+
+For CI pipelines or large captures, manual curation is impractical. Use `--include-patterns`
+and `--exclude-patterns` during the `discover` step instead:
+
+```bash
+mitm2openapi discover \
+ -i capture.flow \
+ -o templates.yaml \
+ -p "https://api.example.com" \
+ --include-patterns '/api/**' \
+ --exclude-patterns '/static/**,*.css,*.js'
+```
+
+Paths matching `--include-patterns` are emitted without the `ignore:` prefix (auto-activated).
+Paths matching `--exclude-patterns` are dropped entirely. Everything else gets `ignore:` for
+manual review.
+
+See [filtering endpoints](./filtering.md) for the full glob syntax.
+
+## Step 3: Generate
+
+The `generate` command re-reads the traffic capture and produces an OpenAPI spec using the
+curated templates as a guide:
+
+```bash
+mitm2openapi generate \
+ -i capture.flow \
+ -t templates.yaml \
+ -o openapi.yaml \
+ -p "https://api.example.com"
+```
+
+### What happens internally
+
+1. The templates file is loaded and the `ignore:` entries are filtered out
+2. Each template path is compiled into a regex for matching
+3. The traffic capture is streamed again, matching each request against the templates
+4. For each matched request:
+ - Path parameters are extracted
+ - Query parameters are collected
+ - Request body schema is inferred (JSON, form data)
+ - Response status code and body schema are recorded
+5. When multiple requests match the same template, their schemas are merged:
+ - Different status codes (200, 400, 404) produce separate response entries
+ - Request body is taken from the first observation; subsequent same-endpoint
+ observations only contribute response schemas
+6. The final OpenAPI 3.0 document is written as YAML
+
+### Customizing output
+
+The `generate` command accepts several options to tune the output:
+
+```bash
+mitm2openapi generate \
+ -i capture.flow \
+ -t templates.yaml \
+ -o openapi.yaml \
+ -p "https://api.example.com" \
+ --openapi-title "My API" \
+ --openapi-version "2.0.0" \
+ --exclude-headers "authorization,cookie" \
+ --ignore-images
+```
+
+See the [CLI reference](./cli-reference.md) for all available options.
+
+## Worked example
+
+Starting from a mitmproxy capture of a pet store API:
+
+```bash
+# Discover all endpoints under the API prefix
+mitm2openapi discover \
+ -i petstore.flow \
+ -o templates.yaml \
+ -p "http://petstore:8080" \
+ --exclude-patterns '/static/**' \
+ --include-patterns '/api/**'
+
+# Templates file now has API paths auto-activated:
+# - /api/v3/pet
+# - /api/v3/pet/{id}
+# - /api/v3/pet/findByStatus
+# - /api/v3/store/inventory
+# - ignore:/static/swagger-ui.css
+
+# Generate the spec
+mitm2openapi generate \
+ -i petstore.flow \
+ -t templates.yaml \
+ -o openapi.yaml \
+ -p "http://petstore:8080"
+
+# Result: openapi.yaml with paths, methods, schemas
+```
+
+The generated `openapi.yaml` is a valid OpenAPI 3.0 document that can be opened in
+[Swagger UI](https://github.com/swagger-api/swagger-ui), imported into Postman, or used
+as a contract for API testing.
diff --git a/book/src/usage/reports.md b/book/src/usage/reports.md
new file mode 100644
index 0000000..15a3bab
--- /dev/null
+++ b/book/src/usage/reports.md
@@ -0,0 +1,98 @@
+# Processing reports
+
+Pass `--report <path>` to either `discover` or `generate` to write a JSON processing
+summary. This is useful for CI pipelines that need structured data instead of log scraping.
+
+## Usage
+
+```bash
+mitm2openapi discover \
+ -i capture.flow \
+ -o templates.yaml \
+ -p "https://api.example.com" \
+ --report report.json
+```
+
+## Report schema
+
+```json
+{
+ "report_version": 1,
+ "tool_version": "0.5.1",
+ "input": {
+ "path": "capture.flow",
+ "format": "Auto",
+ "size_bytes": 102400
+ },
+ "result": {
+ "flows_read": 150,
+ "flows_emitted": 148,
+ "paths_in_spec": 12
+ },
+ "events": {
+ "parse_error": {
+ "TNetString parse error at byte 98304: unexpected end of input": 1
+ }
+ }
+}
+```
+
+### Fields
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `report_version` | integer | Schema version (currently `1`) |
+| `tool_version` | string | `mitm2openapi` version that produced the report |
+| `input.path` | string | Input file path |
+| `input.format` | string | Detected or specified format (`Auto`, `Mitmproxy`, `Har`) |
+| `input.size_bytes` | integer | Input file size in bytes |
+| `result.flows_read` | integer | Total flows/entries parsed from input |
+| `result.flows_emitted` | integer | Flows that passed all filters and were processed |
+| `result.paths_in_spec` | integer | Unique paths in the output (for `generate`) |
+| `events` | object | Map of event categories to message counts |
+
+### Event categories
+
+| Category | Meaning | Status |
+|----------|---------|--------|
+| `parse_error` | Corrupt data encountered (tnetstring errors, malformed HAR entries) | Populated |
+| `cap_fired` | A resource limit was triggered (body too large, depth exceeded) | Reserved — not yet populated at runtime |
+| `rejected` | A flow was skipped (invalid UTF-8, unsupported scheme, bad port/status) | Reserved — not yet populated at runtime |
+
+The `cap_fired` and `rejected` categories are present in the report schema and will be
+connected to the reader pipelines in a future release. Currently, only `parse_error`
+events are counted.
+
+## CI integration
+
+Parse the report in CI to make decisions based on processing quality:
+
+```bash
+mitm2openapi generate \
+ -i capture.flow \
+ -t templates.yaml \
+ -o openapi.yaml \
+ -p "https://api.example.com" \
+ --report report.json
+
+# Check if any events occurred
+if jq -e '.events | length > 0' report.json > /dev/null 2>&1; then
+ echo "Warning: processing had events"
+ jq '.events' report.json
+fi
+```
+
+## Report with strict mode
+
+The report is written even when `--strict` causes a non-zero exit code. This lets you
+capture full diagnostics while still failing the CI job:
+
+```bash
+mitm2openapi discover \
+ -i capture.flow \
+ -o templates.yaml \
+ -p "https://api.example.com" \
+ --strict \
+ --report report.json \
+ || { jq '.' report.json; exit 1; }
+```
diff --git a/book/src/usage/resource-limits.md b/book/src/usage/resource-limits.md
new file mode 100644
index 0000000..e11985e
--- /dev/null
+++ b/book/src/usage/resource-limits.md
@@ -0,0 +1,91 @@
+# Resource limits
+
+<!-- toc -->
+
+To prevent denial-of-service when processing untrusted captures, `mitm2openapi` enforces
+several configurable and fixed limits.
+
+## Configurable limits
+
+These limits can be adjusted via CLI flags:
+
+| Flag | Default | Purpose |
+|------|---------|---------|
+| `--max-input-size` | 2 GiB | Reject files larger than this before reading |
+| `--max-payload-size` | 256 MiB | Cap on individual tnetstring payload allocation |
+| `--max-depth` | 256 | Recursion depth limit for nested tnetstring structures |
+| `--max-body-size` | 64 MiB | Maximum request/response body considered during schema inference |
+| `--allow-symlinks` | off | Permit symlinked input files (rejected by default) |
+
+### Adjusting limits
+
+Increase `--max-input-size` if you work with captures larger than 2 GiB:
+
+```bash
+mitm2openapi discover \
+ -i large-capture.flow \
+ -o templates.yaml \
+ -p "https://api.example.com" \
+ --max-input-size 8GiB
+```
+
+Size suffixes are supported: `KiB`, `MiB`, `GiB`.
+
+The other limits rarely need tuning. The defaults are designed to handle real-world
+captures while rejecting pathological inputs.
+
+### Symlink rejection
+
+By default, symlinked input files are rejected to prevent symlink-following attacks (a link
+pointing at files outside the intended tree) on shared CI runners. If you need to process a symlinked file:
+
+```bash
+mitm2openapi discover \
+ -i /path/to/symlinked-capture.flow \
+ -o templates.yaml \
+ -p "https://api.example.com" \
+ --allow-symlinks
+```
+
+## Fixed per-field limits
+
+These limits are applied unconditionally and cannot be changed via CLI flags:
+
+| Field | Cap | Behaviour when exceeded |
+|-------|-----|------------------------|
+| Header name | 8 KiB | Header dropped (other headers still processed) |
+| Header value | 64 KiB | Value truncated to cap |
+| Form fields per request | 1,000 | Excess fields ignored |
+| URL scheme | `http` / `https` only | Non-HTTP flows silently skipped |
+| Port number | 1 -- 65,535 | Out-of-range port drops the request |
+| HTTP status code | 100 -- 599 | Invalid codes treated as no response |
+
+## UTF-8 validation
+
+Identity fields (scheme, host, path, method, header names) require valid UTF-8. Flows
+with non-UTF-8 identity bytes are skipped to prevent data aliasing through
+replacement-character collisions.
+
+Control characters (`0x00`--`0x1F`, `0x7F`) in paths are stripped automatically.
+
+## Streaming and memory
+
+Both mitmproxy flow files and HAR files are processed incrementally. Memory usage stays
+bounded regardless of input size — there is no need to load the entire capture into memory.
+
+Peak RSS is proportional to the size of the **largest single flow** in the capture, not the
+total file size. For typical captures, expect 5--15 MB of memory usage.
+
+## When limits fire
+
+When a per-field limit is exceeded (header too large, body too large, form fields over cap),
+the affected field is skipped or truncated and processing continues with the remaining data.
+
+When a tnetstring parse error occurs, the iterator halts and the rest of the file is not
+processed — valid flows parsed before the error are still emitted. There is no resync
+because binary payloads can contain bytes that mimic valid length prefixes.
+
+In both cases a `warn`-level log message is emitted with details.
+
+Use [strict mode](./strict-mode.md) to treat these warnings as errors, or
+[processing reports](./reports.md) to capture them as structured data.
diff --git a/book/src/usage/strict-mode.md b/book/src/usage/strict-mode.md
new file mode 100644
index 0000000..475c3fd
--- /dev/null
+++ b/book/src/usage/strict-mode.md
@@ -0,0 +1,79 @@
+# Strict mode
+
+Pass `--strict` to either `discover` or `generate` to treat warning-level events as
+hard failures. The process exits with code **2** if the processing report records any
+counted events.
+
+Currently, the only event counter populated at runtime is `parse_error` — triggered when
+flows cannot be deserialized (corrupt tnetstring data, malformed HAR JSON). The
+`cap_fired` and `rejected` counters exist in the report schema but are not yet wired to
+the reader pipelines; they will be connected in a future release.
+
+In practice, `--strict` today catches:
+
+- Parse errors during flow deserialization (tnetstring or HAR)
+- Errors counted by the streaming iterator wrapper in `discover` mode
+
+## Usage
+
+```bash
+mitm2openapi discover \
+ -i capture.flow \
+ -o templates.yaml \
+ -p "https://api.example.com" \
+ --strict
+```
+
+```bash
+mitm2openapi generate \
+ -i capture.flow \
+ -t templates.yaml \
+ -o openapi.yaml \
+ -p "https://api.example.com" \
+ --strict
+```
+
+## CI usage pattern
+
+Strict mode is designed for CI gates where silent degradation is unacceptable:
+
+```bash
+mitm2openapi discover \
+ -i capture.flow \
+ -o templates.yaml \
+ -p "https://api.example.com" \
+ --strict \
+ || { echo "FAIL: corrupt or over-limit flows detected"; exit 1; }
+```
+
+## Without `--strict`
+
+Without the flag, parse errors are logged at `warn` level and processing continues with
+exit code 0. Affected flows are skipped, but the output file is still produced. Other
+warning-level events (cap fires, scheme rejections, etc.) are always logged but do not
+currently increment the report counters that `--strict` checks.
+
+## Exit codes
+
+| Code | Meaning |
+|------|---------|
+| 0 | Success (no warnings, or `--strict` not set) |
+| 1 | Fatal error (I/O failure, missing required arguments) |
+| 2 | Strict mode violation (warnings detected with `--strict`) |
+
+## Combining with reports
+
+For CI pipelines that need both strict enforcement and structured diagnostics:
+
+```bash
+mitm2openapi generate \
+ -i capture.flow \
+ -t templates.yaml \
+ -o openapi.yaml \
+ -p "https://api.example.com" \
+ --strict \
+ --report report.json
+```
+
+The [report](./reports.md) is written even when `--strict` causes a non-zero exit, capturing
+the full details of what went wrong.
diff --git a/docs/benchmarks.md b/docs/benchmarks.md
index 1a53a08..7830a65 100644
--- a/docs/benchmarks.md
+++ b/docs/benchmarks.md
@@ -1,8 +1,6 @@
-# Benchmarks
-
Generated by the [benchmark workflow](.github/workflows/bench.yml).
-# Benchmark results
+## Benchmark results
_Run: 2026-04-22 22:31 UTC, commit `22ef2faa`, runner: Linux 6.17.0-1011-azure_
diff --git a/src/har_reader.rs b/src/har_reader.rs
index 64d5f30..980209f 100644
--- a/src/har_reader.rs
+++ b/src/har_reader.rs
@@ -10,6 +10,9 @@ use crate::error::{Error, Result};
use crate::types::CapturedRequest;
use crate::MAX_BODY_SIZE;
+const MAX_HEADER_NAME_SIZE: usize = 8 * 1024;
+const MAX_HEADER_VALUE_SIZE: usize = 64 * 1024;
+
#[derive(Deserialize)]
struct StreamingHarEntry {
request: StreamingHarRequest,
@@ -112,27 +115,50 @@ impl HarFlowWrapper {
Some(Self {
url: entry.request.url,
method: entry.request.method,
- request_headers: entry
- .request
- .headers
- .into_iter()
- .map(|h| (h.name, h.value))
- .collect(),
+ request_headers: cap_headers(entry.request.headers),
request_body,
response_status,
response_reason: entry.response.status_text,
- response_headers: entry
- .response
- .headers
- .into_iter()
- .map(|h| (h.name, h.value))
- .collect(),
+ response_headers: cap_headers(entry.response.headers),
response_body,
response_content_type,
})
}
}
+fn cap_headers(headers: Vec) -> Vec<(String, String)> {
+ headers
+ .into_iter()
+ .filter_map(|h| {
+ if h.name.len() > MAX_HEADER_NAME_SIZE {
+ warn!(
+ event = "header_name_too_large",
+ size = h.name.len(),
+ max = MAX_HEADER_NAME_SIZE,
+ "dropping HAR header with oversized name"
+ );
+ return None;
+ }
+ let value = if h.value.len() > MAX_HEADER_VALUE_SIZE {
+ warn!(
+ event = "header_value_too_large",
+ size = h.value.len(),
+ max = MAX_HEADER_VALUE_SIZE,
+ name = %h.name,
+ "truncating oversized HAR header value"
+ );
+ // Walk back to a char boundary: `get(..cap)` returns None mid-codepoint, which silently kept the full value.
+ let mut end = MAX_HEADER_VALUE_SIZE;
+ while !h.value.is_char_boundary(end) { end -= 1; }
+ h.value[..end].to_string()
+ } else {
+ h.value
+ };
+ Some((h.name, value))
+ })
+ .collect()
+}
+
fn cap_body(body: Vec) -> Vec {
if body.len() > MAX_BODY_SIZE {
warn!(
@@ -432,6 +458,14 @@ pub fn stream_har_file(path: &Path) -> Result {
}
fn stream_har_dir(path: &Path) -> Result {
+ stream_har_dir_inner(path, false)
+}
+
+pub fn stream_har_dir_no_symlinks(path: &Path) -> Result {
+ stream_har_dir_inner(path, true)
+}
+
+fn stream_har_dir_inner(path: &Path, reject_symlinks: bool) -> Result {
let mut dir_entries: Vec<_> = std::fs::read_dir(path)?
.filter_map(|e| match e {
Ok(entry) => Some(entry),
@@ -449,6 +483,23 @@ fn stream_har_dir(path: &Path) -> Result {
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("har"))
})
+ .filter(|e| {
+ if reject_symlinks {
+ match e.path().symlink_metadata() {
+ Ok(meta) if meta.file_type().is_symlink() => {
+ warn!(
+ event = "symlink_rejected",
+ path = %e.path().display(),
+ "skipping symlinked HAR directory entry"
+ );
+ false
+ }
+ _ => true,
+ }
+ } else {
+ true
+ }
+ })
.collect();
dir_entries.sort_by_key(|e| e.path());
diff --git a/src/main.rs b/src/main.rs
index d4e80f1..a607332 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -274,16 +274,32 @@ fn stream_input(
max_input_size: u64,
allow_symlinks: bool,
) -> Result {
+ // Check symlink-ness before is_dir(), since is_dir() follows symlinks.
+ if !allow_symlinks {
+ if let Ok(meta) = path.symlink_metadata() {
+ if meta.file_type().is_symlink() {
+ return Err(mitm2openapi::error::Error::SymlinkRejected {
+ path: path.to_path_buf(),
+ }
+ .into());
+ }
+ }
+ }
if !path.is_dir() {
mitm2openapi::validate_input_path(path, max_input_size, allow_symlinks)
.context("input file validation failed")?;
}
+ let reject_symlinks = !allow_symlinks;
match format {
InputFormat::Mitmproxy => {
debug!(path = %path.display(), "Streaming as mitmproxy format");
if path.is_dir() {
- mitmproxy_reader::stream_mitmproxy_dir(path)
- .context("failed to stream mitmproxy directory")
+ if reject_symlinks {
+ mitmproxy_reader::stream_mitmproxy_dir_no_symlinks(path)
+ } else {
+ mitmproxy_reader::stream_mitmproxy_dir(path)
+ }
+ .context("failed to stream mitmproxy directory")
} else {
let iter = mitmproxy_reader::stream_mitmproxy_file(path)
.context("failed to stream mitmproxy file")?;
@@ -298,8 +314,16 @@ fn stream_input(
InputFormat::Auto => {
if path.is_dir() {
debug!(path = %path.display(), "Auto-detecting format for directory");
- let mitmproxy_result = mitmproxy_reader::stream_mitmproxy_dir(path);
- let har_result = har_reader::stream_har_file(path);
+ let mitmproxy_result = if reject_symlinks {
+ mitmproxy_reader::stream_mitmproxy_dir_no_symlinks(path)
+ } else {
+ mitmproxy_reader::stream_mitmproxy_dir(path)
+ };
+ let har_result = if reject_symlinks {
+ har_reader::stream_har_dir_no_symlinks(path)
+ } else {
+ har_reader::stream_har_file(path)
+ };
match (mitmproxy_result, har_result) {
(Ok(m_iter), Ok(h_iter)) => {
diff --git a/src/mitmproxy_reader.rs b/src/mitmproxy_reader.rs
index 7076e67..c0cc7d6 100644
--- a/src/mitmproxy_reader.rs
+++ b/src/mitmproxy_reader.rs
@@ -361,6 +361,14 @@ pub fn stream_mitmproxy_file(
}
pub fn stream_mitmproxy_dir(path: &Path) -> Result {
+ stream_mitmproxy_dir_inner(path, false)
+}
+
+pub fn stream_mitmproxy_dir_no_symlinks(path: &Path) -> Result {
+ stream_mitmproxy_dir_inner(path, true)
+}
+
+fn stream_mitmproxy_dir_inner(path: &Path, reject_symlinks: bool) -> Result {
let mut entries: Vec<_> = std::fs::read_dir(path)?
.filter_map(|e| match e {
Ok(entry) => Some(entry),
@@ -378,6 +386,23 @@ pub fn stream_mitmproxy_dir(path: &Path) -> Result {
.extension()
.is_some_and(|ext| ext.eq_ignore_ascii_case("flow"))
})
+ .filter(|e| {
+ if reject_symlinks {
+ match e.path().symlink_metadata() {
+ Ok(meta) if meta.file_type().is_symlink() => {
+ warn!(
+ event = "symlink_rejected",
+ path = %e.path().display(),
+ "skipping symlinked directory entry"
+ );
+ false
+ }
+ _ => true,
+ }
+ } else {
+ true
+ }
+ })
.collect();
entries.sort_by_key(|e| e.path());
diff --git a/tests/security.rs b/tests/security.rs
index 1d9248c..a897b19 100644
--- a/tests/security.rs
+++ b/tests/security.rs
@@ -79,3 +79,87 @@ fn normal_file_passes_validation() {
let result = mitm2openapi::validate_input_path(&path, mitm2openapi::MAX_INPUT_SIZE, false);
assert!(result.is_ok(), "normal file should pass: {result:?}");
}
+
+#[cfg(unix)]
+#[test]
+fn symlink_to_directory_rejected() {
+ use std::os::unix::fs as unix_fs;
+
+ let dir = TempDir::new().unwrap();
+ let real_dir = dir.path().join("real_dir");
+ std::fs::create_dir(&real_dir).unwrap();
+ std::fs::write(real_dir.join("test.flow"), b"1:X,").unwrap();
+
+ let link = dir.path().join("link_dir");
+ unix_fs::symlink(&real_dir, &link).unwrap();
+
+ assert!(link.is_dir(), "symlink should resolve to directory");
+
+ let err = mitm2openapi::validate_input_path(&link, mitm2openapi::MAX_INPUT_SIZE, false);
+ assert!(
+ matches!(err, Err(mitm2openapi::error::Error::SymlinkRejected { .. })),
+ "symlink to directory should be rejected, got {err:?}"
+ );
+}
+
+#[cfg(unix)]
+#[test]
+fn symlink_dir_entry_rejected_in_mitmproxy() {
+ use std::os::unix::fs as unix_fs;
+
+ let dir = TempDir::new().unwrap();
+ let src = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+ .join("testdata")
+ .join("flows")
+ .join("simple_get.flow");
+ let real_file = dir.path().join("real.flow");
+ std::fs::copy(&src, &real_file).unwrap();
+
+ let link_file = dir.path().join("linked.flow");
+ unix_fs::symlink(&real_file, &link_file).unwrap();
+
+ let iter = mitm2openapi::mitmproxy_reader::stream_mitmproxy_dir_no_symlinks(dir.path());
+ assert!(iter.is_ok(), "should open directory");
+ let results: Vec<_> = iter.unwrap().filter_map(|r| r.ok()).collect();
+
+ assert!(
+ !results.is_empty(),
+ "real file should produce at least one flow"
+ );
+
+ let all_results: Vec<_> = mitm2openapi::mitmproxy_reader::stream_mitmproxy_dir(dir.path())
+ .unwrap()
+ .filter_map(|r| r.ok())
+ .collect();
+ assert!(
+ all_results.len() > results.len(),
+ "without symlink rejection, both files should be processed"
+ );
+}
+
+#[cfg(unix)]
+#[test]
+fn symlink_dir_entry_rejected_in_har() {
+ use std::os::unix::fs as unix_fs;
+
+ let dir = TempDir::new().unwrap();
+ let src = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+ .join("testdata")
+ .join("har")
+ .join("simple.har");
+ let real_file = dir.path().join("real.har");
+ std::fs::copy(&src, &real_file).unwrap();
+
+ let link_file = dir.path().join("linked.har");
+ unix_fs::symlink(&real_file, &link_file).unwrap();
+
+ let iter = mitm2openapi::har_reader::stream_har_dir_no_symlinks(dir.path());
+ assert!(iter.is_ok(), "should open directory");
+ let results: Vec<_> = iter.unwrap().filter_map(|r| r.ok()).collect();
+
+ assert_eq!(
+ results.len(),
+ 1,
+ "only the real HAR file should be processed, symlinked entry skipped"
+ );
+}