diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..b353dd9 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,84 @@ +name: Docs + +on: + push: + branches: [main] + paths: + - 'book/**' + - 'docs/**' + - '.github/workflows/docs.yml' + - 'CHANGELOG.md' + - 'CONTRIBUTING.md' + - 'README.md' + pull_request: + paths: + - 'book/**' + - 'docs/**' + - '.github/workflows/docs.yml' + - 'CHANGELOG.md' + - 'CONTRIBUTING.md' + - 'README.md' + workflow_dispatch: + +permissions: + contents: read + pages: write + id-token: write + +concurrency: + group: docs-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: dtolnay/rust-toolchain@stable + + # Bump cache key when any tool version in the install step changes + - name: Cache mdbook binaries + id: cache-mdbook + uses: actions/cache@v5 + with: + path: ~/.cargo/bin/mdbook* + key: mdbook-v2-${{ hashFiles('.github/workflows/docs.yml') }} + + - name: Install mdbook and plugins + if: steps.cache-mdbook.outputs.cache-hit != 'true' + run: | + cargo install \ + mdbook@0.4.40 \ + mdbook-linkcheck@0.7.7 \ + mdbook-toc@0.14.2 \ + mdbook-admonish@1.18.0 \ + mdbook-mermaid@0.14.1 + + - name: Build book + run: mdbook build book + + - name: Upload Pages artifact + if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' + uses: actions/upload-pages-artifact@v4 + with: + path: target/book/html + + - name: Verify build (PR) + if: github.event_name == 'pull_request' + run: | + test -f target/book/html/index.html + test -s target/book/html/index.html + echo "Build OK" + + deploy: + needs: build + if: github.event_name == 'push' || github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + steps: + - name: Deploy to GitHub Pages + id: deployment + uses: 
actions/deploy-pages@v4 diff --git a/.gitignore b/.gitignore index 416c1dd..4aa5334 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,9 @@ result-* # Ruff cache (leftover from Python tooling) .ruff_cache/ +# mdBook build output +/target/book/ + # integration test artifacts tests/integration/level1/fixtures/*.flow tests/integration/level1/out/ diff --git a/CHANGELOG.md b/CHANGELOG.md index ea1c55c..2d855a6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,3 @@ -# Changelog - All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 81fa319..d50a310 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,5 +1,3 @@ -# Contributing — Local Testing Guide - This document covers how to run the three test tracks locally. ## Prerequisites diff --git a/Cargo.toml b/Cargo.toml index 7f3d875..045a156 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,7 @@ exclude = [ "docs/demo.mp4", ".github/**", "scripts/**", "flake.nix", "flake.lock", ".envrc", ".direnv/**", ".sisyphus/**", ".ruff_cache/**", + "book/**", "docs/**", ] [[bin]] diff --git a/README.md b/README.md index ab7bffe..1ac5ef1 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ A Rust rewrite of [mitmproxy2swagger](https://github.com/alufers/mitmproxy2swagg [![Crates.io](https://img.shields.io/crates/v/mitm2openapi.svg)](https://crates.io/crates/mitm2openapi) [![Downloads](https://img.shields.io/crates/d/mitm2openapi.svg)](https://crates.io/crates/mitm2openapi) [![docs.rs](https://img.shields.io/docsrs/mitm2openapi)](https://docs.rs/mitm2openapi) +[![docs](https://img.shields.io/badge/docs-arkptz.github.io-blue)](https://arkptz.github.io/mitm2openapi/) [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE) Demo: capture → discover → generate → browse Swagger UI @@ -39,17 +40,13 @@ Credit to [@alufers](https://github.com/alufers) for 
the original tool that pion ## Installation -### From binary releases - -Download a pre-built binary from [GitHub Releases](https://github.com/Arkptz/mitm2openapi/releases). - -### From source - ```bash -cargo install --git https://github.com/Arkptz/mitm2openapi +cargo install mitm2openapi ``` -## Quick Start +Or download a pre-built binary from [GitHub Releases](https://github.com/Arkptz/mitm2openapi/releases). + +## Quick start ```bash # 1. Capture traffic with mitmproxy @@ -64,206 +61,13 @@ mitm2openapi discover -i capture.flow -o templates.yaml -p "https://api.example. mitm2openapi generate -i capture.flow -t templates.yaml -o openapi.yaml -p "https://api.example.com" ``` -### Skip the manual edit - -If you know which paths you care about up front, use `--exclude-patterns` -and `--include-patterns` to let `discover` do the curation: - -```bash -mitm2openapi discover \ - -i capture.flow -o templates.yaml -p "https://api.example.com" \ - --exclude-patterns '/static/**,/images/**,*.css,*.js,*.svg' \ - --include-patterns '/api/**,/v2/**' - -mitm2openapi generate \ - -i capture.flow -t templates.yaml -o openapi.yaml -p "https://api.example.com" -``` - -Paths matching `--include-patterns` are auto-activated (emitted without -the `ignore:` prefix). Paths matching `--exclude-patterns` are dropped -entirely. Everything else still gets `ignore:` for manual review. - -
-CLI Reference (click to expand) - -### `discover` - -Scan captured traffic and produce a templates file listing all observed endpoints. - -``` -mitm2openapi discover [OPTIONS] -i -o -p -``` - -| Option | Description | -|--------|-------------| -| `-i, --input ` | Input file (flow dump or HAR) | -| `-o, --output ` | Output YAML templates file | -| `-p, --prefix ` | API prefix URL to filter requests | -| `--format ` | Input format: `auto`, `har`, `mitmproxy` (default: `auto`) | -| `--exclude-patterns ` | Comma-separated globs; matching paths are dropped entirely. `*` = single segment, `**` = any subtree. E.g. `/static/**,*.css` | -| `--include-patterns ` | Comma-separated globs; matching paths are emitted without `ignore:` (auto-activated for `generate`) | -| `--max-input-size ` | Maximum input file size (default: `2GiB`). Accepts suffixes: `KiB`, `MiB`, `GiB` | -| `--allow-symlinks` | Allow symlinked input files (default: rejected for safety) | -| `--strict` | Treat warnings as errors; exit code 2 if any cap fires, flow is rejected, or parse error occurs | -| `--report ` | Write a structured JSON processing report to the given path | - -### `generate` - -Generate an OpenAPI 3.0 spec from captured traffic using a curated templates file. 
- -``` -mitm2openapi generate [OPTIONS] -i -t -o -p -``` - -| Option | Description | -|--------|-------------| -| `-i, --input ` | Input file (flow dump or HAR) | -| `-t, --templates ` | Templates YAML file (from `discover`) | -| `-o, --output ` | Output OpenAPI YAML file | -| `-p, --prefix ` | API prefix URL | -| `--format ` | Input format: `auto`, `har`, `mitmproxy` (default: `auto`) | -| `--openapi-title ` | Custom title for the spec | -| `--openapi-version <VER>` | Custom spec version (default: `1.0.0`) | -| `--exclude-headers <LIST>` | Comma-separated headers to exclude | -| `--exclude-cookies <LIST>` | Comma-separated cookies to exclude | -| `--include-headers` | Include headers in the spec | -| `--ignore-images` | Ignore image content types | -| `--suppress-params` | Suppress parameter suggestions | -| `--tags-overrides <JSON>` | JSON string for tag overrides | -| `--max-input-size <BYTES>` | Maximum input file size (default: `2GiB`). Accepts suffixes: `KiB`, `MiB`, `GiB` | -| `--max-payload-size <BYTES>` | Maximum tnetstring payload size (default: `256MiB`) | -| `--max-depth <N>` | Maximum tnetstring nesting depth (default: `256`) | -| `--max-body-size <BYTES>` | Maximum request/response body size (default: `64MiB`) | -| `--allow-symlinks` | Allow symlinked input files (default: rejected for safety) | -| `--strict` | Treat warnings as errors; exit code 2 if any cap fires, flow is rejected, or parse error occurs | -| `--report <PATH>` | Write a structured JSON processing report to the given path | - -</details> - -## Resource Limits - -To prevent denial-of-service when processing untrusted captures, `mitm2openapi` -enforces several configurable limits: - -| Flag | Default | Purpose | -|------|---------|---------| -| `--max-input-size` | 2 GiB | Reject files larger than this before reading | -| `--max-payload-size` | 256 MiB | Cap on individual tnetstring payload allocation | -| `--max-depth` | 256 | Recursion depth limit for nested tnetstring structures | -| 
`--max-body-size` | 64 MiB | Maximum request/response body considered during schema inference | -| `--allow-symlinks` | off | By default, symlinked inputs are rejected to prevent path-traversal on shared CI runners | - -In addition to the configurable limits above, the following per-field caps are -applied unconditionally to prevent data corruption: - -| Field | Cap | Behaviour | -|-------|-----|-----------| -| Header name | 8 KiB | Dropped (other headers still processed) | -| Header value | 64 KiB | Truncated to cap | -| Form fields per request | 1 000 | Excess fields ignored | -| URL scheme | `http` / `https` only | Non-HTTP flows silently skipped | -| Port number | 1–65 535 | Out-of-range port drops the request | -| HTTP status code | 100–599 | Invalid codes treated as no response | - -Identity fields (scheme, host, path, method, header names) require valid UTF-8. -Flows with non-UTF-8 identity bytes are skipped to prevent data aliasing through -replacement-character collisions. Control characters in paths are stripped -automatically. - -Increase `--max-input-size` if you work with captures larger than 2 GiB (e.g. -`--max-input-size 8GiB`). The other limits rarely need tuning. - -Both mitmproxy flow files and HAR files are processed incrementally — memory usage -stays bounded regardless of input size. - -## Diagnostics - -When the tnetstring parser encounters corruption in a mitmproxy flow file, it -halts and emits a warn-level log with the byte offset, number of successfully -parsed entries, and an error classification. No resync is attempted — binary -payloads can contain bytes that mimic valid tnetstring length prefixes, so -scanning forward would produce phantom flows. - -### Structured report (`--report`) - -Pass `--report <PATH>` to either `discover` or `generate` to write a JSON -processing summary. This is useful for CI pipelines that need structured data -instead of log scraping. 
- -```json -{ - "report_version": 1, - "tool_version": "0.2.3", - "input": { - "path": "capture.flow", - "format": "Auto", - "size_bytes": 102400 - }, - "result": { - "flows_read": 150, - "flows_emitted": 148, - "paths_in_spec": 12 - }, - "events": { - "parse_error": { - "TNetString parse error at byte 98304: unexpected end of input": 1 - } - } -} -``` - -### Strict mode +## Documentation -Pass `--strict` to either `discover` or `generate` to treat any warning-level -event as a hard failure. The process exits with code 2 if any resource cap -fired, a flow was rejected, or a parse error was encountered. - -This is designed for CI gates where silent degradation is unacceptable: - -```bash -mitm2openapi discover -i capture.flow -o templates.yaml -p https://api.example.com --strict \ - || echo "FAIL: corrupt or over-limit flows detected" -``` - -Without `--strict`, the same conditions are logged at warn level and processing -continues (exit code 0). - -## Supported Formats - -| Format | Versions | Extension | -|--------|----------|-----------| -| mitmproxy flow dumps | v19, v20, v21 | `.flow` | -| HAR (HTTP Archive) | 1.2 (incrementally parsed) | `.har` | - -Format is auto-detected from file content. Use `--format` to override. - -## Migration from Python mitmproxy2swagger - -| Python (`mitmproxy2swagger`) | Rust (`mitm2openapi`) | -|-----|-----| -| `pip install mitmproxy2swagger` | Single binary, no runtime | -| `mitmproxy2swagger -i <file> -o <spec> -p <prefix>` | Two-step: `discover` then `generate` | -| Edits spec file in-place | Separate templates file for curation | -| Requires Python 3.x + mitmproxy | Standalone binary | -| Supports mitmproxy only | Supports mitmproxy flow dumps + HAR | - -### Key differences - -- **Two-step workflow**: `discover` produces a templates file; you curate it; `generate` produces the final spec. This separates endpoint selection from spec generation. -- **Templates file**: Discovered endpoints are prefixed with `ignore:`. 
Remove the prefix to include an endpoint. This replaces editing the output spec directly. -- **No Python dependency**: Ships as a single static binary for Linux, macOS, and Windows. -- **HAR support**: Process HAR exports from browser DevTools or other HTTP tools. +Full documentation at **[arkptz.github.io/mitm2openapi](https://arkptz.github.io/mitm2openapi/)** — covers installation, traffic capture setup, the full discover → curate → generate pipeline, CLI reference, resource limits, filtering, strict mode, format details, benchmarks, and security model. ## Benchmarks -Automated CI benchmark runs weekly against the Python original -([`mitmproxy2swagger`](https://github.com/alufers/mitmproxy2swagger)). See -[docs/benchmarks.md](docs/benchmarks.md) for the latest timing and memory -comparison on a ~80 MB synthetic capture, or -trigger a fresh run via -[Actions → Benchmark](../../actions/workflows/bench.yml). - -Reproduce locally with the commands documented in the workflow file. +Automated CI benchmarks run weekly against the Python original. See [docs/benchmarks.md](docs/benchmarks.md) for the latest comparison on a ~80 MB synthetic capture. 
## Contributing diff --git a/book/book.toml b/book/book.toml new file mode 100644 index 0000000..5567b70 --- /dev/null +++ b/book/book.toml @@ -0,0 +1,58 @@ +[book] +title = "mitm2openapi" +authors = ["Arkptz"] +description = "Convert mitmproxy flow dumps and HAR files to OpenAPI 3.0 specs" +src = "src" +language = "en" + +[build] +build-dir = "../target/book" +create-missing = false + +[output.html] +git-repository-url = "https://github.com/Arkptz/mitm2openapi" +edit-url-template = "https://github.com/Arkptz/mitm2openapi/edit/main/book/{path}" +default-theme = "ayu" +preferred-dark-theme = "ayu" +site-url = "/mitm2openapi/" +additional-css = ["./mdbook-admonish.css"] + +[output.html.fold] +enable = true +level = 1 + +[output.html.search] +enable = true +limit-results = 20 +teaser-word-count = 30 +use-boolean-and = true +boost-title = 2 +boost-hierarchy = 1 +boost-paragraph = 1 +expand = true +heading-split-level = 3 + +[output.linkcheck] +warning-policy = "error" +follow-web-links = false +exclude = [ + # CHANGELOG: [Unreleased], [skip ci], [0.1.0] + '^Unreleased$', + '^skip ci$', + '^\d+\.\d+(\.\d+)?$', + # Benchmark table units: Mean [s], Min [s], Max [s] + '^s$', + # Benchmark workflow path inside included docs/benchmarks.md + '\.github/workflows/bench\.yml', +] + +[preprocessor.toc] +command = "mdbook-toc" +renderer = ["html"] + +[preprocessor.admonish] +command = "mdbook-admonish" +assets_version = "3.0.2" # do not edit: managed by `mdbook-admonish install` + +[preprocessor.mermaid] +command = "mdbook-mermaid" diff --git a/book/mdbook-admonish.css b/book/mdbook-admonish.css new file mode 100644 index 0000000..45aeff0 --- /dev/null +++ b/book/mdbook-admonish.css @@ -0,0 +1,348 @@ +@charset "UTF-8"; +:is(.admonition) { + display: flow-root; + margin: 1.5625em 0; + padding: 0 1.2rem; + color: var(--fg); + page-break-inside: avoid; + background-color: var(--bg); + border: 0 solid black; + border-inline-start-width: 0.4rem; + border-radius: 0.2rem; + box-shadow: 0 
0.2rem 1rem rgba(0, 0, 0, 0.05), 0 0 0.1rem rgba(0, 0, 0, 0.1); +} +@media print { + :is(.admonition) { + box-shadow: none; + } +} +:is(.admonition) > * { + box-sizing: border-box; +} +:is(.admonition) :is(.admonition) { + margin-top: 1em; + margin-bottom: 1em; +} +:is(.admonition) > .tabbed-set:only-child { + margin-top: 0; +} +html :is(.admonition) > :last-child { + margin-bottom: 1.2rem; +} + +a.admonition-anchor-link { + display: none; + position: absolute; + left: -1.2rem; + padding-right: 1rem; +} +a.admonition-anchor-link:link, a.admonition-anchor-link:visited { + color: var(--fg); +} +a.admonition-anchor-link:link:hover, a.admonition-anchor-link:visited:hover { + text-decoration: none; +} +a.admonition-anchor-link::before { + content: "§"; +} + +:is(.admonition-title, summary.admonition-title) { + position: relative; + min-height: 4rem; + margin-block: 0; + margin-inline: -1.6rem -1.2rem; + padding-block: 0.8rem; + padding-inline: 4.4rem 1.2rem; + font-weight: 700; + background-color: rgba(68, 138, 255, 0.1); + print-color-adjust: exact; + -webkit-print-color-adjust: exact; + display: flex; +} +:is(.admonition-title, summary.admonition-title) p { + margin: 0; +} +html :is(.admonition-title, summary.admonition-title):last-child { + margin-bottom: 0; +} +:is(.admonition-title, summary.admonition-title)::before { + position: absolute; + top: 0.625em; + inset-inline-start: 1.6rem; + width: 2rem; + height: 2rem; + background-color: #448aff; + print-color-adjust: exact; + -webkit-print-color-adjust: exact; + mask-image: url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"></svg>'); + -webkit-mask-image: url('data:image/svg+xml;charset=utf-8,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"></svg>'); + mask-repeat: no-repeat; + -webkit-mask-repeat: no-repeat; + mask-size: contain; + -webkit-mask-size: contain; + content: ""; +} +:is(.admonition-title, summary.admonition-title):hover a.admonition-anchor-link { 
+ display: initial; +} + +details.admonition > summary.admonition-title::after { + position: absolute; + top: 0.625em; + inset-inline-end: 1.6rem; + height: 2rem; + width: 2rem; + background-color: currentcolor; + mask-image: var(--md-details-icon); + -webkit-mask-image: var(--md-details-icon); + mask-repeat: no-repeat; + -webkit-mask-repeat: no-repeat; + mask-size: contain; + -webkit-mask-size: contain; + content: ""; + transform: rotate(0deg); + transition: transform 0.25s; +} +details[open].admonition > summary.admonition-title::after { + transform: rotate(90deg); +} + +:root { + --md-details-icon: url("data:image/svg+xml;charset=utf-8,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24'><path d='M8.59 16.58 13.17 12 8.59 7.41 10 6l6 6-6 6-1.41-1.42Z'/></svg>"); +} + +:root { + --md-admonition-icon--admonish-note: url("data:image/svg+xml;charset=utf-8,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24'><path d='M20.71 7.04c.39-.39.39-1.04 0-1.41l-2.34-2.34c-.37-.39-1.02-.39-1.41 0l-1.84 1.83 3.75 3.75M3 17.25V21h3.75L17.81 9.93l-3.75-3.75L3 17.25z'/></svg>"); + --md-admonition-icon--admonish-abstract: url("data:image/svg+xml;charset=utf-8,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24'><path d='M17 9H7V7h10m0 6H7v-2h10m-3 6H7v-2h7M12 3a1 1 0 0 1 1 1 1 1 0 0 1-1 1 1 1 0 0 1-1-1 1 1 0 0 1 1-1m7 0h-4.18C14.4 1.84 13.3 1 12 1c-1.3 0-2.4.84-2.82 2H5a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h14a2 2 0 0 0 2-2V5a2 2 0 0 0-2-2z'/></svg>"); + --md-admonition-icon--admonish-info: url("data:image/svg+xml;charset=utf-8,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24'><path d='M13 9h-2V7h2m0 10h-2v-6h2m-1-9A10 10 0 0 0 2 12a10 10 0 0 0 10 10 10 10 0 0 0 10-10A10 10 0 0 0 12 2z'/></svg>"); + --md-admonition-icon--admonish-tip: url("data:image/svg+xml;charset=utf-8,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24'><path d='M17.66 11.2c-.23-.3-.51-.56-.77-.82-.67-.6-1.43-1.03-2.07-1.66C13.33 7.26 13 4.85 13.95 3c-.95.23-1.78.75-2.49 1.32-2.59 
2.08-3.61 5.75-2.39 8.9.04.1.08.2.08.33 0 .22-.15.42-.35.5-.23.1-.47.04-.66-.12a.58.58 0 0 1-.14-.17c-1.13-1.43-1.31-3.48-.55-5.12C5.78 10 4.87 12.3 5 14.47c.06.5.12 1 .29 1.5.14.6.41 1.2.71 1.73 1.08 1.73 2.95 2.97 4.96 3.22 2.14.27 4.43-.12 6.07-1.6 1.83-1.66 2.47-4.32 1.53-6.6l-.13-.26c-.21-.46-.77-1.26-.77-1.26m-3.16 6.3c-.28.24-.74.5-1.1.6-1.12.4-2.24-.16-2.9-.82 1.19-.28 1.9-1.16 2.11-2.05.17-.8-.15-1.46-.28-2.23-.12-.74-.1-1.37.17-2.06.19.38.39.76.63 1.06.77 1 1.98 1.44 2.24 2.8.04.14.06.28.06.43.03.82-.33 1.72-.93 2.27z'/></svg>"); + --md-admonition-icon--admonish-success: url("data:image/svg+xml;charset=utf-8,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24'><path d='m9 20.42-6.21-6.21 2.83-2.83L9 14.77l9.88-9.89 2.83 2.83L9 20.42z'/></svg>"); + --md-admonition-icon--admonish-question: url("data:image/svg+xml;charset=utf-8,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24'><path d='m15.07 11.25-.9.92C13.45 12.89 13 13.5 13 15h-2v-.5c0-1.11.45-2.11 1.17-2.83l1.24-1.26c.37-.36.59-.86.59-1.41a2 2 0 0 0-2-2 2 2 0 0 0-2 2H8a4 4 0 0 1 4-4 4 4 0 0 1 4 4 3.2 3.2 0 0 1-.93 2.25M13 19h-2v-2h2M12 2A10 10 0 0 0 2 12a10 10 0 0 0 10 10 10 10 0 0 0 10-10c0-5.53-4.5-10-10-10z'/></svg>"); + --md-admonition-icon--admonish-warning: url("data:image/svg+xml;charset=utf-8,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24'><path d='M13 14h-2V9h2m0 9h-2v-2h2M1 21h22L12 2 1 21z'/></svg>"); + --md-admonition-icon--admonish-failure: url("data:image/svg+xml;charset=utf-8,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24'><path d='M20 6.91 17.09 4 12 9.09 6.91 4 4 6.91 9.09 12 4 17.09 6.91 20 12 14.91 17.09 20 20 17.09 14.91 12 20 6.91z'/></svg>"); + --md-admonition-icon--admonish-danger: url("data:image/svg+xml;charset=utf-8,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24'><path d='M11 15H6l7-14v8h5l-7 14v-8z'/></svg>"); + --md-admonition-icon--admonish-bug: url("data:image/svg+xml;charset=utf-8,<svg xmlns='http://www.w3.org/2000/svg' 
viewBox='0 0 24 24'><path d='M14 12h-4v-2h4m0 6h-4v-2h4m6-6h-2.81a5.985 5.985 0 0 0-1.82-1.96L17 4.41 15.59 3l-2.17 2.17a6.002 6.002 0 0 0-2.83 0L8.41 3 7 4.41l1.62 1.63C7.88 6.55 7.26 7.22 6.81 8H4v2h2.09c-.05.33-.09.66-.09 1v1H4v2h2v1c0 .34.04.67.09 1H4v2h2.81c1.04 1.79 2.97 3 5.19 3s4.15-1.21 5.19-3H20v-2h-2.09c.05-.33.09-.66.09-1v-1h2v-2h-2v-1c0-.34-.04-.67-.09-1H20V8z'/></svg>"); + --md-admonition-icon--admonish-example: url("data:image/svg+xml;charset=utf-8,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24'><path d='M7 13v-2h14v2H7m0 6v-2h14v2H7M7 7V5h14v2H7M3 8V5H2V4h2v4H3m-1 9v-1h3v4H2v-1h2v-.5H3v-1h1V17H2m2.25-7a.75.75 0 0 1 .75.75c0 .2-.08.39-.21.52L3.12 13H5v1H2v-.92L4 11H2v-1h2.25z'/></svg>"); + --md-admonition-icon--admonish-quote: url("data:image/svg+xml;charset=utf-8,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 24 24'><path d='M14 17h3l2-4V7h-6v6h3M6 17h3l2-4V7H5v6h3l-2 4z'/></svg>"); +} + +:is(.admonition):is(.admonish-note) { + border-color: #448aff; +} + +:is(.admonish-note) > :is(.admonition-title, summary.admonition-title) { + background-color: rgba(68, 138, 255, 0.1); +} +:is(.admonish-note) > :is(.admonition-title, summary.admonition-title)::before { + background-color: #448aff; + mask-image: var(--md-admonition-icon--admonish-note); + -webkit-mask-image: var(--md-admonition-icon--admonish-note); + mask-repeat: no-repeat; + -webkit-mask-repeat: no-repeat; + mask-size: contain; + -webkit-mask-repeat: no-repeat; +} + +:is(.admonition):is(.admonish-abstract, .admonish-summary, .admonish-tldr) { + border-color: #00b0ff; +} + +:is(.admonish-abstract, .admonish-summary, .admonish-tldr) > :is(.admonition-title, summary.admonition-title) { + background-color: rgba(0, 176, 255, 0.1); +} +:is(.admonish-abstract, .admonish-summary, .admonish-tldr) > :is(.admonition-title, summary.admonition-title)::before { + background-color: #00b0ff; + mask-image: var(--md-admonition-icon--admonish-abstract); + -webkit-mask-image: 
var(--md-admonition-icon--admonish-abstract); + mask-repeat: no-repeat; + -webkit-mask-repeat: no-repeat; + mask-size: contain; + -webkit-mask-repeat: no-repeat; +} + +:is(.admonition):is(.admonish-info, .admonish-todo) { + border-color: #00b8d4; +} + +:is(.admonish-info, .admonish-todo) > :is(.admonition-title, summary.admonition-title) { + background-color: rgba(0, 184, 212, 0.1); +} +:is(.admonish-info, .admonish-todo) > :is(.admonition-title, summary.admonition-title)::before { + background-color: #00b8d4; + mask-image: var(--md-admonition-icon--admonish-info); + -webkit-mask-image: var(--md-admonition-icon--admonish-info); + mask-repeat: no-repeat; + -webkit-mask-repeat: no-repeat; + mask-size: contain; + -webkit-mask-repeat: no-repeat; +} + +:is(.admonition):is(.admonish-tip, .admonish-hint, .admonish-important) { + border-color: #00bfa5; +} + +:is(.admonish-tip, .admonish-hint, .admonish-important) > :is(.admonition-title, summary.admonition-title) { + background-color: rgba(0, 191, 165, 0.1); +} +:is(.admonish-tip, .admonish-hint, .admonish-important) > :is(.admonition-title, summary.admonition-title)::before { + background-color: #00bfa5; + mask-image: var(--md-admonition-icon--admonish-tip); + -webkit-mask-image: var(--md-admonition-icon--admonish-tip); + mask-repeat: no-repeat; + -webkit-mask-repeat: no-repeat; + mask-size: contain; + -webkit-mask-repeat: no-repeat; +} + +:is(.admonition):is(.admonish-success, .admonish-check, .admonish-done) { + border-color: #00c853; +} + +:is(.admonish-success, .admonish-check, .admonish-done) > :is(.admonition-title, summary.admonition-title) { + background-color: rgba(0, 200, 83, 0.1); +} +:is(.admonish-success, .admonish-check, .admonish-done) > :is(.admonition-title, summary.admonition-title)::before { + background-color: #00c853; + mask-image: var(--md-admonition-icon--admonish-success); + -webkit-mask-image: var(--md-admonition-icon--admonish-success); + mask-repeat: no-repeat; + -webkit-mask-repeat: no-repeat; 
+ mask-size: contain; + -webkit-mask-repeat: no-repeat; +} + +:is(.admonition):is(.admonish-question, .admonish-help, .admonish-faq) { + border-color: #64dd17; +} + +:is(.admonish-question, .admonish-help, .admonish-faq) > :is(.admonition-title, summary.admonition-title) { + background-color: rgba(100, 221, 23, 0.1); +} +:is(.admonish-question, .admonish-help, .admonish-faq) > :is(.admonition-title, summary.admonition-title)::before { + background-color: #64dd17; + mask-image: var(--md-admonition-icon--admonish-question); + -webkit-mask-image: var(--md-admonition-icon--admonish-question); + mask-repeat: no-repeat; + -webkit-mask-repeat: no-repeat; + mask-size: contain; + -webkit-mask-repeat: no-repeat; +} + +:is(.admonition):is(.admonish-warning, .admonish-caution, .admonish-attention) { + border-color: #ff9100; +} + +:is(.admonish-warning, .admonish-caution, .admonish-attention) > :is(.admonition-title, summary.admonition-title) { + background-color: rgba(255, 145, 0, 0.1); +} +:is(.admonish-warning, .admonish-caution, .admonish-attention) > :is(.admonition-title, summary.admonition-title)::before { + background-color: #ff9100; + mask-image: var(--md-admonition-icon--admonish-warning); + -webkit-mask-image: var(--md-admonition-icon--admonish-warning); + mask-repeat: no-repeat; + -webkit-mask-repeat: no-repeat; + mask-size: contain; + -webkit-mask-repeat: no-repeat; +} + +:is(.admonition):is(.admonish-failure, .admonish-fail, .admonish-missing) { + border-color: #ff5252; +} + +:is(.admonish-failure, .admonish-fail, .admonish-missing) > :is(.admonition-title, summary.admonition-title) { + background-color: rgba(255, 82, 82, 0.1); +} +:is(.admonish-failure, .admonish-fail, .admonish-missing) > :is(.admonition-title, summary.admonition-title)::before { + background-color: #ff5252; + mask-image: var(--md-admonition-icon--admonish-failure); + -webkit-mask-image: var(--md-admonition-icon--admonish-failure); + mask-repeat: no-repeat; + -webkit-mask-repeat: no-repeat; + 
mask-size: contain; + -webkit-mask-repeat: no-repeat; +} + +:is(.admonition):is(.admonish-danger, .admonish-error) { + border-color: #ff1744; +} + +:is(.admonish-danger, .admonish-error) > :is(.admonition-title, summary.admonition-title) { + background-color: rgba(255, 23, 68, 0.1); +} +:is(.admonish-danger, .admonish-error) > :is(.admonition-title, summary.admonition-title)::before { + background-color: #ff1744; + mask-image: var(--md-admonition-icon--admonish-danger); + -webkit-mask-image: var(--md-admonition-icon--admonish-danger); + mask-repeat: no-repeat; + -webkit-mask-repeat: no-repeat; + mask-size: contain; + -webkit-mask-repeat: no-repeat; +} + +:is(.admonition):is(.admonish-bug) { + border-color: #f50057; +} + +:is(.admonish-bug) > :is(.admonition-title, summary.admonition-title) { + background-color: rgba(245, 0, 87, 0.1); +} +:is(.admonish-bug) > :is(.admonition-title, summary.admonition-title)::before { + background-color: #f50057; + mask-image: var(--md-admonition-icon--admonish-bug); + -webkit-mask-image: var(--md-admonition-icon--admonish-bug); + mask-repeat: no-repeat; + -webkit-mask-repeat: no-repeat; + mask-size: contain; + -webkit-mask-repeat: no-repeat; +} + +:is(.admonition):is(.admonish-example) { + border-color: #7c4dff; +} + +:is(.admonish-example) > :is(.admonition-title, summary.admonition-title) { + background-color: rgba(124, 77, 255, 0.1); +} +:is(.admonish-example) > :is(.admonition-title, summary.admonition-title)::before { + background-color: #7c4dff; + mask-image: var(--md-admonition-icon--admonish-example); + -webkit-mask-image: var(--md-admonition-icon--admonish-example); + mask-repeat: no-repeat; + -webkit-mask-repeat: no-repeat; + mask-size: contain; + -webkit-mask-repeat: no-repeat; +} + +:is(.admonition):is(.admonish-quote, .admonish-cite) { + border-color: #9e9e9e; +} + +:is(.admonish-quote, .admonish-cite) > :is(.admonition-title, summary.admonition-title) { + background-color: rgba(158, 158, 158, 0.1); +} 
+:is(.admonish-quote, .admonish-cite) > :is(.admonition-title, summary.admonition-title)::before { + background-color: #9e9e9e; + mask-image: var(--md-admonition-icon--admonish-quote); + -webkit-mask-image: var(--md-admonition-icon--admonish-quote); + mask-repeat: no-repeat; + -webkit-mask-repeat: no-repeat; + mask-size: contain; + -webkit-mask-repeat: no-repeat; +} + +.navy :is(.admonition) { + background-color: var(--sidebar-bg); +} + +.ayu :is(.admonition), +.coal :is(.admonition) { + background-color: var(--theme-hover); +} + +.rust :is(.admonition) { + background-color: var(--sidebar-bg); + color: var(--sidebar-fg); +} +.rust .admonition-anchor-link:link, .rust .admonition-anchor-link:visited { + color: var(--sidebar-fg); +} diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md new file mode 100644 index 0000000..4acbded --- /dev/null +++ b/book/src/SUMMARY.md @@ -0,0 +1,30 @@ +# Summary + +[Introduction](./introduction.md) + +# Getting started +- [Installation](./getting-started/installation.md) +- [Quick start](./getting-started/quick-start.md) +- [Capturing traffic](./getting-started/capturing.md) + +# Usage +- [Discover, curate, generate](./usage/pipeline.md) +- [Filtering endpoints](./usage/filtering.md) +- [Resource limits](./usage/resource-limits.md) +- [Strict mode](./usage/strict-mode.md) +- [Processing reports](./usage/reports.md) +- [CLI reference](./usage/cli-reference.md) + +# Formats +- [mitmproxy flow dumps](./formats/mitmproxy.md) +- [HAR files](./formats/har.md) + +# Reference +- [Performance & benchmarks](./reference/benchmarks.md) +- [Security model](./reference/security.md) +- [Diagnostics](./reference/diagnostics.md) + +--- + +[Changelog](./changelog.md) +[Contributing](./contributing.md) diff --git a/book/src/changelog.md b/book/src/changelog.md new file mode 100644 index 0000000..9b9ae8b --- /dev/null +++ b/book/src/changelog.md @@ -0,0 +1,7 @@ +# Changelog + +{{#include ../../CHANGELOG.md}} + +[Unreleased]: 
https://github.com/Arkptz/mitm2openapi/commits/main +[skip ci]: #changelog +[0.1.0]: https://github.com/Arkptz/mitm2openapi/releases/tag/v0.1.0 diff --git a/book/src/contributing.md b/book/src/contributing.md new file mode 100644 index 0000000..2913f2f --- /dev/null +++ b/book/src/contributing.md @@ -0,0 +1,3 @@ +# Contributing + +{{#include ../../CONTRIBUTING.md}} diff --git a/book/src/formats/har.md b/book/src/formats/har.md new file mode 100644 index 0000000..8d2ddce --- /dev/null +++ b/book/src/formats/har.md @@ -0,0 +1,77 @@ +# HAR files + +`mitm2openapi` reads [HAR (HTTP Archive)](https://w3c.github.io/web-performance/specs/HAR/Overview.html) +files — the standard format for exporting browser network traffic. HAR version 1.2 is supported. + +## Producing HAR files + +### Browser DevTools + +All modern browsers export HAR from their Network tab: + +- **Chrome/Chromium**: DevTools → Network → right-click → "Save all as HAR with content" +- **Firefox**: DevTools → Network → gear icon → "Save All As HAR" +- **Safari**: Web Inspector → Network → Export button + +### HTTP proxies + +Several proxy tools export HAR: + +- [Charles Proxy](https://www.charlesproxy.com/) — File → Export Session → HAR +- [Fiddler](https://www.telerik.com/fiddler) — File → Export Sessions → HTTPArchive +- [Proxyman](https://proxyman.io/) — Export as HAR + +### Programmatic generation + +Libraries like [`puppeteer`](https://pptr.dev/) and [`playwright`](https://playwright.dev/) +can produce HAR files from automated browser sessions: + +```javascript +// Playwright example +const context = await browser.newContext({ + recordHar: { path: 'capture.har' } +}); +// ... run your test +await context.close(); // HAR is written on close +``` + +## Usage + +```bash +mitm2openapi discover \ + -i capture.har \ + -o templates.yaml \ + -p "https://api.example.com" +``` + +Format is auto-detected. Use `--format har` to force HAR parsing if auto-detection fails. 
+ +## HAR vs mitmproxy flows + +| Aspect | mitmproxy flow | HAR | +|--------|---------------|-----| +| Source | mitmproxy proxy | Browser DevTools, HTTP proxies | +| Format | Binary (tnetstring) | JSON | +| Response bodies | Always present | Sometimes base64-encoded | +| HTTPS | Decrypted by proxy | Decrypted by browser | +| File size | Compact binary | Larger (JSON overhead) | +| Streaming | Native | Incremental JSON parsing | + +Both formats produce equivalent OpenAPI specs. Choose based on your capture workflow: + +- **mitmproxy flows** for server-side proxying, CI pipelines, and automated captures +- **HAR files** for browser-based testing, manual exploration, and when you already have DevTools open + +## Incremental parsing + +HAR files are parsed incrementally — the entire JSON is not loaded into memory at once. +This means memory usage stays bounded even for large HAR exports (hundreds of megabytes). + +## Known limitations + +- **Base64-encoded bodies** — some HAR exporters base64-encode response bodies. Decode + failures are logged as warnings and the body is skipped (not silently dropped). +- **Compressed content** — if the HAR exporter did not decompress response bodies, + `mitm2openapi` sees the compressed bytes. Most browser DevTools decompress automatically. +- **Timing data** — HAR timing information (DNS, connect, TLS) is ignored; only request and + response data is used for spec generation. diff --git a/book/src/formats/mitmproxy.md b/book/src/formats/mitmproxy.md new file mode 100644 index 0000000..46a2b65 --- /dev/null +++ b/book/src/formats/mitmproxy.md @@ -0,0 +1,58 @@ +# mitmproxy flow dumps + +`mitm2openapi` reads mitmproxy's native binary flow format. This is the recommended input +format — it captures the richest data and is produced directly by `mitmdump` and `mitmweb`. 
+ +## Supported versions + +| Flow format version | mitmproxy version | Status | +|---|---|---| +| v19 | mitmproxy 8.x | Supported | +| v20 | mitmproxy 9.x | Supported | +| v21 | mitmproxy 10.x | Supported | + +The flow format is auto-detected from file content. No version flag is needed. + +## How flow files work + +Flow files use the [tnetstring](https://tnetstrings.info/) serialization format. Each flow +is a sequence of key-value pairs representing a complete HTTP request-response cycle. + +A typical flow contains: + +- **Request**: method, URL (scheme, host, port, path), headers, body +- **Response**: status code, headers, body +- **Metadata**: timestamps, flow ID, client/server addresses + +`mitm2openapi` extracts the request and response data relevant to OpenAPI spec generation +and discards metadata. + +## Capturing flow files + +```bash +# Record all traffic through the proxy +mitmdump -w capture.flow + +# Record only traffic to a specific host +mitmdump -w capture.flow --set flow_detail=0 \ + --set save_stream_filter='~d api.example.com' +``` + +See [capturing traffic](../getting-started/capturing.md) for full setup instructions. + +## Directory input + +If you pass a directory path to `-i`, `mitm2openapi` reads all `.flow` files in that +directory (non-recursive). This is useful when you have traffic split across multiple +capture sessions. + +## Known limitations + +- **No WebSocket frames** — WebSocket upgrade requests are captured, but frame-level data + is not used for spec generation +- **No gRPC** — binary protocol buffers inside HTTP/2 frames are not decoded +- **Corrupt files** — when the tnetstring parser hits corruption, it stops and reports the + byte offset. No resync is attempted because binary payloads can contain bytes that mimic + valid tnetstring length prefixes. See [diagnostics](../reference/diagnostics.md) for details. 
+- **Large payloads** — individual tnetstring payloads are capped at 256 MiB by default + (adjustable via `--max-payload-size`) diff --git a/book/src/getting-started/capturing.md b/book/src/getting-started/capturing.md new file mode 100644 index 0000000..999ac39 --- /dev/null +++ b/book/src/getting-started/capturing.md @@ -0,0 +1,121 @@ +# Capturing traffic + +Before you can generate an OpenAPI spec, you need a captured traffic file. This chapter +covers the most common ways to capture HTTP traffic. + +## Option 1: mitmproxy (recommended) + +[mitmproxy](https://mitmproxy.org/) is a free, open-source HTTPS proxy. It captures traffic +in its own binary flow format that `mitm2openapi` reads natively. + +### Install mitmproxy + +```bash +# macOS +brew install mitmproxy + +# Linux (pip) +pip install mitmproxy + +# Or download from https://mitmproxy.org/ +``` + +See the [mitmproxy installation docs](https://docs.mitmproxy.org/stable/overview-installation/) +for platform-specific instructions. + +### Capture with mitmdump + +`mitmdump` is the non-interactive version of mitmproxy, ideal for scripted captures: + +```bash +# Start the proxy and write all traffic to a flow file +mitmdump -w capture.flow + +# In another terminal, route your HTTP client through the proxy: +curl --proxy http://localhost:8080 https://api.example.com/users +``` + +The default proxy port is 8080. Use `-p` to change it: + +```bash +mitmdump -w capture.flow -p 9090 +``` + +### Capture with mitmweb + +`mitmweb` provides a browser-based UI for inspecting traffic in real time: + +```bash +mitmweb -w capture.flow +# Open http://localhost:8081 in your browser to inspect traffic +``` + +### HTTPS traffic + +For HTTPS, you need to install the mitmproxy CA certificate on the client machine. +After starting mitmproxy, navigate to `http://mitm.it` from the proxied client to +download and install the certificate. 
+ +See the [mitmproxy certificate docs](https://docs.mitmproxy.org/stable/concepts-certificates/) +for detailed instructions. + +### Tips + +- Use `mitmdump --set flow_detail=0` for minimal console output during long captures +- Combine with `--set save_stream_filter` to capture only specific hosts +- The flow format is versioned (v19/v20/v21) — `mitm2openapi` supports all three + +## Option 2: Browser DevTools (HAR export) + +All modern browsers can export captured network traffic as HAR (HTTP Archive) files. + +### Chrome / Chromium + +1. Open DevTools (`F12` or `Ctrl+Shift+I`) +2. Switch to the **Network** tab +3. Ensure recording is active (red circle icon) +4. Perform the actions you want to capture +5. Right-click in the request list → **Save all as HAR with content** + +### Firefox + +1. Open DevTools (`F12`) +2. Switch to the **Network** tab +3. Perform the actions you want to capture +4. Click the gear icon → **Save All As HAR** + +### Safari + +1. Enable the Develop menu in Preferences → Advanced +2. Open Web Inspector (`Cmd+Option+I`) +3. Switch to the **Network** tab +4. Perform the actions +5. Click **Export** in the toolbar + +```admonish note +HAR files from browser DevTools contain the full request and response bodies. Sensitive data +(cookies, tokens, passwords) will be present in the export. Sanitize before sharing. 
+``` + +## Option 3: Other HTTP proxies + +Any tool that produces HAR 1.2 output works with `mitm2openapi`: + +- [Charles Proxy](https://www.charlesproxy.com/) — export sessions as HAR via File → Export +- [Fiddler](https://www.telerik.com/fiddler) — File → Export Sessions → HTTPArchive +- [Proxyman](https://proxyman.io/) — export as HAR from the session menu + +## What to capture + +For the best OpenAPI spec, capture diverse traffic: + +- **Multiple endpoints** — the more paths covered, the more complete the spec +- **Different HTTP methods** — GET, POST, PUT, DELETE on the same resource +- **Various response codes** — 200, 400, 404, 500 responses produce richer schemas +- **Query parameters** — include requests with different query strings +- **Request bodies** — POST/PUT with different payloads improve body schema inference + +## Next steps + +Once you have a capture file, proceed to the [quick start](./quick-start.md) or +learn about the full [discover → curate → generate pipeline](../usage/pipeline.md). diff --git a/book/src/getting-started/installation.md b/book/src/getting-started/installation.md new file mode 100644 index 0000000..fc8a69c --- /dev/null +++ b/book/src/getting-started/installation.md @@ -0,0 +1,42 @@ +# Installation + +## From binary releases + +Download a pre-built binary for your platform from +[GitHub Releases](https://github.com/Arkptz/mitm2openapi/releases). + +Binaries are available for Linux (x86_64, aarch64), macOS (x86_64, aarch64), and +Windows (x86_64). + +```bash +# Example: Linux x86_64 — replace <VERSION> with the release tag (e.g. 
v0.5.1) +curl -L "https://github.com/Arkptz/mitm2openapi/releases/download/<VERSION>/mitm2openapi-<VERSION>-x86_64-unknown-linux-gnu.tar.gz" \ + | tar xz +sudo mv mitm2openapi /usr/local/bin/ +``` + +## From source (via Cargo) + +If you have a Rust toolchain installed: + +```bash +cargo install --git https://github.com/Arkptz/mitm2openapi +``` + +Or from [crates.io](https://crates.io/crates/mitm2openapi): + +```bash +cargo install mitm2openapi +``` + +## Verify installation + +```bash +mitm2openapi --version +``` + +## Shell completions + +`mitm2openapi` uses [clap](https://docs.rs/clap) for argument parsing. Shell completions +are not yet bundled, but you can generate them for most shells via `clap_complete` if building +from source. diff --git a/book/src/getting-started/quick-start.md b/book/src/getting-started/quick-start.md new file mode 100644 index 0000000..1f0db12 --- /dev/null +++ b/book/src/getting-started/quick-start.md @@ -0,0 +1,98 @@ +# Quick start + +This walkthrough takes you from a traffic capture to a complete OpenAPI spec in under a minute. + +## Prerequisites + +- `mitm2openapi` installed ([see installation](./installation.md)) +- A captured traffic file — either a mitmproxy `.flow` dump or a `.har` export from browser DevTools + +If you do not have a capture yet, see [capturing traffic](./capturing.md) for setup instructions. + +## Step 1: Discover endpoints + +```bash +mitm2openapi discover \ + -i capture.flow \ + -o templates.yaml \ + -p "https://api.example.com" +``` + +This scans every request in `capture.flow` that matches the prefix `https://api.example.com` +and writes a templates file listing all observed URL paths. + +## Step 2: Curate the templates + +Open `templates.yaml`. 
Each path is prefixed with `ignore:` by default: + +```yaml +x-path-templates: +- ignore:/api/users +- ignore:/api/users/{id} +- ignore:/api/products +- ignore:/static/bundle.js +``` + +Remove the `ignore:` prefix from paths you want in the final spec: + +```yaml +x-path-templates: +- /api/users +- /api/users/{id} +- /api/products +- ignore:/static/bundle.js +``` + +Paths still prefixed with `ignore:` are excluded from the generated spec. + +## Step 3: Generate the OpenAPI spec + +```bash +mitm2openapi generate \ + -i capture.flow \ + -t templates.yaml \ + -o openapi.yaml \ + -p "https://api.example.com" +``` + +The resulting `openapi.yaml` contains a valid OpenAPI 3.0 spec with paths, methods, +parameters, request bodies, and response schemas inferred from the captured traffic. + +## Skip the manual edit + +If you already know which paths matter, use glob filters to automate curation: + +```bash +mitm2openapi discover \ + -i capture.flow \ + -o templates.yaml \ + -p "https://api.example.com" \ + --exclude-patterns '/static/**,/images/**,*.css,*.js,*.svg' \ + --include-patterns '/api/**,/v2/**' + +mitm2openapi generate \ + -i capture.flow \ + -t templates.yaml \ + -o openapi.yaml \ + -p "https://api.example.com" +``` + +Paths matching `--include-patterns` are auto-activated (no `ignore:` prefix). Paths matching +`--exclude-patterns` are dropped entirely. Everything else still gets `ignore:` for manual +review. + +See [filtering endpoints](../usage/filtering.md) for the full glob syntax reference. + +## HAR files + +The same workflow works with HAR files — just point `-i` at a `.har` file. The format is +auto-detected: + +```bash +mitm2openapi discover \ + -i capture.har \ + -o templates.yaml \ + -p "https://api.example.com" +``` + +See [HAR files](../formats/har.md) for details on exporting HARs from browser DevTools. 
diff --git a/book/src/introduction.md b/book/src/introduction.md new file mode 100644 index 0000000..eb62e18 --- /dev/null +++ b/book/src/introduction.md @@ -0,0 +1,51 @@ +# Introduction + +**mitm2openapi** converts [mitmproxy](https://mitmproxy.org/) flow dumps and HAR files into +[OpenAPI 3.0](https://spec.openapis.org/oas/v3.0.3) specifications. It ships as a single +static binary — no Python, no virtual environment, no runtime dependencies. + +It is a Rust rewrite of [mitmproxy2swagger](https://github.com/alufers/mitmproxy2swagger) by +[@alufers](https://github.com/alufers), who pioneered the "capture traffic, extract API spec" +workflow. Credit to the original project for the idea and reference implementation. + +## Why? + +The Python original works well but requires Python, `pip`, and `mitmproxy` installed in the +environment. For CI pipelines, slim Docker images, security audits, and one-off usage, that +dependency chain is friction. + +`mitm2openapi` ships as a single ~5 MB static binary. Drop it into any environment and run. +Same OpenAPI 3.0 output, plus first-class HAR support and glob-based filters for fully +unattended pipelines. 
+ +## Features + +- **Fast** — pure Rust, ~17× faster than the Python original ([benchmarks](./reference/benchmarks.md)) +- **Single static binary** — no Python, no venv, no pip, no runtime dependencies +- **Two-format support** — mitmproxy flow dumps (v19/v20/v21) and HAR 1.2 +- **Three-step workflow** — `discover` finds endpoints, you curate, `generate` emits OpenAPI 3.0 +- **Glob filters** — `--exclude-patterns` and `--include-patterns` for automated pipelines +- **Error recovery** — skips corrupt flows, continues processing +- **Auto-detection** — heuristic format detection from file content +- **Resource limits** — configurable caps prevent denial-of-service on untrusted input +- **Strict mode** — treat warnings as errors for CI gates +- **Structured reports** — `--report` outputs machine-readable JSON processing summaries +- **Battle-tested** — integration tests against Swagger Petstore and OWASP crAPI +- **Cross-platform** — Linux, macOS, Windows pre-built binaries + +## How it works + +The tool uses a three-step workflow: + +1. **Discover** — scan captured traffic and list all observed API endpoints +2. **Curate** — review the list and select which endpoints to include +3. **Generate** — produce a clean OpenAPI 3.0 spec from the selected endpoints + +This separates endpoint selection from spec generation, giving you full control over +what ends up in the final spec. + +## Next steps + +- [Install mitm2openapi](./getting-started/installation.md) +- [Run through the quick start](./getting-started/quick-start.md) +- [Learn about the full pipeline](./usage/pipeline.md) diff --git a/book/src/reference/benchmarks.md b/book/src/reference/benchmarks.md new file mode 100644 index 0000000..427d527 --- /dev/null +++ b/book/src/reference/benchmarks.md @@ -0,0 +1,7 @@ +# Performance & Benchmarks + +Results are regenerated weekly by the [benchmark workflow](https://github.com/Arkptz/mitm2openapi/blob/main/.github/workflows/bench.yml). 
See the workflow for the reproducible methodology. + +{{#include ../../../docs/benchmarks.md}} + +[s]: #timing diff --git a/book/src/reference/diagnostics.md b/book/src/reference/diagnostics.md new file mode 100644 index 0000000..753f3c6 --- /dev/null +++ b/book/src/reference/diagnostics.md @@ -0,0 +1,129 @@ +# Diagnostics + +<!-- toc --> + +`mitm2openapi` uses structured logging to report issues during processing. This chapter +covers how to interpret warnings, errors, and the structured report output. + +## Log levels + +Control verbosity with the `RUST_LOG` environment variable: + +```bash +# Default: warnings only +mitm2openapi discover -i capture.flow -o templates.yaml -p "https://api.example.com" + +# More detail +RUST_LOG=info mitm2openapi discover -i capture.flow -o templates.yaml -p "https://api.example.com" + +# Full debug output +RUST_LOG=debug mitm2openapi discover -i capture.flow -o templates.yaml -p "https://api.example.com" +``` + +## Common warnings + +### Parse errors (tnetstring) + +``` +WARN TNetString parse error at byte 98304: unexpected end of input (148 flows parsed successfully) +``` + +This means the mitmproxy flow file contains corrupt data starting at byte 98,304. The +parser halts immediately and the remaining bytes in the file are **not** processed. The +148 flows parsed before the corruption are still emitted. + +**No resync is attempted.** Binary payloads can contain bytes that mimic valid tnetstring +length prefixes, so scanning forward would produce phantom flows with fabricated data. 
+ +**What to do:** +- If the file was truncated during transfer, re-capture or re-download +- The 148 successfully parsed flows are still usable +- Use `--report` to capture the exact byte offset for debugging + +### Cap-fired events + +``` +WARN body size 68157440 exceeds cap 67108864, truncating +WARN header name exceeds 8192 bytes, dropping +WARN form field count 1247 exceeds cap 1000, ignoring excess +``` + +These indicate that a specific field in a flow exceeded the built-in or configured limit. +The affected field is truncated or dropped, but processing continues. + +**What to do:** +- Usually safe to ignore — the caps exist to prevent abuse, not normal traffic +- If you need the full data, increase the relevant `--max-*` flag +- Use `--strict` to fail on these if you need guaranteed completeness + +### Flow rejection events + +``` +WARN skipping flow: scheme "javascript" not in whitelist [http, https] +WARN skipping flow: invalid UTF-8 in host field +WARN skipping flow: port 0 out of valid range 1-65535 +``` + +These mean an entire flow was skipped because it failed validation. + +**What to do:** +- Non-HTTP flows (WebSocket upgrades, CONNECT tunnels) are expected to be skipped +- UTF-8 errors suggest the capture contains binary protocol data, not HTTP traffic +- Invalid port/status usually indicates corrupt flow data + +## Structured reports + +For machine-readable diagnostics, use `--report`: + +```bash +mitm2openapi discover \ + -i capture.flow \ + -o templates.yaml \ + -p "https://api.example.com" \ + --report report.json +``` + +See [processing reports](../usage/reports.md) for the full JSON schema. 
+ +### Event categories in reports + +| Category | Examples | +|----------|---------| +| `parse_error` | Tnetstring corruption, HAR JSON syntax errors | +| `cap_fired` | Body too large, depth exceeded, form field count exceeded | +| `rejected` | Invalid scheme, non-UTF-8 identity fields, bad port/status | + +### Using reports in CI + +```bash +# Fail if any parse errors occurred +if jq -e '.events.parse_error | length > 0' report.json > /dev/null 2>&1; then + echo "Parse errors detected" + exit 1 +fi + +# Check flows-read vs flows-emitted ratio +RATIO=$(jq '.result.flows_emitted / .result.flows_read' report.json) +if (( $(echo "$RATIO < 0.9" | bc -l) )); then + echo "Warning: more than 10% of flows were dropped" +fi +``` + +## Strict mode interaction + +With `--strict`, any warning-level event causes exit code 2. This converts the +"informational" diagnostics above into hard failures: + +```bash +mitm2openapi discover \ + -i capture.flow \ + -o templates.yaml \ + -p "https://api.example.com" \ + --strict \ + --report report.json + +# Exit code 2 if ANY warning was emitted +# report.json still written for post-mortem +``` + +See [strict mode](../usage/strict-mode.md) for details. diff --git a/book/src/reference/security.md b/book/src/reference/security.md new file mode 100644 index 0000000..7417ef4 --- /dev/null +++ b/book/src/reference/security.md @@ -0,0 +1,96 @@ +# Security model + +<!-- toc --> + +`mitm2openapi` processes untrusted binary input (traffic captures from unknown sources). +The security model is designed to prevent denial-of-service, data corruption, and +information leakage when handling adversarial input. + +## Threat model + +The primary threat is a **malicious capture file** — a `.flow` or `.har` file crafted to +exploit the parser. 
Scenarios include: + +- CI pipelines processing captures from untrusted contributors +- Shared analysis servers where multiple users submit captures +- Automated pipelines where the capture source is not fully controlled + +## Input validation layers + +### File-level checks + +Before reading any content: + +1. **File type** — only regular files are accepted. Symlinks, FIFOs, device files, and + directories are rejected unless `--allow-symlinks` is explicitly set. +2. **File size** — files exceeding `--max-input-size` (default 2 GiB) are rejected before + any bytes are read. +3. **TOCTOU caveat** — file metadata is checked via the path before reading to reject + symlinks, non-regular files, and oversized inputs. There is a small TOCTOU window + between the metadata check and the file open; mitigation via fd-based recheck after + open is a future enhancement. + +### Parser-level caps + +During parsing: + +| Cap | Default | Purpose | +|-----|---------|---------| +| Payload size | 256 MiB | Prevents OOM from oversized tnetstring values | +| Nesting depth | 256 | Prevents stack overflow from deeply nested structures | +| JSON depth | 64 | Prevents stack overflow in schema inference | +| Body size | 64 MiB | Limits memory for individual request/response bodies | + +These caps trigger `warn`-level events and skip the affected data. Use `--strict` to +treat them as hard errors. + +### Field-level validation + +For every flow: + +- **Scheme whitelist** — only `http` and `https` are accepted. Other schemes (e.g., + `javascript:`, `data:`) are silently skipped. +- **UTF-8 strictness** — identity fields (method, scheme, host, path, header names) must be + valid UTF-8. Invalid bytes cause the flow to be skipped, preventing data aliasing through + replacement-character collisions. +- **Port range** — port numbers must be 1--65,535. Out-of-range values drop the request. +- **Status code range** — HTTP status codes must be 100--599. 
+- **Control character stripping** — `0x00`--`0x1F` and `0x7F` in URL paths are removed. +- **Header caps** — header names over 8 KiB are dropped; values over 64 KiB are truncated. +- **Form field count** — at most 1,000 form fields per request are processed. + +### Output safety + +- **Atomic writes** — output files are written via a temporary file and renamed. If the write + fails (disk full, permission denied), the target path is left untouched. +- **No resync on corruption** — when the tnetstring parser encounters corrupt data, it halts + immediately. It does not scan forward looking for the next valid frame, because binary + payloads can contain bytes that look like valid length prefixes. + +## Streaming architecture + +Both mitmproxy and HAR inputs are processed incrementally. At no point is the entire capture +loaded into memory. This bounds peak RSS to the size of the largest single flow, regardless +of total file size. + +## Glob pattern safety + +The `--exclude-patterns` and `--include-patterns` flags use the +[globset](https://docs.rs/globset) crate, which compiles patterns into a DFA. This eliminates +exponential backtracking that was possible with the original recursive glob matcher. + +## Recommendations + +For processing untrusted captures: + +1. Do not use `--allow-symlinks` unless you control the filesystem +2. Keep `--max-input-size` at the default (2 GiB) or lower +3. Run with `--strict` to fail fast on any anomaly +4. Use `--report` to capture processing diagnostics for audit trails +5. 
Run in a sandboxed environment (container, VM) when processing captures from unknown sources + +## Related + +- [Resource limits](../usage/resource-limits.md) — configuring the caps +- [Strict mode](../usage/strict-mode.md) — CI enforcement +- [Diagnostics](./diagnostics.md) — interpreting warnings and errors diff --git a/book/src/usage/cli-reference.md b/book/src/usage/cli-reference.md new file mode 100644 index 0000000..5b1ca06 --- /dev/null +++ b/book/src/usage/cli-reference.md @@ -0,0 +1,120 @@ +# CLI reference + +<!-- toc --> + +```admonish warning +This reference was last synced with `mitm2openapi --help` at version 0.5.1. +If you notice a flag missing from your local `--help` output, the tool may be ahead of these +docs. [Open an issue](https://github.com/Arkptz/mitm2openapi/issues/new) to prompt an update. +``` + +## `mitm2openapi discover` + +Scan captured traffic and produce a templates file listing all observed endpoints. + +``` +mitm2openapi discover [OPTIONS] -i <INPUT> -o <OUTPUT> -p <PREFIX> +``` + +### Required arguments + +| Option | Description | +|--------|-------------| +| `-i, --input <PATH>` | Input file (flow dump or HAR) | +| `-o, --output <PATH>` | Output YAML templates file | +| `-p, --prefix <URL>` | API prefix URL to filter requests | + +### Optional arguments + +| Option | Default | Description | +|--------|---------|-------------| +| `--format <FORMAT>` | `auto` | Input format: `auto`, `har`, `mitmproxy` | +| `--exclude-patterns <GLOBS>` | | Comma-separated globs; matching paths are dropped entirely | +| `--include-patterns <GLOBS>` | | Comma-separated globs; matching paths are auto-activated | +| `--max-input-size <BYTES>` | `2GiB` | Maximum input file size. 
Accepts `KiB`, `MiB`, `GiB` suffixes | +| `--allow-symlinks` | off | Allow symlinked input files | +| `--strict` | off | Treat warnings as errors (exit code 2) | +| `--report <PATH>` | | Write structured JSON processing report | + +## `mitm2openapi generate` + +Generate an OpenAPI 3.0 spec from captured traffic using a curated templates file. + +``` +mitm2openapi generate [OPTIONS] -i <INPUT> -t <TEMPLATES> -o <OUTPUT> -p <PREFIX> +``` + +### Required arguments + +| Option | Description | +|--------|-------------| +| `-i, --input <PATH>` | Input file (flow dump or HAR) | +| `-t, --templates <PATH>` | Templates YAML file (from `discover`) | +| `-o, --output <PATH>` | Output OpenAPI YAML file | +| `-p, --prefix <URL>` | API prefix URL | + +### Optional arguments + +| Option | Default | Description | +|--------|---------|-------------| +| `--format <FORMAT>` | `auto` | Input format: `auto`, `har`, `mitmproxy` | +| `--openapi-title <TITLE>` | | Custom title for the spec | +| `--openapi-version <VER>` | `1.0.0` | Custom spec version | +| `--exclude-headers <LIST>` | | Comma-separated headers to exclude from spec | +| `--exclude-cookies <LIST>` | | Comma-separated cookies to exclude from spec | +| `--include-headers` | off | Include request headers in the spec | +| `--ignore-images` | off | Ignore image content types | +| `--suppress-params` | off | Suppress parameter suggestions | +| `--tags-overrides <JSON>` | | JSON string for tag overrides | +| `--max-input-size <BYTES>` | `2GiB` | Maximum input file size | +| `--max-payload-size <BYTES>` | `256MiB` | Maximum tnetstring payload size | +| `--max-depth <N>` | `256` | Maximum tnetstring nesting depth | +| `--max-body-size <BYTES>` | `64MiB` | Maximum request/response body size | +| `--allow-symlinks` | off | Allow symlinked input files | +| `--strict` | off | Treat warnings as errors (exit code 2) | +| `--report <PATH>` | | Write structured JSON processing report | + +## Common flag details + +### `--format` + +By 
default, the input format is auto-detected from a combination of file extension and +content sniffing: +- `.flow` extension or content starting with a tnetstring length prefix → mitmproxy format +- `.har` extension or content starting with `{` → HAR format + +Use `--format mitmproxy` or `--format har` to override auto-detection. + +### `--prefix` + +The prefix URL filters which requests are processed. Only requests whose URL starts with +the prefix are included. The prefix is stripped from paths in the generated spec. + +Example: with `--prefix https://api.example.com`, a request to +`https://api.example.com/users/42` produces path `/users/42` in the spec. + +### `--strict` + +See [strict mode](./strict-mode.md) for details on exit codes and CI usage. + +### `--report` + +See [processing reports](./reports.md) for the JSON schema and CI integration examples. + +## Exit codes + +| Code | Meaning | +|------|---------| +| 0 | Success | +| 1 | Fatal error (I/O failure, missing arguments, invalid input) | +| 2 | Strict mode violation (warnings with `--strict` enabled) | + +## Environment variables + +| Variable | Description | +|----------|-------------| +| `RUST_LOG` | Controls log verbosity. Default: `warn`. Set to `info` or `debug` for more output. | + +```bash +RUST_LOG=info mitm2openapi discover -i capture.flow -o templates.yaml -p "https://api.example.com" +``` diff --git a/book/src/usage/filtering.md b/book/src/usage/filtering.md new file mode 100644 index 0000000..266b76e --- /dev/null +++ b/book/src/usage/filtering.md @@ -0,0 +1,96 @@ +# Filtering endpoints + +<!-- toc --> + +The `discover` command supports glob-based filters to automate endpoint curation. +This is useful for CI pipelines or large captures where manual editing is impractical. 
+ +## Glob syntax + +Filters use git-style glob patterns (powered by the [`globset`](https://docs.rs/globset) crate): + +| Pattern | Matches | Does not match | +|---------|---------|----------------| +| `*` | Single path segment | Segments with `/` | +| `**` | Any number of path segments | (matches everything) | +| `?` | Any single character | | +| `[abc]` | Character class | | +| `{a,b}` | Alternation | | + +## `--exclude-patterns` + +Paths matching any exclude glob are **dropped entirely** — they do not appear in the +templates file at all. + +```bash +mitm2openapi discover \ + -i capture.flow \ + -o templates.yaml \ + -p "https://api.example.com" \ + --exclude-patterns '/static/**,/images/**,*.css,*.js,*.svg,*.png' +``` + +Multiple patterns are comma-separated. A path is excluded if it matches **any** pattern. + +## `--include-patterns` + +Paths matching any include glob are emitted **without the `ignore:` prefix** — they are +auto-activated for the `generate` step. + +```bash +mitm2openapi discover \ + -i capture.flow \ + -o templates.yaml \ + -p "https://api.example.com" \ + --include-patterns '/api/**,/v2/**' +``` + +## Combining filters + +When both are specified: + +1. **Exclude runs first** — matching paths are dropped entirely +2. **Include runs second** — matching paths among the survivors are auto-activated +3. 
**Everything else** gets the `ignore:` prefix for manual review + +```bash +mitm2openapi discover \ + -i capture.flow \ + -o templates.yaml \ + -p "https://api.example.com" \ + --exclude-patterns '/static/**,*.css,*.js' \ + --include-patterns '/api/**' +``` + +Result: +- `/static/bundle.js` — excluded (dropped) +- `/api/users` — included (auto-activated) +- `/dashboard` — neither matched (gets `ignore:` prefix) + +## Examples + +### API-only spec + +```bash +--include-patterns '/api/**' \ +--exclude-patterns '/api/internal/**,/api/debug/**' +``` + +### Strip static assets + +```bash +--exclude-patterns '/static/**,/assets/**,*.css,*.js,*.svg,*.png,*.jpg,*.gif,*.ico,*.woff,*.woff2' +``` + +### Multiple API versions + +```bash +--include-patterns '/v1/**,/v2/**,/v3/**' +``` + +## Pattern tips + +- Patterns match against the **URL path only** (after the prefix is stripped), not the full URL +- Leading `/` is recommended for clarity but not required +- Patterns are case-sensitive +- Use `**` sparingly — it matches everything, including deeply nested paths diff --git a/book/src/usage/pipeline.md b/book/src/usage/pipeline.md new file mode 100644 index 0000000..a50b0bd --- /dev/null +++ b/book/src/usage/pipeline.md @@ -0,0 +1,200 @@ +# Discover, curate, generate + +<!-- toc --> + +`mitm2openapi` uses a three-step pipeline to convert captured HTTP traffic into an OpenAPI +specification. This chapter explains each step in detail. + +## Overview + +```mermaid +graph LR + A[Traffic capture] --> B[discover] + B --> C[Templates file] + C --> D[Curate] + D --> E[generate] + E --> F[OpenAPI 3.0 spec] +``` + +The pipeline separates **endpoint discovery** from **spec generation**, giving you an explicit +curation step where you choose which endpoints appear in the final spec. + +## Step 1: Discover + +The `discover` command scans a traffic capture and extracts all unique URL paths that match +a given prefix. 
+ +```bash +mitm2openapi discover \ + -i capture.flow \ + -o templates.yaml \ + -p "https://api.example.com" +``` + +### What happens internally + +1. The input file is read incrementally (streaming — memory usage stays bounded) +2. Each request's URL is checked against the `--prefix` filter +3. Matching paths are collected and deduplicated +4. Path segments that look like IDs (UUIDs, numeric strings) are replaced with + `{id}` placeholders (or `{id1}`, `{id2}`, ... when a path has multiple parameters) +5. The result is written to the templates file + +### Templates file format + +The output is a YAML file with path templates under an `x-path-templates` key: + +```yaml +x-path-templates: +- ignore:/api/users +- ignore:/api/users/{id} +- ignore:/api/products +- ignore:/api/products/{id}/reviews +- ignore:/static/bundle.js +``` + +Every path is prefixed with `ignore:` by default. This is intentional — it forces you to +explicitly opt in to each endpoint. + +### Automatic parameterization + +The discover step detects path segments that vary across requests and replaces them with +named parameters: + +| Observed paths | Template | +|---|---| +| `/api/users/42`, `/api/users/99` | `/api/users/{id}` | +| `/api/orders/abc-def-123` | `/api/orders/{id}` | + +UUID-like and numeric segments are detected automatically. More complex patterns require +manual editing of the templates file. + +## Step 2: Curate + +Open the templates file in any text editor. For each path: + +- **Remove `ignore:`** to include the endpoint in the generated spec +- **Leave `ignore:`** to exclude it +- **Delete the line** to exclude it permanently + +```yaml +# Before curation +x-path-templates: +- ignore:/api/users +- ignore:/api/users/{id} +- ignore:/static/bundle.js + +# After curation +x-path-templates: +- /api/users +- /api/users/{id} +- ignore:/static/bundle.js +``` + +You can also edit parameter names. 
The default `{id}` placeholder can be renamed to +something more descriptive like `{userId}`: + +```yaml +- /api/users/{userId} +``` + +### Automating curation with glob filters + +For CI pipelines or large captures, manual curation is impractical. Use `--include-patterns` +and `--exclude-patterns` during the `discover` step instead: + +```bash +mitm2openapi discover \ + -i capture.flow \ + -o templates.yaml \ + -p "https://api.example.com" \ + --include-patterns '/api/**' \ + --exclude-patterns '/static/**,*.css,*.js' +``` + +Paths matching `--include-patterns` are emitted without the `ignore:` prefix (auto-activated). +Paths matching `--exclude-patterns` are dropped entirely. Everything else gets `ignore:` for +manual review. + +See [filtering endpoints](./filtering.md) for the full glob syntax. + +## Step 3: Generate + +The `generate` command re-reads the traffic capture and produces an OpenAPI spec using the +curated templates as a guide: + +```bash +mitm2openapi generate \ + -i capture.flow \ + -t templates.yaml \ + -o openapi.yaml \ + -p "https://api.example.com" +``` + +### What happens internally + +1. The templates file is loaded and the `ignore:` entries are filtered out +2. Each template path is compiled into a regex for matching +3. The traffic capture is streamed again, matching each request against the templates +4. For each matched request: + - Path parameters are extracted + - Query parameters are collected + - Request body schema is inferred (JSON, form data) + - Response status code and body schema are recorded +5. When multiple requests match the same template, their schemas are merged: + - Different status codes (200, 400, 404) produce separate response entries + - Request body is taken from the first observation; subsequent same-endpoint + observations only contribute response schemas +6. 
The final OpenAPI 3.0 document is written as YAML + +### Customizing output + +The `generate` command accepts several options to tune the output: + +```bash +mitm2openapi generate \ + -i capture.flow \ + -t templates.yaml \ + -o openapi.yaml \ + -p "https://api.example.com" \ + --openapi-title "My API" \ + --openapi-version "2.0.0" \ + --exclude-headers "authorization,cookie" \ + --ignore-images +``` + +See the [CLI reference](./cli-reference.md) for all available options. + +## Worked example + +Starting from a mitmproxy capture of a pet store API: + +```bash +# Discover all endpoints under the API prefix +mitm2openapi discover \ + -i petstore.flow \ + -o templates.yaml \ + -p "http://petstore:8080" \ + --exclude-patterns '/static/**' \ + --include-patterns '/api/**' + +# Templates file now has API paths auto-activated: +# - /api/v3/pet +# - /api/v3/pet/{id} +# - /api/v3/pet/findByStatus +# - /api/v3/store/inventory +# - ignore:/static/swagger-ui.css + +# Generate the spec +mitm2openapi generate \ + -i petstore.flow \ + -t templates.yaml \ + -o openapi.yaml \ + -p "http://petstore:8080" + +# Result: openapi.yaml with paths, methods, schemas +``` + +The generated `openapi.yaml` is a valid OpenAPI 3.0 document that can be opened in +[Swagger UI](https://github.com/swagger-api/swagger-ui), imported into Postman, or used +as a contract for API testing. diff --git a/book/src/usage/reports.md b/book/src/usage/reports.md new file mode 100644 index 0000000..15a3bab --- /dev/null +++ b/book/src/usage/reports.md @@ -0,0 +1,98 @@ +# Processing reports + +Pass `--report <PATH>` to either `discover` or `generate` to write a JSON processing +summary. This is useful for CI pipelines that need structured data instead of log scraping. 
+ +## Usage + +```bash +mitm2openapi discover \ + -i capture.flow \ + -o templates.yaml \ + -p "https://api.example.com" \ + --report report.json +``` + +## Report schema + +```json +{ + "report_version": 1, + "tool_version": "0.5.1", + "input": { + "path": "capture.flow", + "format": "Auto", + "size_bytes": 102400 + }, + "result": { + "flows_read": 150, + "flows_emitted": 148, + "paths_in_spec": 12 + }, + "events": { + "parse_error": { + "TNetString parse error at byte 98304: unexpected end of input": 1 + } + } +} +``` + +### Fields + +| Field | Type | Description | +|-------|------|-------------| +| `report_version` | integer | Schema version (currently `1`) | +| `tool_version` | string | `mitm2openapi` version that produced the report | +| `input.path` | string | Input file path | +| `input.format` | string | Detected or specified format (`Auto`, `Mitmproxy`, `Har`) | +| `input.size_bytes` | integer | Input file size in bytes | +| `result.flows_read` | integer | Total flows/entries parsed from input | +| `result.flows_emitted` | integer | Flows that passed all filters and were processed | +| `result.paths_in_spec` | integer | Unique paths in the output (for `generate`) | +| `events` | object | Map of event categories to message counts | + +### Event categories + +| Category | Meaning | Status | +|----------|---------|--------| +| `parse_error` | Corrupt data encountered (tnetstring errors, malformed HAR entries) | Populated | +| `cap_fired` | A resource limit was triggered (body too large, depth exceeded) | Reserved — not yet populated at runtime | +| `rejected` | A flow was skipped (invalid UTF-8, unsupported scheme, bad port/status) | Reserved — not yet populated at runtime | + +The `cap_fired` and `rejected` categories are present in the report schema and will be +connected to the reader pipelines in a future release. Currently, only `parse_error` +events are counted. 
+ +## CI integration + +Parse the report in CI to make decisions based on processing quality: + +```bash +mitm2openapi generate \ + -i capture.flow \ + -t templates.yaml \ + -o openapi.yaml \ + -p "https://api.example.com" \ + --report report.json + +# Check if any events occurred +if jq -e '.events | length > 0' report.json > /dev/null 2>&1; then + echo "Warning: processing had events" + jq '.events' report.json +fi +``` + +## Report with strict mode + +The report is written even when `--strict` causes a non-zero exit code. This lets you +capture full diagnostics while still failing the CI job: + +```bash +mitm2openapi discover \ + -i capture.flow \ + -o templates.yaml \ + -p "https://api.example.com" \ + --strict \ + --report report.json \ + || { jq '.' report.json; exit 1; } +``` diff --git a/book/src/usage/resource-limits.md b/book/src/usage/resource-limits.md new file mode 100644 index 0000000..e11985e --- /dev/null +++ b/book/src/usage/resource-limits.md @@ -0,0 +1,91 @@ +# Resource limits + +<!-- toc --> + +To prevent denial-of-service when processing untrusted captures, `mitm2openapi` enforces +several configurable and fixed limits. 
+ +## Configurable limits + +These limits can be adjusted via CLI flags: + +| Flag | Default | Purpose | +|------|---------|---------| +| `--max-input-size` | 2 GiB | Reject files larger than this before reading | +| `--max-payload-size` | 256 MiB | Cap on individual tnetstring payload allocation | +| `--max-depth` | 256 | Recursion depth limit for nested tnetstring structures | +| `--max-body-size` | 64 MiB | Maximum request/response body considered during schema inference | +| `--allow-symlinks` | off | By default, symlinked inputs are rejected | + +### Adjusting limits + +Increase `--max-input-size` if you work with captures larger than 2 GiB: + +```bash +mitm2openapi discover \ + -i large-capture.flow \ + -o templates.yaml \ + -p "https://api.example.com" \ + --max-input-size 8GiB +``` + +Size suffixes are supported: `KiB`, `MiB`, `GiB`. + +The other limits rarely need tuning. The defaults are designed to handle real-world +captures while rejecting pathological inputs. + +### Symlink rejection + +By default, symlinked input files are rejected to prevent path-traversal attacks on shared +CI runners. 
If you need to process a symlinked file: + +```bash +mitm2openapi discover \ + -i /path/to/symlinked-capture.flow \ + -o templates.yaml \ + -p "https://api.example.com" \ + --allow-symlinks +``` + +## Fixed per-field limits + +These limits are applied unconditionally and cannot be changed via CLI flags: + +| Field | Cap | Behaviour when exceeded | +|-------|-----|------------------------| +| Header name | 8 KiB | Header dropped (other headers still processed) | +| Header value | 64 KiB | Value truncated to cap | +| Form fields per request | 1,000 | Excess fields ignored | +| URL scheme | `http` / `https` only | Non-HTTP flows silently skipped | +| Port number | 1 -- 65,535 | Out-of-range port drops the request | +| HTTP status code | 100 -- 599 | Invalid codes treated as no response | + +## UTF-8 validation + +Identity fields (scheme, host, path, method, header names) require valid UTF-8. Flows +with non-UTF-8 identity bytes are skipped to prevent data aliasing through +replacement-character collisions. + +Control characters (`0x00`--`0x1F`, `0x7F`) in paths are stripped automatically. + +## Streaming and memory + +Both mitmproxy flow files and HAR files are processed incrementally. Memory usage stays +bounded regardless of input size — there is no need to load the entire capture into memory. + +Peak RSS is proportional to the size of the **largest single flow** in the capture, not the +total file size. For typical captures, expect 5--15 MB of memory usage. + +## When limits fire + +When a per-field limit is exceeded (header too large, body too large, form fields over cap), +the affected field is skipped or truncated and processing continues with the remaining data. + +When a tnetstring parse error occurs, the iterator halts and the rest of the file is not +processed — valid flows parsed before the error are still emitted. There is no resync +because binary payloads can contain bytes that mimic valid length prefixes. 
+ +In both cases a `warn`-level log message is emitted with details. + +Use [strict mode](./strict-mode.md) to treat these warnings as errors, or +[processing reports](./reports.md) to capture them as structured data. diff --git a/book/src/usage/strict-mode.md b/book/src/usage/strict-mode.md new file mode 100644 index 0000000..475c3fd --- /dev/null +++ b/book/src/usage/strict-mode.md @@ -0,0 +1,79 @@ +# Strict mode + +Pass `--strict` to either `discover` or `generate` to treat warning-level events as +hard failures. The process exits with code **2** if the processing report records any +counted events. + +Currently, the only event counter populated at runtime is `parse_error` — triggered when +flows cannot be deserialized (corrupt tnetstring data, malformed HAR JSON). The +`cap_fired` and `rejected` counters exist in the report schema but are not yet wired to +the reader pipelines; they will be connected in a future release. + +In practice, `--strict` today catches: + +- Parse errors during flow deserialization (tnetstring or HAR) +- Errors counted by the streaming iterator wrapper in `discover` mode + +## Usage + +```bash +mitm2openapi discover \ + -i capture.flow \ + -o templates.yaml \ + -p "https://api.example.com" \ + --strict +``` + +```bash +mitm2openapi generate \ + -i capture.flow \ + -t templates.yaml \ + -o openapi.yaml \ + -p "https://api.example.com" \ + --strict +``` + +## CI usage pattern + +Strict mode is designed for CI gates where silent degradation is unacceptable: + +```bash +mitm2openapi discover \ + -i capture.flow \ + -o templates.yaml \ + -p "https://api.example.com" \ + --strict \ + || { echo "FAIL: corrupt or over-limit flows detected"; exit 1; } +``` + +## Without `--strict` + +Without the flag, parse errors are logged at `warn` level and processing continues with +exit code 0. Affected flows are skipped, but the output file is still produced. Other +warning-level events (cap fires, scheme rejections, etc.) 
are always logged but do not +currently increment the report counters that `--strict` checks. + +## Exit codes + +| Code | Meaning | +|------|---------| +| 0 | Success (no warnings, or `--strict` not set) | +| 1 | Fatal error (I/O failure, missing required arguments) | +| 2 | Strict mode violation (warnings detected with `--strict`) | + +## Combining with reports + +For CI pipelines that need both strict enforcement and structured diagnostics: + +```bash +mitm2openapi generate \ + -i capture.flow \ + -t templates.yaml \ + -o openapi.yaml \ + -p "https://api.example.com" \ + --strict \ + --report report.json +``` + +The [report](./reports.md) is written even when `--strict` causes a non-zero exit, capturing +the full details of what went wrong. diff --git a/docs/benchmarks.md b/docs/benchmarks.md index 1a53a08..7830a65 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -1,8 +1,6 @@ -# Benchmarks - Generated by the [benchmark workflow](.github/workflows/bench.yml). -# Benchmark results +## Benchmark results _Run: 2026-04-22 22:31 UTC, commit `22ef2faa`, runner: Linux 6.17.0-1011-azure_ diff --git a/src/har_reader.rs b/src/har_reader.rs index 64d5f30..980209f 100644 --- a/src/har_reader.rs +++ b/src/har_reader.rs @@ -10,6 +10,9 @@ use crate::error::{Error, Result}; use crate::types::CapturedRequest; use crate::MAX_BODY_SIZE; +const MAX_HEADER_NAME_SIZE: usize = 8 * 1024; +const MAX_HEADER_VALUE_SIZE: usize = 64 * 1024; + #[derive(Deserialize)] struct StreamingHarEntry { request: StreamingHarRequest, @@ -112,27 +115,50 @@ impl HarFlowWrapper { Some(Self { url: entry.request.url, method: entry.request.method, - request_headers: entry - .request - .headers - .into_iter() - .map(|h| (h.name, h.value)) - .collect(), + request_headers: cap_headers(entry.request.headers), request_body, response_status, response_reason: entry.response.status_text, - response_headers: entry - .response - .headers - .into_iter() - .map(|h| (h.name, h.value)) - .collect(), + 
response_headers: cap_headers(entry.response.headers), response_body, response_content_type, }) } } +fn cap_headers(headers: Vec<StreamingHarHeader>) -> Vec<(String, String)> { + headers + .into_iter() + .filter_map(|h| { + if h.name.len() > MAX_HEADER_NAME_SIZE { + warn!( + event = "header_name_too_large", + size = h.name.len(), + max = MAX_HEADER_NAME_SIZE, + "dropping HAR header with oversized name" + ); + return None; + } + let value = if h.value.len() > MAX_HEADER_VALUE_SIZE { + warn!( + event = "header_value_too_large", + size = h.value.len(), + max = MAX_HEADER_VALUE_SIZE, + name = %h.name, + "truncating oversized HAR header value" + ); + h.value + .get(..MAX_HEADER_VALUE_SIZE) + .unwrap_or(&h.value) + .to_string() + } else { + h.value + }; + Some((h.name, value)) + }) + .collect() +} + fn cap_body(body: Vec<u8>) -> Vec<u8> { if body.len() > MAX_BODY_SIZE { warn!( @@ -432,6 +458,14 @@ pub fn stream_har_file(path: &Path) -> Result<RequestIter> { } fn stream_har_dir(path: &Path) -> Result<RequestIter> { + stream_har_dir_inner(path, false) +} + +pub fn stream_har_dir_no_symlinks(path: &Path) -> Result<RequestIter> { + stream_har_dir_inner(path, true) +} + +fn stream_har_dir_inner(path: &Path, reject_symlinks: bool) -> Result<RequestIter> { let mut dir_entries: Vec<_> = std::fs::read_dir(path)? 
.filter_map(|e| match e { Ok(entry) => Some(entry), @@ -449,6 +483,23 @@ fn stream_har_dir(path: &Path) -> Result<RequestIter> { .extension() .is_some_and(|ext| ext.eq_ignore_ascii_case("har")) }) + .filter(|e| { + if reject_symlinks { + match e.path().symlink_metadata() { + Ok(meta) if meta.file_type().is_symlink() => { + warn!( + event = "symlink_rejected", + path = %e.path().display(), + "skipping symlinked HAR directory entry" + ); + false + } + _ => true, + } + } else { + true + } + }) .collect(); dir_entries.sort_by_key(|e| e.path()); diff --git a/src/main.rs b/src/main.rs index d4e80f1..a607332 100644 --- a/src/main.rs +++ b/src/main.rs @@ -274,16 +274,32 @@ fn stream_input( max_input_size: u64, allow_symlinks: bool, ) -> Result<RequestIter> { + // Check symlink-ness before is_dir(), since is_dir() follows symlinks. + if !allow_symlinks { + if let Ok(meta) = path.symlink_metadata() { + if meta.file_type().is_symlink() { + return Err(mitm2openapi::error::Error::SymlinkRejected { + path: path.to_path_buf(), + } + .into()); + } + } + } if !path.is_dir() { mitm2openapi::validate_input_path(path, max_input_size, allow_symlinks) .context("input file validation failed")?; } + let reject_symlinks = !allow_symlinks; match format { InputFormat::Mitmproxy => { debug!(path = %path.display(), "Streaming as mitmproxy format"); if path.is_dir() { - mitmproxy_reader::stream_mitmproxy_dir(path) - .context("failed to stream mitmproxy directory") + if reject_symlinks { + mitmproxy_reader::stream_mitmproxy_dir_no_symlinks(path) + } else { + mitmproxy_reader::stream_mitmproxy_dir(path) + } + .context("failed to stream mitmproxy directory") } else { let iter = mitmproxy_reader::stream_mitmproxy_file(path) .context("failed to stream mitmproxy file")?; @@ -298,8 +314,16 @@ fn stream_input( InputFormat::Auto => { if path.is_dir() { debug!(path = %path.display(), "Auto-detecting format for directory"); - let mitmproxy_result = mitmproxy_reader::stream_mitmproxy_dir(path); - let 
har_result = har_reader::stream_har_file(path); + let mitmproxy_result = if reject_symlinks { + mitmproxy_reader::stream_mitmproxy_dir_no_symlinks(path) + } else { + mitmproxy_reader::stream_mitmproxy_dir(path) + }; + let har_result = if reject_symlinks { + har_reader::stream_har_dir_no_symlinks(path) + } else { + har_reader::stream_har_file(path) + }; match (mitmproxy_result, har_result) { (Ok(m_iter), Ok(h_iter)) => { diff --git a/src/mitmproxy_reader.rs b/src/mitmproxy_reader.rs index 7076e67..c0cc7d6 100644 --- a/src/mitmproxy_reader.rs +++ b/src/mitmproxy_reader.rs @@ -361,6 +361,14 @@ pub fn stream_mitmproxy_file( } pub fn stream_mitmproxy_dir(path: &Path) -> Result<RequestIter> { + stream_mitmproxy_dir_inner(path, false) +} + +pub fn stream_mitmproxy_dir_no_symlinks(path: &Path) -> Result<RequestIter> { + stream_mitmproxy_dir_inner(path, true) +} + +fn stream_mitmproxy_dir_inner(path: &Path, reject_symlinks: bool) -> Result<RequestIter> { let mut entries: Vec<_> = std::fs::read_dir(path)? 
.filter_map(|e| match e { Ok(entry) => Some(entry), @@ -378,6 +386,23 @@ pub fn stream_mitmproxy_dir(path: &Path) -> Result<RequestIter> { .extension() .is_some_and(|ext| ext.eq_ignore_ascii_case("flow")) }) + .filter(|e| { + if reject_symlinks { + match e.path().symlink_metadata() { + Ok(meta) if meta.file_type().is_symlink() => { + warn!( + event = "symlink_rejected", + path = %e.path().display(), + "skipping symlinked directory entry" + ); + false + } + _ => true, + } + } else { + true + } + }) .collect(); entries.sort_by_key(|e| e.path()); diff --git a/tests/security.rs b/tests/security.rs index 1d9248c..a897b19 100644 --- a/tests/security.rs +++ b/tests/security.rs @@ -79,3 +79,87 @@ fn normal_file_passes_validation() { let result = mitm2openapi::validate_input_path(&path, mitm2openapi::MAX_INPUT_SIZE, false); assert!(result.is_ok(), "normal file should pass: {result:?}"); } + +#[cfg(unix)] +#[test] +fn symlink_to_directory_rejected() { + use std::os::unix::fs as unix_fs; + + let dir = TempDir::new().unwrap(); + let real_dir = dir.path().join("real_dir"); + std::fs::create_dir(&real_dir).unwrap(); + std::fs::write(real_dir.join("test.flow"), b"1:X,").unwrap(); + + let link = dir.path().join("link_dir"); + unix_fs::symlink(&real_dir, &link).unwrap(); + + assert!(link.is_dir(), "symlink should resolve to directory"); + + let err = mitm2openapi::validate_input_path(&link, mitm2openapi::MAX_INPUT_SIZE, false); + assert!( + matches!(err, Err(mitm2openapi::error::Error::SymlinkRejected { .. 
})), + "symlink to directory should be rejected, got {err:?}" + ); +} + +#[cfg(unix)] +#[test] +fn symlink_dir_entry_rejected_in_mitmproxy() { + use std::os::unix::fs as unix_fs; + + let dir = TempDir::new().unwrap(); + let src = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("testdata") + .join("flows") + .join("simple_get.flow"); + let real_file = dir.path().join("real.flow"); + std::fs::copy(&src, &real_file).unwrap(); + + let link_file = dir.path().join("linked.flow"); + unix_fs::symlink(&real_file, &link_file).unwrap(); + + let iter = mitm2openapi::mitmproxy_reader::stream_mitmproxy_dir_no_symlinks(dir.path()); + assert!(iter.is_ok(), "should open directory"); + let results: Vec<_> = iter.unwrap().filter_map(|r| r.ok()).collect(); + + assert!( + !results.is_empty(), + "real file should produce at least one flow" + ); + + let all_results: Vec<_> = mitm2openapi::mitmproxy_reader::stream_mitmproxy_dir(dir.path()) + .unwrap() + .filter_map(|r| r.ok()) + .collect(); + assert!( + all_results.len() > results.len(), + "without symlink rejection, both files should be processed" + ); +} + +#[cfg(unix)] +#[test] +fn symlink_dir_entry_rejected_in_har() { + use std::os::unix::fs as unix_fs; + + let dir = TempDir::new().unwrap(); + let src = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("testdata") + .join("har") + .join("simple.har"); + let real_file = dir.path().join("real.har"); + std::fs::copy(&src, &real_file).unwrap(); + + let link_file = dir.path().join("linked.har"); + unix_fs::symlink(&real_file, &link_file).unwrap(); + + let iter = mitm2openapi::har_reader::stream_har_dir_no_symlinks(dir.path()); + assert!(iter.is_ok(), "should open directory"); + let results: Vec<_> = iter.unwrap().filter_map(|r| r.ok()).collect(); + + assert_eq!( + results.len(), + 1, + "only the real HAR file should be processed, symlinked entry skipped" + ); +}