Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,55 @@ jobs:
! grep -q '&lt;br&gt;' "$temp/out.html" || { echo 'FAIL: <br> was escaped instead of literal'; exit 1; }
echo 'HTML escaping regression check OK'
rm -rf "$temp"
- run: |
temp=$(mktemp -d)
cat > "$temp/in.md" << 'INEOF'
# Test

Safe text.

<script>alert(1)</script>
<iframe src="https://evil.com"></iframe>
<img src="x" onerror="alert(1)">
<a href="javascript:alert(1)">bad</a>
INEOF
python3 scripts/markdown_to_html.py "$temp/in.md" "$temp/out.html"
! grep -qi '<script\|<iframe\|onerror=' "$temp/out.html" || { echo 'FAIL: body HTML not sanitized (script/iframe/onerror)'; exit 1; }
! grep -qi 'javascript:' "$temp/out.html" || { echo 'FAIL: javascript: URL not sanitized'; exit 1; }
! grep -qi 'style=' "$temp/out.html" || { echo 'FAIL: inline style not removed'; exit 1; }
! grep -qi '<img' "$temp/out.html" || { echo 'FAIL: img tag not stripped'; exit 1; }
grep -q 'Safe text' "$temp/out.html" || { echo 'FAIL: safe content lost'; exit 1; }
echo 'HTML sanitization check OK'
rm -rf "$temp"
- run: |
temp=$(mktemp -d)
cat > "$temp/in.md" << 'INEOF'
# CSS/Style

<span style="background:url(javascript:alert(1))">css-url</span>
<div style="position:fixed;top:0">fixed</div>
<span style="color:red">safe style</span>
INEOF
python3 scripts/markdown_to_html.py "$temp/in.md" "$temp/out.html"
! grep -q 'style=' "$temp/out.html" || { echo 'FAIL: style attr not stripped'; exit 1; }
grep -q 'css-url' "$temp/out.html" || { echo 'FAIL: text content lost when style stripped'; exit 1; }
grep -q 'fixed' "$temp/out.html" || { echo 'FAIL: text content lost when style stripped'; exit 1; }
grep -q 'safe style' "$temp/out.html" || { echo 'FAIL: text content lost when style stripped'; exit 1; }
echo 'CSS style stripping check OK'
rm -rf "$temp"
- run: |
temp=$(mktemp -d)
cat > "$temp/in.md" << 'INEOF'
# Img src

<img src="https://evil.com/track.png" alt="tracker">
<img src="local.png" alt="local">
INEOF
python3 scripts/markdown_to_html.py "$temp/in.md" "$temp/out.html"
! grep -qi '<img' "$temp/out.html" || { echo 'FAIL: img tag not stripped'; exit 1; }
echo 'Img stripping check OK'
rm -rf "$temp"
- run: python3 scripts/test_remote_block.py
- run: |
input_dir="/tmp/smoke test path"
mkdir -p "$input_dir"
Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ This file is intentionally lightweight. Use concise entries that explain:
- `equipment selection / procurement / home-server planning` promoted to first-class route (routing priority #5), with provider-vs-equipment conflict rules and route-conflict examples
- route lists in `ARCHITECTURE.md`, `SYSTEM-MAP.md`, and `README.md` updated from six to seven mature routes
- `scripts/markdown_to_html.py`: hardened metadata HTML escaping and security model — `html.escape()` applied to all frontmatter-derived fields; `cover_meta` lines are escaped individually before joining with `<br>`
- `scripts/markdown_to_html.py`: added `nh3`-based body HTML sanitization — strips `<script>`, `<iframe>`, `<img>` tags, event handlers, `javascript:` URLs, and inline `style` attributes; only allowlisted tags and attributes (`class`, `id`, `href`, `title`, `colspan`, `rowspan`) are kept
- `scripts/render_pdf.py`: added `--allow-remote` flag (remote resources blocked by default); route glob fixed to cover all http/https URLs
- `scripts/md_to_pdf.py`: added `--allow-remote` passthrough (remote resources blocked by default)
- `requirements.txt`: added `nh3>=0.2`

### Added
- `references/mid-research-review.md`
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
markdown>=3.5
playwright>=1.40
nh3>=0.2
30 changes: 25 additions & 5 deletions scripts/markdown_to_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@

Security model: this script is designed for processing agent-authored
Deep Research reports. It escapes frontmatter-derived metadata fields
(title, cover_title, cover_subtitle, cover_meta) to prevent HTML injection.
Body HTML is produced by Python-Markdown with the 'extra' extension,
which preserves raw HTML in the markdown source. Do not pass untrusted
content to this script without sanitizing first.
(title, cover_title, cover_subtitle, cover_meta) and sanitizes body
HTML via nh3 to strip dangerous tags, event handlers, and javascript:
URLs. Only safe tags (p, div, table, a, code, pre, etc.) and
attributes (class, id, href, colspan, etc.) are allowed.
Inline style attributes, img tags, script, iframe, and event
handlers are removed. Remote HTTP/HTTPS resources are blocked by
default during PDF rendering (controlled via --allow-remote).

Usage:
python3 markdown_to_html.py <input.md> [output.html] [--title "Report Title"]
Expand Down Expand Up @@ -1214,6 +1217,22 @@ def is_bullet_placeholder(value):
return '\n'.join(repaired)


def sanitize_html(html_text):
    """Run body HTML through nh3 with a fixed allowlist and return the result.

    The allowlist passed to ``nh3.clean`` consists of:

    * a set of structural/text tags (headings, lists, tables, links, inline
      formatting, ``div``/``span``, definition lists);
    * ``class`` and ``id`` on any tag, ``href``/``title`` on ``<a>``, and
      ``colspan``/``rowspan`` on table cells;
    * the URL schemes ``http``, ``https`` and ``mailto``.

    Anything outside these lists (for example ``script``, ``iframe``, ``img``
    or ``style`` attributes) is not allowlisted and is left to nh3 to remove.
    """
    # Imported lazily so the dependency is only needed when sanitizing.
    import nh3

    allowed_tags = {
        # block-level text
        'p', 'br', 'hr', 'blockquote', 'pre', 'div',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        # lists
        'ul', 'ol', 'li', 'dl', 'dt', 'dd',
        # tables
        'table', 'thead', 'tbody', 'tr', 'th', 'td',
        # inline
        'a', 'strong', 'em', 'b', 'i', 'u', 's', 'code',
        'span', 'abbr', 'cite', 'wbr',
    }

    cell_span_attrs = ('colspan', 'rowspan')
    allowed_attributes = {
        '*': {'class', 'id'},
        'a': {'href', 'title'},
        'td': set(cell_span_attrs),
        'th': set(cell_span_attrs),
    }

    allowed_schemes = {'http', 'https', 'mailto'}

    return nh3.clean(
        html_text,
        tags=allowed_tags,
        attributes=allowed_attributes,
        url_schemes=allowed_schemes,
    )


def process_markdown(md_text):
"""Convert markdown to HTML using a real markdown parser, then post-process for report styling."""
import markdown
Expand All @@ -1228,7 +1247,8 @@ def process_markdown(md_text):
'nl2br',
]
html = markdown.markdown(md_text, extensions=extensions, output_format='html5')
return style_generated_html(html)
html = style_generated_html(html)
return sanitize_html(html)


# ─── Cover / Meta extraction ─────────────────────────────────────────────────
Expand Down
6 changes: 6 additions & 0 deletions scripts/md_to_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def main():
parser.add_argument('--margin-right', default='2.5cm')
parser.add_argument('--margin-bottom', default='2cm')
parser.add_argument('--margin-left', default='2.5cm')
parser.add_argument('--allow-remote', action='store_true', help='Allow HTTP/HTTPS resources during PDF rendering (default: blocked)')
args = parser.parse_args()

md_path = Path(args.input).resolve()
Expand Down Expand Up @@ -66,6 +67,11 @@ def main():
'--margin-bottom', args.margin_bottom,
'--margin-left', args.margin_left,
]
if not args.allow_remote:
# render_pdf.py blocks remote by default; explicitly pass --allow-remote to unblock
pass
else:
cmd.append('--allow-remote')
run(cmd, f"HTML → PDF: {pdf_path.name}")

print(f"\n✅ Complete: {pdf_path}")
Expand Down
8 changes: 7 additions & 1 deletion scripts/render_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ async def html_to_pdf(html_path, pdf_path=None, format="A4",
print_background=True, landscape=False,
prefer_css_page_size=True,
media="print",
title=None):
title=None,
block_remote=True):
"""
Render HTML file to PDF via Playwright Chromium.
"""
Expand All @@ -35,6 +36,9 @@ async def html_to_pdf(html_path, pdf_path=None, format="A4",
browser = await p.chromium.launch()
page = await browser.new_page()

if block_remote:
await page.route("**/*", lambda route: route.abort() if route.request.url.startswith(("http://", "https://")) else route.continue_())

file_url = f"file://{html_path}"
await page.goto(file_url, wait_until="networkidle")
await page.emulate_media(media=media)
Expand Down Expand Up @@ -79,6 +83,7 @@ async def html_to_pdf(html_path, pdf_path=None, format="A4",
parser.add_argument('--media', choices=['print', 'screen'], default='print', help='Emulated media type')
parser.add_argument('--no-bg', action='store_true', help='Disable background graphics')
parser.add_argument('--no-prefer-css-page-size', action='store_true', help='Ignore CSS @page size when rendering')
parser.add_argument('--allow-remote', action='store_true', help='Allow HTTP/HTTPS resource requests during PDF rendering (default: blocked)')
args = parser.parse_args()

asyncio.run(html_to_pdf(
Expand All @@ -94,4 +99,5 @@ async def html_to_pdf(html_path, pdf_path=None, format="A4",
media=args.media,
print_background=not args.no_bg,
title=args.title,
block_remote=not args.allow_remote,
))
56 changes: 56 additions & 0 deletions scripts/test_remote_block.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env python3
"""Verify that render_pdf.py blocks remote HTTP/S resources by default."""
import asyncio
import sys
import threading
import http.server
import pathlib
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.resolve()))
from render_pdf import html_to_pdf

# Request paths received by the local HTTP server, in arrival order.
hits = []


class Handler(http.server.SimpleHTTPRequestHandler):
    """HTTP handler that records every GET path in ``hits`` and answers 200 'OK'."""

    def log_message(self, *_ignored):
        """Discard the per-request log line instead of printing it."""
        return None

    def do_GET(self):
        """Record the requested path, then reply with an empty-header 200 and body b'OK'."""
        requested = self.path
        hits.append(requested)

        self.send_response(200)
        self.end_headers()

        payload = b'OK'
        self.wfile.write(payload)

def main():
    """Check that ``html_to_pdf`` blocks remote requests unless told otherwise.

    A throwaway HTTP server is started on a free 127.0.0.1 port and an HTML
    page referencing an image on that server is rendered twice: once with
    ``block_remote=True`` and once with ``block_remote=False``. The number of
    requests the server received (recorded in ``hits``) is compared for the
    two runs.

    Returns:
        0 when both expectations hold.

    Raises:
        AssertionError: if any request reached the server while blocking was
            enabled, or none did while it was disabled.
    """
    # Local import keeps this function self-contained; the module's
    # top-level imports do not include tempfile.
    import tempfile

    server = http.server.HTTPServer(('127.0.0.1', 0), Handler)
    port = server.server_address[1]
    thread = threading.Thread(target=server.serve_forever, daemon=True)
    thread.start()

    html = f'<!DOCTYPE html><html><body><img src="http://127.0.0.1:{port}/pixel.png"></body></html>'

    async def run(work_dir):
        html_path = work_dir / 'remote.html'
        html_path.write_text(html, encoding='utf-8')

        hits.clear()
        await html_to_pdf(str(html_path), str(work_dir / 'blocked.pdf'), block_remote=True)
        blocked = len(hits)

        hits.clear()
        await html_to_pdf(str(html_path), str(work_dir / 'allowed.pdf'), block_remote=False)
        allowed = len(hits)

        print(f'Blocked hits: {blocked}')
        print(f'Allowed hits: {allowed}')

        # Explicit raises (rather than assert statements) so the checks still
        # run when Python is started with -O.
        if blocked != 0:
            raise AssertionError(f'Expected 0 hits when blocked, got {blocked}')
        if allowed < 1:
            raise AssertionError(f'Expected >=1 hits when allowed, got {allowed}')
        print('PASS')

    try:
        # A unique, self-deleting directory avoids collisions between
        # concurrent runs and leaves nothing behind in /tmp.
        with tempfile.TemporaryDirectory(prefix='test_remote_block_') as scratch:
            asyncio.run(run(Path(scratch)))
    finally:
        # Stop the serve_forever loop and release the listening socket even
        # when a check above fails.
        server.shutdown()
        server.server_close()
    return 0

if __name__ == '__main__':
    # Hand main()'s return value to the interpreter as the process exit status.
    raise SystemExit(main())
Loading