diff --git a/.dockerignore b/.dockerignore
index 207bdc74..5d9cc87f 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -4,7 +4,7 @@ dist
.nuxt
.nuxt-*
-.output
+# .output # temporarily allowed for local runtime build
.gen
.yarn/cache # Yarn 缓存
yarn-error.log
diff --git a/Dockerfile.runtime b/Dockerfile.runtime
new file mode 100644
index 00000000..d0887e28
--- /dev/null
+++ b/Dockerfile.runtime
@@ -0,0 +1,20 @@
+# Runtime-only Dockerfile: uses pre-built .output from host
+FROM node:22-alpine
+
+LABEL maintainer="findsource@proton.me" \
+ description="wechat-article-exporter Docker Image (local build)"
+
+WORKDIR /app
+
+# Copy pre-built output from host
+COPY .output ./
+
+# Create KV storage directory and set permissions
+RUN mkdir -p .data/kv && chown -R node:node /app
+
+USER node
+EXPOSE 3000
+
+ENV NODE_ENV=production HOST=0.0.0.0 PORT=3000
+
+ENTRYPOINT ["node", "server/index.mjs"]
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 00000000..9889504b
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,33 @@
+services:
+ # Main Nuxt/Nitro application
+ web:
+ image: wechat-article-exporter-web:local
+ build:
+ context: .
+ dockerfile: Dockerfile.runtime
+ ports:
+ - "3000:3000"
+ environment:
+ - NODE_ENV=production
+ - HOST=0.0.0.0
+ - PORT=3000
+ - SCRAPLING_SERVICE_URL=http://scrapling:8100
+ volumes:
+ - ./data/kv:/app/.data/kv
+ - ./data/articles:/tmp/wechat-articles
+ depends_on:
+ - scrapling
+ restart: unless-stopped
+
+ # Scrapling Python service for article parsing
+ scrapling:
+ build:
+ context: ./scrapling-service
+ dockerfile: Dockerfile
+ ports:
+ - "8100:8100"
+ environment:
+ - PYTHONUNBUFFERED=1
+ volumes:
+ - ./data/articles:/tmp/wechat-articles
+ restart: unless-stopped
diff --git a/scrapling-service/Dockerfile b/scrapling-service/Dockerfile
new file mode 100644
index 00000000..b5eac920
--- /dev/null
+++ b/scrapling-service/Dockerfile
@@ -0,0 +1,20 @@
+FROM python:3.12-slim
+
+WORKDIR /app
+
+# Install system dependencies for Scrapling
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ gcc \
+ && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY main.py .
+
+# Create output directory
+RUN mkdir -p /tmp/wechat-articles
+
+EXPOSE 8100
+
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8100"]
diff --git a/scrapling-service/export_articles.py b/scrapling-service/export_articles.py
new file mode 100644
index 00000000..aefd913d
--- /dev/null
+++ b/scrapling-service/export_articles.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+"""
+WeChat Article Batch Export Script
+===================================
+Takes article HTML files and exports them to markdown + images
+using the Scrapling service.
+
+Usage:
+ # Export a single article HTML file
+ python3 export_articles.py /path/to/article.html
+
+ # Export multiple HTML files
+ python3 export_articles.py /path/to/*.html
+
+ # Specify output directory
+ python3 export_articles.py --output /path/to/output /path/to/*.html
+
+ # Use ZIP format (outputs individual ZIPs per article)
+ python3 export_articles.py --format zip /path/to/*.html
+
+Requirements:
+ - Scrapling service running at http://localhost:8100
+"""
+
+import argparse
+import os
+import sys
+import json
+
+try:
+ import httpx
+except ImportError:
+ print("Error: httpx not installed. Run: pip install httpx")
+ sys.exit(1)
+
+
+def export_article(html_path: str, output_dir: str, scrapling_url: str, fmt: str = "disk") -> dict:
+    """Export a single article HTML file through the Scrapling service.
+
+    Args:
+        html_path: Path of the article HTML file (read as UTF-8).
+        output_dir: Output directory. In ``disk`` mode it is sent to the
+            service, which writes there; in ``zip`` mode this script writes
+            the returned archive there.
+        scrapling_url: Base URL of the Scrapling service.
+        fmt: ``"disk"`` (POST /parse-to-disk) or ``"zip"`` (POST /parse).
+
+    Returns:
+        On success the service's JSON reply (``disk``) or a dict with
+        ``status``, ``path`` and ``size`` (``zip``). On failure a dict with
+        ``status == "error"``, ``code`` and ``detail``.
+
+    Raises:
+        ValueError: If ``fmt`` is neither ``"disk"`` nor ``"zip"``.
+    """
+    # Reject unknown formats up front; previously this fell through both
+    # branches and crashed on the unbound ``resp`` variable.
+    if fmt not in ("disk", "zip"):
+        raise ValueError(f"Unsupported format: {fmt!r} (expected 'disk' or 'zip')")
+
+    with open(html_path, "r", encoding="utf-8") as f:
+        html = f.read()
+
+    if fmt == "disk":
+        endpoint = "/parse-to-disk"
+        payload = {"html": html, "output_dir": output_dir}
+    else:
+        endpoint = "/parse"
+        payload = {"html": html, "download_images": True, "output_format": "zip"}
+
+    try:
+        resp = httpx.post(f"{scrapling_url}{endpoint}", json=payload, timeout=120)
+    except httpx.HTTPError as exc:
+        # Timeouts / connection errors are reported like any other failure so
+        # a batch run can continue with the next file.
+        return {"status": "error", "code": None, "detail": f"Request failed: {exc}"}
+
+    if resp.status_code != 200:
+        return {"status": "error", "code": resp.status_code, "detail": resp.text}
+
+    if fmt == "disk":
+        return resp.json()
+
+    # zip mode: store the returned archive next to the other outputs,
+    # named after the input file.
+    filename = os.path.splitext(os.path.basename(html_path))[0] + ".zip"
+    zip_path = os.path.join(output_dir, filename)
+    os.makedirs(output_dir, exist_ok=True)
+    with open(zip_path, "wb") as f:
+        f.write(resp.content)
+    return {"status": "ok", "path": zip_path, "size": len(resp.content)}
+
+
+def main():
+    """Command-line entry point: export every given HTML file and print a summary.
+
+    Exits with status 1 when the Scrapling service is unreachable or unhealthy,
+    or when at least one file could not be exported, so that calling scripts
+    can detect partial failures.
+    """
+    parser = argparse.ArgumentParser(description="Export WeChat article HTML to Markdown + Images")
+    parser.add_argument("files", nargs="+", help="HTML files to process")
+    parser.add_argument("--output", "-o", default="/tmp/wechat-articles", help="Output directory")
+    parser.add_argument("--format", "-f", choices=["disk", "zip"], default="disk", help="Output format")
+    parser.add_argument("--scrapling-url", default="http://localhost:8100", help="Scrapling service URL")
+    args = parser.parse_args()
+
+    # Check service health before touching any file
+    try:
+        health = httpx.get(f"{args.scrapling_url}/health", timeout=5)
+    except Exception as e:
+        print(f"Error: Cannot connect to Scrapling service at {args.scrapling_url}: {e}")
+        sys.exit(1)
+    if health.status_code != 200:
+        print(f"Error: Scrapling service at {args.scrapling_url} is not healthy")
+        sys.exit(1)
+
+    total = len(args.files)
+    print(f"Output directory: {args.output}")
+    print(f"Format: {args.format}")
+    print(f"Files to process: {total}")
+    print()
+
+    success = 0
+    failed = 0
+
+    for index, filepath in enumerate(args.files, start=1):
+        prefix = f"[{index}/{total}]"
+
+        if not os.path.exists(filepath):
+            print(f"{prefix} ✗ File not found: {filepath}")
+            failed += 1
+            continue
+
+        try:
+            result = export_article(filepath, args.output, args.scrapling_url, args.format)
+        except (OSError, ValueError, httpx.HTTPError) as e:
+            # An unreadable or undecodable file, a malformed service reply or a
+            # transport error must not abort the remaining files.
+            result = {"status": "error", "detail": str(e)}
+
+        if result.get("status") == "ok":
+            title = result.get("title", os.path.basename(filepath))
+            images = f"{result.get('image_count', '?')}/{result.get('total_images', '?')}"
+            print(f"{prefix} ✓ {title[:60]} ({images} images)")
+            success += 1
+        else:
+            detail = str(result.get("detail", "Unknown error"))
+            print(f"{prefix} ✗ {filepath}: {detail[:100]}")
+            failed += 1
+
+    print(f"\nDone: {success} success, {failed} failed")
+    print(f"Output: {args.output}")
+
+    # Signal partial failure to the calling shell / automation
+    if failed:
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scrapling-service/main.py b/scrapling-service/main.py
new file mode 100644
index 00000000..4addab3d
--- /dev/null
+++ b/scrapling-service/main.py
@@ -0,0 +1,543 @@
+"""
+WeChat Article Scrapling Service
+---------------------------------
+A FastAPI microservice that uses Scrapling to parse WeChat article HTML,
+extract content, download images, and produce Markdown with local image references.
+
+Modes:
+1. POST /parse - Takes raw HTML, parses it, downloads images, returns markdown + images as ZIP
+2. GET /fetch - Takes a WeChat article URL, fetches it (may hit captcha), parses, returns markdown ZIP
+3. POST /parse-to-disk - Takes raw HTML, downloads images, saves markdown + images to a directory on disk
+"""
+
+import asyncio
+import hashlib
+import io
+import logging
+import os
+import re
+import tempfile
+import time
+import zipfile
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urljoin, urlparse
+
+import httpx
+from fastapi import FastAPI, HTTPException, Query
+from fastapi.responses import StreamingResponse, JSONResponse
+from markdownify import markdownify as md
+from scrapling import Fetcher
+
+app = FastAPI(title="WeChat Article Scrapling Service", version="1.0.0")
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("scrapling")
+
+# Image download timeout
+IMAGE_TIMEOUT = 15
+# Max concurrent image downloads
+MAX_CONCURRENT_IMAGES = 5
+
+
+def parse_wechat_html(html: str) -> dict:
+ """
+ Use Scrapling to parse WeChat article HTML and extract structured content.
+ Returns dict with title, author, content_html, image_urls, publish_time.
+ """
+ from scrapling.parser import Adaptor
+
+ page = Adaptor(html, auto_match=False)
+
+ result = {
+ "title": "",
+ "author": "",
+ "account_name": "",
+ "content_html": "",
+ "image_urls": [],
+ "publish_time": "",
+ }
+
+ # Extract title
+ title_el = page.find("#activity-name") or page.find(".rich_media_title")
+ if title_el:
+ result["title"] = title_el.text.strip()
+ else:
+ title_el = page.find("title")
+ if title_el:
+ result["title"] = title_el.text.strip()
+
+ # Extract author
+ author_el = page.find("#js_author_name") or page.find(".rich_media_meta_text")
+ if author_el:
+ result["author"] = author_el.text.strip()
+
+ # Extract account name
+ account_el = page.find("#js_name") or page.find(".account_nickname_inner")
+ if account_el:
+ result["account_name"] = account_el.text.strip()
+
+ # Extract publish time
+ pub_time_el = page.find("#publish_time")
+ if pub_time_el:
+ result["publish_time"] = pub_time_el.text.strip()
+
+ # Extract main content
+ content_el = page.find("#js_content")
+ if content_el:
+ content_html = content_el.html_content
+
+ # Process images: convert data-src to src
+ content_html = re.sub(
+ r'
]*?)data-src="([^"]+)"',
+ r'
]*?)data-original="([^"]+)"',
+ lambda m: m.group(0) if 'src="' in m.group(0) else f'
]*?)data-src="([^"]+)"',
+ r'
]*?)data-original="([^"]+)"',
+ lambda m: m.group(0) if 'src="' in m.group(0) else f'
str:
+ """
+ Convert HTML content to Markdown, optionally replacing image URLs with local paths.
+ """
+ if image_map:
+ for orig_url, local_path in image_map.items():
+ content_html = content_html.replace(orig_url, local_path)
+
+ # Clean up the HTML before conversion
+ # Remove empty spans, divs etc
+ content_html = re.sub(r'<(span|div|section|p)\s+style="[^"]*">\s*\1>', '', content_html)
+
+ # Convert to markdown
+ markdown = md(
+ content_html,
+ heading_style="atx",
+ bullets="-",
+ strip=["script", "style", "iframe", "noscript"],
+ )
+
+ # Clean up excessive whitespace
+ markdown = re.sub(r'\n{3,}', '\n\n', markdown)
+ markdown = markdown.strip()
+
+ return markdown
+
+
+async def download_image(client: httpx.AsyncClient, url: str, semaphore: asyncio.Semaphore) -> tuple:
+    """Download a single image while holding a slot of ``semaphore``.
+
+    Args:
+        client: Shared HTTP client used for the request.
+        url: Image URL as found in the article.
+        semaphore: Limits how many downloads run at the same time.
+
+    Returns:
+        ``(url, bytes, content_type)`` for an HTTP 200 reply, otherwise
+        ``(url, None, None)``. Failures are logged, never raised, so one bad
+        image cannot fail the whole article.
+    """
+    async with semaphore:
+        try:
+            # Request only the part of the URL before its first "&"
+            clean_url = url.split("&", 1)[0]
+
+            resp = await client.get(clean_url, timeout=IMAGE_TIMEOUT, follow_redirects=True)
+            if resp.status_code == 200:
+                content_type = resp.headers.get("content-type", "image/jpeg")
+                return (url, resp.content, content_type)
+            # Previously dropped without a trace; log it so missing images can be diagnosed
+            logger.warning("Image download returned HTTP %s for %s", resp.status_code, url[:80])
+        except Exception as e:
+            # Best-effort by design: report through the module logger and move on
+            logger.warning("Failed to download image %s: %s", url[:80], e)
+
+    return (url, None, None)
+
+
+async def download_images(image_urls: list) -> dict:
+    """
+    Download all images concurrently (at most MAX_CONCURRENT_IMAGES at a time).
+
+    Args:
+        image_urls: Image URLs to fetch; a URL listed several times is
+            fetched only once.
+
+    Returns:
+        Dict mapping each successfully downloaded original URL to
+        ``(bytes, filename)``. ``filename`` is the first 12 hex digits of the
+        URL's MD5 plus an extension derived from the response content type
+        (``jpg`` when none of png/gif/webp/svg is named). URLs that could not
+        be downloaded are omitted.
+    """
+    # De-duplicate while keeping order: the result is keyed by URL anyway, so
+    # fetching a repeated URL again only wastes a request.
+    unique_urls = list(dict.fromkeys(image_urls))
+    if not unique_urls:
+        return {}
+
+    results = {}
+    semaphore = asyncio.Semaphore(MAX_CONCURRENT_IMAGES)
+
+    async with httpx.AsyncClient(
+        headers={
+            "Referer": "https://mp.weixin.qq.com/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+        },
+        follow_redirects=True,
+    ) as client:
+        done = await asyncio.gather(
+            *(download_image(client, url, semaphore) for url in unique_urls)
+        )
+
+    for url, data, content_type in done:
+        if not data:
+            continue
+
+        # Determine extension from the content type, defaulting to jpg
+        ext = "jpg"
+        if content_type:
+            for candidate in ("png", "gif", "webp", "svg"):
+                if candidate in content_type:
+                    ext = candidate
+                    break
+
+        # Generate filename from URL hash
+        url_hash = hashlib.md5(url.encode()).hexdigest()[:12]
+        results[url] = (data, f"{url_hash}.{ext}")
+
+    return results
+
+
+def create_markdown_zip(title: str, markdown: str, images: dict) -> io.BytesIO:
+    """
+    Build an in-memory ZIP archive for one article.
+
+    Layout (``NAME`` is the sanitized title, or "article" as fallback):
+    - NAME/NAME.md (markdown with local image references)
+    - NAME/images/ with all downloaded images
+
+    Args:
+        title: Article title, used for the top-level directory and file name.
+        markdown: Markdown text to store.
+        images: Mapping of original URL -> ``(bytes, filename)``.
+
+    Returns:
+        A ``BytesIO`` positioned at the start of the archive.
+    """
+    buf = io.BytesIO()
+
+    # Sanitize title for directory name
+    safe_title = re.sub(r'[<>:"/\\|?*]', '_', title)[:80]
+    # A name made only of dots/spaces ("", ".", "..") is either empty or would
+    # make the entries escape the extraction directory (zip-slip).
+    if not safe_title.strip(". "):
+        safe_title = "article"
+
+    with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf:
+        zf.writestr(f"{safe_title}/{safe_title}.md", markdown)
+
+        for data, filename in images.values():
+            zf.writestr(f"{safe_title}/images/{filename}", data)
+
+    buf.seek(0)
+    return buf
+
+
+def save_markdown_to_disk(output_dir: str, title: str, markdown: str, images: dict) -> str:
+    """
+    Save markdown and images to disk.
+
+    Creates ``output_dir/NAME/NAME.md`` and ``output_dir/NAME/images/*``,
+    where ``NAME`` is the sanitized title (or "article" as fallback).
+
+    Args:
+        output_dir: Directory under which the article directory is created.
+        title: Article title, used for the directory and markdown file name.
+        markdown: Markdown text to write (UTF-8).
+        images: Mapping of original URL -> ``(bytes, filename)``.
+
+    Returns:
+        The path to the created article directory.
+    """
+    safe_title = re.sub(r'[<>:"/\\|?*]', '_', title)[:80]
+    # A name made only of dots/spaces ("", ".", "..") is either empty or would
+    # resolve to output_dir itself or its parent, writing outside the
+    # intended article directory.
+    if not safe_title.strip(". "):
+        safe_title = "article"
+
+    article_dir = os.path.join(output_dir, safe_title)
+    images_dir = os.path.join(article_dir, "images")
+    os.makedirs(images_dir, exist_ok=True)
+
+    # Write markdown
+    md_path = os.path.join(article_dir, f"{safe_title}.md")
+    with open(md_path, "w", encoding="utf-8") as f:
+        f.write(markdown)
+
+    # Write images
+    for data, filename in images.values():
+        img_path = os.path.join(images_dir, filename)
+        with open(img_path, "wb") as f:
+            f.write(data)
+
+    return article_dir
+
+
+@app.get("/health")
+async def health():
+ """Health probe: always returns a fixed OK payload naming this service."""
+ return {"status": "ok", "service": "scrapling"}
+
+
+@app.post("/parse")
+async def parse_html(body: dict):
+    """
+    Parse WeChat article HTML and return Markdown plus images.
+
+    Request body:
+    {
+        "html": "raw article page HTML",
+        "download_images": true,   // optional, default true
+        "output_format": "zip"     // "zip" (default) or "json"
+    }
+
+    Returns a ZIP download (markdown + images), or a JSON document with the
+    markdown and metadata when output_format is "json".
+
+    Raises:
+        HTTPException 400: "html" is missing, empty, or not a string.
+        HTTPException 422: no article content could be extracted.
+    """
+    import json
+    from urllib.parse import quote
+
+    html = body.get("html", "")
+    # A non-string value would otherwise crash inside the parser with a 500
+    if not isinstance(html, str) or not html:
+        raise HTTPException(status_code=400, detail="html field is required")
+
+    download_imgs = body.get("download_images", True)
+    output_format = body.get("output_format", "zip")
+
+    # Parse the HTML
+    parsed = parse_wechat_html(html)
+
+    if not parsed["content_html"]:
+        # Log diagnostic info for debugging
+        logger.error(
+            "parse 422: content_html empty. html_len=%d, title='%s', html_preview='%s'",
+            len(html),
+            parsed.get("title", ""),
+            html[:500].replace('\n', ' '),
+        )
+        raise HTTPException(status_code=422, detail="Could not extract article content from HTML")
+
+    # Download images if requested
+    image_map = {}
+    downloaded_images = {}
+
+    if download_imgs and parsed["image_urls"]:
+        downloaded_images = await download_images(parsed["image_urls"])
+        # Build image map: original URL -> local relative path
+        image_map = {
+            src: f"./images/{filename}" for src, (_, filename) in downloaded_images.items()
+        }
+
+    # Convert to markdown
+    markdown = html_to_markdown(parsed["content_html"], image_map)
+
+    # Add frontmatter. Values are emitted as JSON strings, which are valid
+    # YAML double-quoted scalars, so quotes, backslashes or newlines in a
+    # title can no longer break the front matter.
+    meta = {
+        "title": parsed["title"],
+        "author": parsed["author"],
+        "account": parsed["account_name"],
+        "date": parsed["publish_time"],
+    }
+    frontmatter = "---\n" + "".join(
+        f"{key}: {json.dumps(value, ensure_ascii=False)}\n" for key, value in meta.items()
+    ) + "---\n\n"
+    full_markdown = frontmatter + markdown
+
+    if output_format == "json":
+        return JSONResponse({
+            "title": parsed["title"],
+            "author": parsed["author"],
+            "account_name": parsed["account_name"],
+            "publish_time": parsed["publish_time"],
+            "markdown": full_markdown,
+            "image_count": len(downloaded_images),
+            "image_urls": parsed["image_urls"],
+        })
+
+    # Create ZIP
+    zip_buf = create_markdown_zip(parsed["title"], full_markdown, downloaded_images)
+
+    safe_title = re.sub(r'[<>:"/\\|?*]', '_', parsed["title"])[:80] or "article"
+    # Use URL-encoded filename for non-ASCII compatibility
+    encoded_title = quote(safe_title)
+
+    return StreamingResponse(
+        zip_buf,
+        media_type="application/zip",
+        headers={
+            "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_title}.zip"
+        },
+    )
+
+
+@app.post("/parse-to-disk")
+async def parse_to_disk(body: dict):
+    """
+    Parse WeChat article HTML and save markdown + images to disk.
+
+    Request body:
+    {
+        "html": "raw article page HTML",
+        "output_dir": "/path/to/output"   // optional, defaults to /tmp/wechat-articles
+    }
+
+    Returns a JSON object with status, title, the article directory, the
+    markdown file path and the downloaded/total image counts.
+
+    Raises:
+        HTTPException 400: "html" is missing/empty/not a string, or
+            "output_dir" is not a string.
+        HTTPException 422: no article content could be extracted.
+    """
+    import json
+
+    html = body.get("html", "")
+    # A non-string value would otherwise crash inside the parser with a 500
+    if not isinstance(html, str) or not html:
+        raise HTTPException(status_code=400, detail="html field is required")
+
+    # NOTE(security): output_dir comes straight from the request, so any client
+    # that can reach this service decides where files are written. Restrict it
+    # to a configured root if the service is exposed beyond trusted callers.
+    output_dir = body.get("output_dir") or "/tmp/wechat-articles"
+    if not isinstance(output_dir, str):
+        raise HTTPException(status_code=400, detail="output_dir must be a string")
+
+    # Parse
+    parsed = parse_wechat_html(html)
+    if not parsed["content_html"]:
+        # Log diagnostic info for debugging
+        logger.error(
+            "parse-to-disk 422: content_html empty. html_len=%d, title='%s', html_preview='%s'",
+            len(html),
+            parsed.get("title", ""),
+            html[:500].replace('\n', ' '),
+        )
+        raise HTTPException(status_code=422, detail="Could not extract article content")
+
+    # Download images
+    downloaded_images = {}
+    image_map = {}
+
+    if parsed["image_urls"]:
+        downloaded_images = await download_images(parsed["image_urls"])
+        image_map = {
+            src: f"./images/{filename}" for src, (_, filename) in downloaded_images.items()
+        }
+
+    # Convert to markdown
+    markdown = html_to_markdown(parsed["content_html"], image_map)
+
+    # Front matter values are emitted as JSON strings (valid YAML double-quoted
+    # scalars) so quotes or newlines in a title cannot break the YAML.
+    meta = {
+        "title": parsed["title"],
+        "author": parsed["author"],
+        "account": parsed["account_name"],
+        "date": parsed["publish_time"],
+    }
+    frontmatter = "---\n" + "".join(
+        f"{key}: {json.dumps(value, ensure_ascii=False)}\n" for key, value in meta.items()
+    ) + "---\n\n"
+    full_markdown = frontmatter + markdown
+
+    # Save to disk
+    article_dir = save_markdown_to_disk(output_dir, parsed["title"], full_markdown, downloaded_images)
+
+    # The markdown file is named after its directory. Deriving the name from
+    # article_dir (instead of re-sanitizing the title here) keeps the reported
+    # path correct, including the "article" fallback for empty titles.
+    markdown_file = os.path.join(article_dir, f"{os.path.basename(article_dir)}.md")
+
+    return {
+        "status": "ok",
+        "title": parsed["title"],
+        "path": article_dir,
+        "markdown_file": markdown_file,
+        "image_count": len(downloaded_images),
+        "total_images": len(parsed["image_urls"]),
+    }
+
+
+@app.get("/fetch")
+async def fetch_article(
+    url: str = Query(..., description="WeChat article URL"),
+    output_format: str = Query("zip", description="Output format: zip or json"),
+):
+    """
+    Fetch a WeChat article by URL, parse it, and return markdown.
+    Note: May fail due to WeChat captcha/anti-bot protection.
+
+    Returns a ZIP download, or JSON (title, markdown, image_count) when
+    output_format is "json".
+
+    Raises:
+        HTTPException 502: the article could not be fetched.
+        HTTPException 403: the fetched page looks like a captcha/verification page.
+        HTTPException 422: no article content could be extracted.
+    """
+    import json
+    from urllib.parse import quote
+
+    # NOTE(security): the URL is fetched as given; callers that expose this
+    # endpoint to untrusted clients should restrict the allowed hosts.
+    def _fetch_html() -> str:
+        page = Fetcher().get(url, headers={
+            "Referer": "https://mp.weixin.qq.com/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
+        })
+        return page.html_content
+
+    try:
+        # The fetcher call is synchronous; run it in a worker thread so it
+        # does not block the event loop (and every other request) meanwhile.
+        html = await asyncio.to_thread(_fetch_html)
+    except Exception as e:
+        raise HTTPException(status_code=502, detail=f"Failed to fetch article: {e}") from e
+
+    # Check if we hit a captcha/verification page
+    lowered = html.lower()
+    if 'captcha' in lowered or 'verify' in lowered:
+        raise HTTPException(
+            status_code=403,
+            detail="WeChat captcha/verification detected. Use /parse endpoint with pre-fetched HTML instead."
+        )
+
+    parsed = parse_wechat_html(html)
+    if not parsed["content_html"]:
+        raise HTTPException(status_code=422, detail="Could not extract article content")
+
+    downloaded_images = await download_images(parsed["image_urls"])
+    image_map = {src: f"./images/{fn}" for src, (_, fn) in downloaded_images.items()}
+
+    markdown = html_to_markdown(parsed["content_html"], image_map)
+
+    # Front matter values are emitted as JSON strings (valid YAML double-quoted
+    # scalars) so quotes or newlines in a title cannot break the YAML.
+    meta = {
+        "title": parsed["title"],
+        "author": parsed["author"],
+        "account": parsed["account_name"],
+        "date": parsed["publish_time"],
+    }
+    frontmatter = "---\n" + "".join(
+        f"{key}: {json.dumps(value, ensure_ascii=False)}\n" for key, value in meta.items()
+    ) + "---\n\n"
+    full_markdown = frontmatter + markdown
+
+    if output_format == "json":
+        return JSONResponse({
+            "title": parsed["title"],
+            "markdown": full_markdown,
+            "image_count": len(downloaded_images),
+        })
+
+    zip_buf = create_markdown_zip(parsed["title"], full_markdown, downloaded_images)
+    safe_title = re.sub(r'[<>:"/\\|?*]', '_', parsed["title"])[:80] or "article"
+    encoded_title = quote(safe_title)
+
+    return StreamingResponse(
+        zip_buf,
+        media_type="application/zip",
+        headers={"Content-Disposition": f"attachment; filename*=UTF-8''{encoded_title}.zip"}
+    )
+
+
+if __name__ == "__main__":
+ import uvicorn
+ uvicorn.run(app, host="0.0.0.0", port=8100)
diff --git a/scrapling-service/main_simple.py b/scrapling-service/main_simple.py
new file mode 100644
index 00000000..ddf1f6da
--- /dev/null
+++ b/scrapling-service/main_simple.py
@@ -0,0 +1,461 @@
+"""
+WeChat Article Scrapling Service
+---------------------------------
+A FastAPI microservice that uses Scrapling to parse WeChat article HTML,
+extract content, download images, and produce Markdown with local image references.
+
+Modes:
+1. POST /parse - Takes raw HTML, parses it, downloads images, returns markdown + images as ZIP
+2. GET /fetch - Takes a WeChat article URL, fetches it (may hit captcha), parses, returns markdown ZIP
+3. POST /parse-to-disk - Takes raw HTML, downloads images, saves markdown + images to a directory on disk
+"""
+
+import asyncio
+import hashlib
+import io
+import os
+import re
+import tempfile
+import time
+import zipfile
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urljoin, urlparse
+
+import httpx
+from fastapi import FastAPI, HTTPException, Query
+from fastapi.responses import StreamingResponse, JSONResponse
+from markdownify import markdownify as md
+from scrapling import Fetcher
+
+app = FastAPI(title="WeChat Article Scrapling Service", version="1.0.0")
+
+# Image download timeout
+IMAGE_TIMEOUT = 15
+# Max concurrent image downloads
+MAX_CONCURRENT_IMAGES = 5
+
+
+def parse_wechat_html(html: str) -> dict:
+ """
+ Use Scrapling to parse WeChat article HTML and extract structured content.
+ Returns dict with title, author, content_html, image_urls, publish_time.
+ """
+ from scrapling.parser import Adaptor
+
+ page = Adaptor(html, auto_match=False)
+
+ result = {
+ "title": "",
+ "author": "",
+ "account_name": "",
+ "content_html": "",
+ "image_urls": [],
+ "publish_time": "",
+ }
+
+ # Extract title
+ title_el = page.find("#activity-name") or page.find(".rich_media_title")
+ if title_el:
+ result["title"] = title_el.text.strip()
+ else:
+ title_el = page.find("title")
+ if title_el:
+ result["title"] = title_el.text.strip()
+
+ # Extract author
+ author_el = page.find("#js_author_name") or page.find(".rich_media_meta_text")
+ if author_el:
+ result["author"] = author_el.text.strip()
+
+ # Extract account name
+ account_el = page.find("#js_name") or page.find(".account_nickname_inner")
+ if account_el:
+ result["account_name"] = account_el.text.strip()
+
+ # Extract publish time
+ pub_time_el = page.find("#publish_time")
+ if pub_time_el:
+ result["publish_time"] = pub_time_el.text.strip()
+
+ # Extract main content
+ content_el = page.find("#js_content")
+ if content_el:
+ content_html = content_el.html_content
+
+ # Process images: convert data-src to src
+ content_html = re.sub(
+ r'
]*?)data-src="([^"]+)"',
+ r'
]*?)data-original="([^"]+)"',
+ lambda m: m.group(0) if 'src="' in m.group(0) else f'
str:
+ """
+ Convert HTML content to Markdown, optionally replacing image URLs with local paths.
+ """
+ if image_map:
+ for orig_url, local_path in image_map.items():
+ content_html = content_html.replace(orig_url, local_path)
+
+ # Clean up the HTML before conversion
+ # Remove empty spans, divs etc
+ content_html = re.sub(r'<(span|div|section|p)\s+style="[^"]*">\s*\1>', '', content_html)
+
+ # Convert to markdown
+ markdown = md(
+ content_html,
+ heading_style="atx",
+ bullets="-",
+ strip=["script", "style", "iframe", "noscript"],
+ )
+
+ # Clean up excessive whitespace
+ markdown = re.sub(r'\n{3,}', '\n\n', markdown)
+ markdown = markdown.strip()
+
+ return markdown
+
+
+async def download_image(client: httpx.AsyncClient, url: str, semaphore: asyncio.Semaphore) -> tuple:
+    """Fetch one image while holding a slot of ``semaphore``.
+
+    Returns ``(url, bytes, content_type)`` for an HTTP 200 reply and
+    ``(url, None, None)`` for any other status or when an exception occurs
+    (the exception is printed, not raised).
+    """
+    async with semaphore:
+        try:
+            # Request only the part of the URL before its first "&"
+            trimmed = url.split("&", 1)[0]
+
+            reply = await client.get(trimmed, timeout=IMAGE_TIMEOUT, follow_redirects=True)
+            if reply.status_code == 200:
+                mime = reply.headers.get("content-type", "image/jpeg")
+                return (url, reply.content, mime)
+        except Exception as e:
+            print(f"Failed to download image {url[:80]}: {e}")
+
+    return (url, None, None)
+
+
+async def download_images(image_urls: list) -> dict:
+    """
+    Download all images concurrently (at most MAX_CONCURRENT_IMAGES at a time).
+
+    Args:
+        image_urls: Image URLs to fetch; a URL listed several times is
+            fetched only once.
+
+    Returns:
+        Dict mapping each successfully downloaded original URL to
+        ``(bytes, filename)``. ``filename`` is the first 12 hex digits of the
+        URL's MD5 plus an extension derived from the response content type
+        (``jpg`` when none of png/gif/webp/svg is named). URLs that could not
+        be downloaded are omitted.
+    """
+    # De-duplicate while keeping order: the result is keyed by URL anyway, so
+    # fetching a repeated URL again only wastes a request.
+    unique_urls = list(dict.fromkeys(image_urls))
+    if not unique_urls:
+        return {}
+
+    results = {}
+    semaphore = asyncio.Semaphore(MAX_CONCURRENT_IMAGES)
+
+    async with httpx.AsyncClient(
+        headers={
+            "Referer": "https://mp.weixin.qq.com/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+        },
+        follow_redirects=True,
+    ) as client:
+        done = await asyncio.gather(
+            *(download_image(client, url, semaphore) for url in unique_urls)
+        )
+
+    for url, data, content_type in done:
+        if not data:
+            continue
+
+        # Determine extension from the content type, defaulting to jpg
+        ext = "jpg"
+        if content_type:
+            for candidate in ("png", "gif", "webp", "svg"):
+                if candidate in content_type:
+                    ext = candidate
+                    break
+
+        # Generate filename from URL hash
+        url_hash = hashlib.md5(url.encode()).hexdigest()[:12]
+        results[url] = (data, f"{url_hash}.{ext}")
+
+    return results
+
+
+def create_markdown_zip(title: str, markdown: str, images: dict) -> io.BytesIO:
+    """
+    Build an in-memory ZIP archive for one article.
+
+    Layout (``NAME`` is the sanitized title, or "article" as fallback):
+    - NAME/NAME.md (markdown with local image references)
+    - NAME/images/ with all downloaded images
+
+    Args:
+        title: Article title, used for the top-level directory and file name.
+        markdown: Markdown text to store.
+        images: Mapping of original URL -> ``(bytes, filename)``.
+
+    Returns:
+        A ``BytesIO`` positioned at the start of the archive.
+    """
+    buf = io.BytesIO()
+
+    # Sanitize title for directory name
+    safe_title = re.sub(r'[<>:"/\\|?*]', '_', title)[:80]
+    # A name made only of dots/spaces ("", ".", "..") is either empty or would
+    # make the entries escape the extraction directory (zip-slip).
+    if not safe_title.strip(". "):
+        safe_title = "article"
+
+    with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf:
+        zf.writestr(f"{safe_title}/{safe_title}.md", markdown)
+
+        for data, filename in images.values():
+            zf.writestr(f"{safe_title}/images/{filename}", data)
+
+    buf.seek(0)
+    return buf
+
+
+def save_markdown_to_disk(output_dir: str, title: str, markdown: str, images: dict) -> str:
+    """
+    Save markdown and images to disk.
+
+    Creates ``output_dir/NAME/NAME.md`` and ``output_dir/NAME/images/*``,
+    where ``NAME`` is the sanitized title (or "article" as fallback).
+
+    Args:
+        output_dir: Directory under which the article directory is created.
+        title: Article title, used for the directory and markdown file name.
+        markdown: Markdown text to write (UTF-8).
+        images: Mapping of original URL -> ``(bytes, filename)``.
+
+    Returns:
+        The path to the created article directory.
+    """
+    safe_title = re.sub(r'[<>:"/\\|?*]', '_', title)[:80]
+    # A name made only of dots/spaces ("", ".", "..") is either empty or would
+    # resolve to output_dir itself or its parent, writing outside the
+    # intended article directory.
+    if not safe_title.strip(". "):
+        safe_title = "article"
+
+    article_dir = os.path.join(output_dir, safe_title)
+    images_dir = os.path.join(article_dir, "images")
+    os.makedirs(images_dir, exist_ok=True)
+
+    # Write markdown
+    md_path = os.path.join(article_dir, f"{safe_title}.md")
+    with open(md_path, "w", encoding="utf-8") as f:
+        f.write(markdown)
+
+    # Write images
+    for data, filename in images.values():
+        img_path = os.path.join(images_dir, filename)
+        with open(img_path, "wb") as f:
+            f.write(data)
+
+    return article_dir
+
+
+@app.get("/health")
+async def health():
+ """Health probe: always returns a fixed OK payload naming this service."""
+ return {"status": "ok", "service": "scrapling"}
+
+
+@app.post("/parse")
+async def parse_html(body: dict):
+    """
+    Parse WeChat article HTML and return Markdown plus images.
+
+    Request body:
+    {
+        "html": "raw article page HTML",
+        "download_images": true,   // optional, default true
+        "output_format": "zip"     // "zip" (default) or "json"
+    }
+
+    Returns a ZIP download (markdown + images), or a JSON document with the
+    markdown and metadata when output_format is "json".
+
+    Raises:
+        HTTPException 400: "html" is missing, empty, or not a string.
+        HTTPException 422: no article content could be extracted.
+    """
+    import json
+    from urllib.parse import quote
+
+    html = body.get("html", "")
+    # A non-string value would otherwise crash inside the parser with a 500
+    if not isinstance(html, str) or not html:
+        raise HTTPException(status_code=400, detail="html field is required")
+
+    download_imgs = body.get("download_images", True)
+    output_format = body.get("output_format", "zip")
+
+    # Parse the HTML
+    parsed = parse_wechat_html(html)
+
+    if not parsed["content_html"]:
+        raise HTTPException(status_code=422, detail="Could not extract article content from HTML")
+
+    # Download images if requested
+    image_map = {}
+    downloaded_images = {}
+
+    if download_imgs and parsed["image_urls"]:
+        downloaded_images = await download_images(parsed["image_urls"])
+        # Build image map: original URL -> local relative path
+        image_map = {
+            src: f"./images/{filename}" for src, (_, filename) in downloaded_images.items()
+        }
+
+    # Convert to markdown
+    markdown = html_to_markdown(parsed["content_html"], image_map)
+
+    # Add frontmatter. Values are emitted as JSON strings, which are valid
+    # YAML double-quoted scalars, so quotes, backslashes or newlines in a
+    # title can no longer break the front matter.
+    meta = {
+        "title": parsed["title"],
+        "author": parsed["author"],
+        "account": parsed["account_name"],
+        "date": parsed["publish_time"],
+    }
+    frontmatter = "---\n" + "".join(
+        f"{key}: {json.dumps(value, ensure_ascii=False)}\n" for key, value in meta.items()
+    ) + "---\n\n"
+    full_markdown = frontmatter + markdown
+
+    if output_format == "json":
+        return JSONResponse({
+            "title": parsed["title"],
+            "author": parsed["author"],
+            "account_name": parsed["account_name"],
+            "publish_time": parsed["publish_time"],
+            "markdown": full_markdown,
+            "image_count": len(downloaded_images),
+            "image_urls": parsed["image_urls"],
+        })
+
+    # Create ZIP
+    zip_buf = create_markdown_zip(parsed["title"], full_markdown, downloaded_images)
+
+    safe_title = re.sub(r'[<>:"/\\|?*]', '_', parsed["title"])[:80] or "article"
+    # Use URL-encoded filename for non-ASCII compatibility
+    encoded_title = quote(safe_title)
+
+    return StreamingResponse(
+        zip_buf,
+        media_type="application/zip",
+        headers={
+            "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_title}.zip"
+        },
+    )
+
+
+@app.post("/parse-to-disk")
+async def parse_to_disk(body: dict):
+    """
+    Parse WeChat article HTML and save markdown + images to disk.
+
+    Request body:
+    {
+        "html": "raw article page HTML",
+        "output_dir": "/path/to/output"   // optional, defaults to /tmp/wechat-articles
+    }
+
+    Returns a JSON object with status, title, the article directory, the
+    markdown file path and the downloaded/total image counts.
+
+    Raises:
+        HTTPException 400: "html" is missing/empty/not a string, or
+            "output_dir" is not a string.
+        HTTPException 422: no article content could be extracted.
+    """
+    import json
+
+    html = body.get("html", "")
+    # A non-string value would otherwise crash inside the parser with a 500
+    if not isinstance(html, str) or not html:
+        raise HTTPException(status_code=400, detail="html field is required")
+
+    # NOTE(security): output_dir comes straight from the request, so any client
+    # that can reach this service decides where files are written. Restrict it
+    # to a configured root if the service is exposed beyond trusted callers.
+    output_dir = body.get("output_dir") or "/tmp/wechat-articles"
+    if not isinstance(output_dir, str):
+        raise HTTPException(status_code=400, detail="output_dir must be a string")
+
+    # Parse
+    parsed = parse_wechat_html(html)
+    if not parsed["content_html"]:
+        raise HTTPException(status_code=422, detail="Could not extract article content")
+
+    # Download images
+    downloaded_images = {}
+    image_map = {}
+
+    if parsed["image_urls"]:
+        downloaded_images = await download_images(parsed["image_urls"])
+        image_map = {
+            src: f"./images/{filename}" for src, (_, filename) in downloaded_images.items()
+        }
+
+    # Convert to markdown
+    markdown = html_to_markdown(parsed["content_html"], image_map)
+
+    # Front matter values are emitted as JSON strings (valid YAML double-quoted
+    # scalars) so quotes or newlines in a title cannot break the YAML.
+    meta = {
+        "title": parsed["title"],
+        "author": parsed["author"],
+        "account": parsed["account_name"],
+        "date": parsed["publish_time"],
+    }
+    frontmatter = "---\n" + "".join(
+        f"{key}: {json.dumps(value, ensure_ascii=False)}\n" for key, value in meta.items()
+    ) + "---\n\n"
+    full_markdown = frontmatter + markdown
+
+    # Save to disk
+    article_dir = save_markdown_to_disk(output_dir, parsed["title"], full_markdown, downloaded_images)
+
+    # The markdown file is named after its directory. Deriving the name from
+    # article_dir (instead of re-sanitizing the title here) keeps the reported
+    # path correct, including the "article" fallback for empty titles.
+    markdown_file = os.path.join(article_dir, f"{os.path.basename(article_dir)}.md")
+
+    return {
+        "status": "ok",
+        "title": parsed["title"],
+        "path": article_dir,
+        "markdown_file": markdown_file,
+        "image_count": len(downloaded_images),
+        "total_images": len(parsed["image_urls"]),
+    }
+
+
+@app.get("/fetch")
+async def fetch_article(
+    url: str = Query(..., description="WeChat article URL"),
+    output_format: str = Query("zip", description="Output format: zip or json"),
+):
+    """
+    Fetch a WeChat article by URL, parse it, and return markdown.
+    Note: May fail due to WeChat captcha/anti-bot protection.
+
+    Returns a ZIP download, or JSON (title, markdown, image_count) when
+    output_format is "json".
+
+    Raises:
+        HTTPException 502: the article could not be fetched.
+        HTTPException 403: the fetched page looks like a captcha/verification page.
+        HTTPException 422: no article content could be extracted.
+    """
+    import json
+    from urllib.parse import quote
+
+    # NOTE(security): the URL is fetched as given; callers that expose this
+    # endpoint to untrusted clients should restrict the allowed hosts.
+    def _fetch_html() -> str:
+        page = Fetcher().get(url, headers={
+            "Referer": "https://mp.weixin.qq.com/",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
+        })
+        return page.html_content
+
+    try:
+        # The fetcher call is synchronous; run it in a worker thread so it
+        # does not block the event loop (and every other request) meanwhile.
+        html = await asyncio.to_thread(_fetch_html)
+    except Exception as e:
+        raise HTTPException(status_code=502, detail=f"Failed to fetch article: {e}") from e
+
+    # Check if we hit a captcha/verification page
+    lowered = html.lower()
+    if 'captcha' in lowered or 'verify' in lowered:
+        raise HTTPException(
+            status_code=403,
+            detail="WeChat captcha/verification detected. Use /parse endpoint with pre-fetched HTML instead."
+        )
+
+    parsed = parse_wechat_html(html)
+    if not parsed["content_html"]:
+        raise HTTPException(status_code=422, detail="Could not extract article content")
+
+    downloaded_images = await download_images(parsed["image_urls"])
+    image_map = {src: f"./images/{fn}" for src, (_, fn) in downloaded_images.items()}
+
+    markdown = html_to_markdown(parsed["content_html"], image_map)
+
+    # Front matter values are emitted as JSON strings (valid YAML double-quoted
+    # scalars) so quotes or newlines in a title cannot break the YAML.
+    meta = {
+        "title": parsed["title"],
+        "author": parsed["author"],
+        "account": parsed["account_name"],
+        "date": parsed["publish_time"],
+    }
+    frontmatter = "---\n" + "".join(
+        f"{key}: {json.dumps(value, ensure_ascii=False)}\n" for key, value in meta.items()
+    ) + "---\n\n"
+    full_markdown = frontmatter + markdown
+
+    if output_format == "json":
+        return JSONResponse({
+            "title": parsed["title"],
+            "markdown": full_markdown,
+            "image_count": len(downloaded_images),
+        })
+
+    zip_buf = create_markdown_zip(parsed["title"], full_markdown, downloaded_images)
+    safe_title = re.sub(r'[<>:"/\\|?*]', '_', parsed["title"])[:80] or "article"
+    encoded_title = quote(safe_title)
+
+    return StreamingResponse(
+        zip_buf,
+        media_type="application/zip",
+        headers={"Content-Disposition": f"attachment; filename*=UTF-8''{encoded_title}.zip"}
+    )
+
+
+if __name__ == "__main__":
+ import uvicorn
+ uvicorn.run(app, host="0.0.0.0", port=8100)
diff --git a/scrapling-service/requirements.txt b/scrapling-service/requirements.txt
new file mode 100644
index 00000000..0d7c94d8
--- /dev/null
+++ b/scrapling-service/requirements.txt
@@ -0,0 +1,9 @@
+scrapling>=0.4.0
+fastapi>=0.100.0
+uvicorn>=0.20.0
+markdownify>=0.12.0
+httpx>=0.24.0
+Pillow>=10.0.0
+curl-cffi>=0.5.0
+playwright>=1.40.0
+browserforge>=1.1.0
diff --git a/server/api/public/v1/batch-download.post.ts b/server/api/public/v1/batch-download.post.ts
new file mode 100644
index 00000000..bacc7388
--- /dev/null
+++ b/server/api/public/v1/batch-download.post.ts
@@ -0,0 +1,105 @@
+/**
+ * Batch download endpoint using Scrapling service.
+ *
+ * POST /api/public/v1/batch-download
+ *
+ * Body:
+ * urls – array of WeChat article URLs (required, max 50)
+ * format – markdown-zip | markdown-json (default: markdown-zip); accepted but currently ignored by the handler
+ * output_dir – directory to save files (optional, for server-side output)
+ *
+ * This endpoint fetches each article's HTML and posts it to the Scrapling service, which
+ * parses it and saves the result under output_dir; the response is a JSON summary per URL.
+ */
+
+import { urlIsValidMpArticle } from '#shared/utils';
+import { USER_AGENT } from '~/config';
+
+const SCRAPLING_URL = process.env.SCRAPLING_SERVICE_URL || 'http://localhost:8100'; // Base URL of the Scrapling service; the SCRAPLING_SERVICE_URL env var overrides the localhost default.
+
+export default defineEventHandler(async event => {
+  // Always answers with JSON: ret = -1 for an invalid request, otherwise ret = 0 with
+  // per-URL `results` and `errors`. `?.` guards against a missing or empty request body.
+  const body = await readBody(event);
+  const urls: unknown = body?.urls;
+
+  if (!Array.isArray(urls) || urls.length === 0) {
+    return { base_resp: { ret: -1, err_msg: 'urls数组不能为空' } };
+  }
+
+  if (urls.length > 50) {
+    return { base_resp: { ret: -1, err_msg: '每次最多处理50篇文章' } };
+  }
+
+  // NOTE(security): output_dir is client-controlled and is forwarded to a service that
+  // writes files to disk. Restrict it to an allow-listed base directory before exposing
+  // this endpoint to untrusted callers; only the type is validated here.
+  const outputDir: string =
+    typeof body.output_dir === 'string' && body.output_dir ? body.output_dir : '/tmp/wechat-articles';
+  const results: any[] = [];
+  const errors: any[] = [];
+
+  for (const entry of urls) {
+    // Reported as-is if decoding fails; replaced by the decoded URL once available.
+    let url = String(entry);
+    try {
+      // Decoding happens inside the try so that a non-string entry or malformed
+      // percent-encoding fails only this URL instead of aborting the whole batch.
+      if (typeof entry !== 'string') {
+        throw new Error('url不合法');
+      }
+      url = decodeURIComponent(entry.trim());
+      if (!urlIsValidMpArticle(url)) {
+        throw new Error('url不合法');
+      }
+
+      // 1. Fetch article HTML with proper headers to avoid WeChat anti-bot
+      const pageResp = await fetch(url, {
+        headers: {
+          Referer: 'https://mp.weixin.qq.com/',
+          Origin: 'https://mp.weixin.qq.com',
+          'User-Agent': USER_AGENT,
+          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+          'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+        },
+        redirect: 'follow',
+      });
+      if (!pageResp.ok) {
+        // Do not hand an HTTP error page to the parser as if it were an article.
+        throw new Error(`Failed to fetch article: HTTP ${pageResp.status}`);
+      }
+      const rawHtml = await pageResp.text();
+
+      // 2. Send to Scrapling service for parsing and saving to disk
+      const scraplingResp = await fetch(`${SCRAPLING_URL}/parse-to-disk`, {
+        method: 'POST',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({ html: rawHtml, output_dir: outputDir }),
+      });
+      if (!scraplingResp.ok) {
+        throw new Error(`Scrapling error: ${await scraplingResp.text()}`);
+      }
+      const data = await scraplingResp.json() as any;
+      results.push({
+        url,
+        title: data.title,
+        path: data.path,
+        markdown_file: data.markdown_file,
+        image_count: data.image_count,
+        total_images: data.total_images,
+      });
+    } catch (error: unknown) {
+      errors.push({ url, error: error instanceof Error ? error.message : String(error) });
+    }
+  }
+
+  return {
+    base_resp: { ret: 0 },
+    total: urls.length,
+    success: results.length,
+    failed: errors.length,
+    results,
+    errors,
+    output_dir: outputDir,
+  };
+});
diff --git a/server/api/public/v1/download.get.ts b/server/api/public/v1/download.get.ts
index 0716b7a6..ff460442 100644
--- a/server/api/public/v1/download.get.ts
+++ b/server/api/public/v1/download.get.ts
@@ -3,13 +3,27 @@ import { urlIsValidMpArticle } from '#shared/utils';
import { normalizeHtml, parseCgiDataNew } from '#shared/utils/html';
import { USER_AGENT } from '~/config';
-interface SearchBizQuery {
+/**
+ * Enhanced download endpoint using Scrapling service for markdown+images.
+ *
+ * Query params:
+ * url – WeChat article URL (required)
+ * format – html | markdown | text | json | markdown-zip (default: html)
+ *
+ * `markdown-zip` format delegates to the Scrapling Python sidecar
+ * and returns a ZIP containing `.md` + `images/` folder.
+ */
+
+interface DownloadQuery { // Query-string parameters of the download endpoint: article `url` and output `format`.
url: string;
format: string;
}
+// Base URL of the Scrapling service: taken from the SCRAPLING_SERVICE_URL env var, falling back to http://localhost:8100.
+const SCRAPLING_URL = process.env.SCRAPLING_SERVICE_URL || 'http://localhost:8100';
+
export default defineEventHandler(async event => {
- const query = getQuery(event);
+ const query = getQuery(event);
if (!query.url) {
return {
base_resp: {
@@ -30,15 +44,16 @@ export default defineEventHandler(async event => {
}
const format: string = (query.format || 'html').toLowerCase();
- if (!['html', 'markdown', 'text', 'json'].includes(format)) {
+ if (!['html', 'markdown', 'text', 'json', 'markdown-zip'].includes(format)) {
return {
base_resp: {
ret: -1,
- err_msg: '不支持的format',
+ err_msg: '不支持的format,可选: html, markdown, text, json, markdown-zip',
},
};
}
+ // Fetch the raw HTML from WeChat
const rawHtml = await fetch(url, {
headers: {
Referer: 'https://mp.weixin.qq.com/',
@@ -47,6 +62,88 @@ export default defineEventHandler(async event => {
},
}).then(res => res.text());
+ // For markdown-zip: delegate to Scrapling service
+ if (format === 'markdown-zip') {
+ try {
+ const scraplingResp = await fetch(`${SCRAPLING_URL}/parse`, {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify({
+ html: rawHtml,
+ download_images: true,
+ output_format: 'zip',
+ }),
+ });
+
+ if (!scraplingResp.ok) {
+ const errorBody = await scraplingResp.text();
+ return {
+ base_resp: {
+ ret: -2,
+ err_msg: `Scrapling service error: ${errorBody}`,
+ },
+ };
+ }
+
+ // Stream the ZIP response back
+ const zipBuffer = await scraplingResp.arrayBuffer();
+ const contentDisposition = scraplingResp.headers.get('content-disposition') || 'attachment; filename="article.zip"';
+
+ return new Response(zipBuffer, {
+ status: 200,
+ headers: {
+ 'Content-Type': 'application/zip',
+ 'Content-Disposition': contentDisposition,
+ },
+ });
+ } catch (error: any) {
+ return {
+ base_resp: {
+ ret: -3,
+ err_msg: `Failed to connect to Scrapling service at ${SCRAPLING_URL}: ${error.message}`,
+ },
+ };
+ }
+ }
+
+ // For markdown: try Scrapling first, fallback to basic conversion
+ if (format === 'markdown') {
+ try {
+ const scraplingResp = await fetch(`${SCRAPLING_URL}/parse`, {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify({
+ html: rawHtml,
+ download_images: false,
+ output_format: 'json',
+ }),
+ });
+
+ if (scraplingResp.ok) {
+ const data = await scraplingResp.json() as any;
+ if (data.markdown) {
+ return new Response(data.markdown, {
+ status: 200,
+ headers: {
+ 'Content-Type': 'text/markdown; charset=UTF-8',
+ },
+ });
+ }
+ }
+ } catch {
+ // Scrapling service unavailable, fallback to basic conversion
+ }
+
+ // Fallback: basic TurndownService conversion
+ return new Response(new TurndownService().turndown(normalizeHtml(rawHtml, 'html')), {
+ status: 200,
+ headers: {
+ 'Content-Type': 'text/markdown; charset=UTF-8',
+ },
+ });
+ }
+
+ // Original formats
switch (format) {
case 'html':
return new Response(normalizeHtml(rawHtml, 'html'), {
@@ -62,13 +159,6 @@ export default defineEventHandler(async event => {
'Content-Type': 'text/plain; charset=UTF-8',
},
});
- case 'markdown':
- return new Response(new TurndownService().turndown(normalizeHtml(rawHtml, 'html')), {
- status: 200,
- headers: {
- 'Content-Type': 'text/markdown; charset=UTF-8',
- },
- });
case 'json':
return await parseCgiDataNew(rawHtml);
default: