diff --git a/.dockerignore b/.dockerignore index 207bdc74..5d9cc87f 100644 --- a/.dockerignore +++ b/.dockerignore @@ -4,7 +4,7 @@ dist .nuxt .nuxt-* -.output +# .output # temporarily allowed for local runtime build .gen .yarn/cache # Yarn 缓存 yarn-error.log diff --git a/Dockerfile.runtime b/Dockerfile.runtime new file mode 100644 index 00000000..d0887e28 --- /dev/null +++ b/Dockerfile.runtime @@ -0,0 +1,20 @@ +# Runtime-only Dockerfile: uses pre-built .output from host +FROM node:22-alpine + +LABEL maintainer="findsource@proton.me" \ + description="wechat-article-exporter Docker Image (local build)" + +WORKDIR /app + +# Copy pre-built output from host +COPY .output ./ + +# Create KV storage directory and set permissions +RUN mkdir -p .data/kv && chown -R node:node /app + +USER node +EXPOSE 3000 + +ENV NODE_ENV=production HOST=0.0.0.0 PORT=3000 + +ENTRYPOINT ["node", "server/index.mjs"] diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..9889504b --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,33 @@ +services: + # Main Nuxt/Nitro application + web: + image: wechat-article-exporter-web:local + build: + context: . 
+ dockerfile: Dockerfile.runtime + ports: + - "3000:3000" + environment: + - NODE_ENV=production + - HOST=0.0.0.0 + - PORT=3000 + - SCRAPLING_SERVICE_URL=http://scrapling:8100 + volumes: + - ./data/kv:/app/.data/kv + - ./data/articles:/tmp/wechat-articles + depends_on: + - scrapling + restart: unless-stopped + + # Scrapling Python service for article parsing + scrapling: + build: + context: ./scrapling-service + dockerfile: Dockerfile + ports: + - "8100:8100" + environment: + - PYTHONUNBUFFERED=1 + volumes: + - ./data/articles:/tmp/wechat-articles + restart: unless-stopped diff --git a/scrapling-service/Dockerfile b/scrapling-service/Dockerfile new file mode 100644 index 00000000..b5eac920 --- /dev/null +++ b/scrapling-service/Dockerfile @@ -0,0 +1,20 @@ +FROM python:3.12-slim + +WORKDIR /app + +# Install system dependencies for Scrapling +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY main.py . + +# Create output directory +RUN mkdir -p /tmp/wechat-articles + +EXPOSE 8100 + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8100"] diff --git a/scrapling-service/export_articles.py b/scrapling-service/export_articles.py new file mode 100644 index 00000000..aefd913d --- /dev/null +++ b/scrapling-service/export_articles.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +""" +WeChat Article Batch Export Script +=================================== +Takes article HTML files and exports them to markdown + images +using the Scrapling service. 
+ +Usage: + # Export a single article HTML file + python3 export_articles.py /path/to/article.html + + # Export multiple HTML files + python3 export_articles.py /path/to/*.html + + # Specify output directory + python3 export_articles.py --output /path/to/output /path/to/*.html + + # Use ZIP format (outputs individual ZIPs per article) + python3 export_articles.py --format zip /path/to/*.html + +Requirements: + - Scrapling service running at http://localhost:8100 +""" + +import argparse +import os +import sys +import json + +try: + import httpx +except ImportError: + print("Error: httpx not installed. Run: pip install httpx") + sys.exit(1) + + +def export_article(html_path: str, output_dir: str, scrapling_url: str, fmt: str = "disk") -> dict: + """Export a single article HTML file.""" + with open(html_path, "r", encoding="utf-8") as f: + html = f.read() + + if fmt == "disk": + resp = httpx.post(f"{scrapling_url}/parse-to-disk", json={ + "html": html, + "output_dir": output_dir, + }, timeout=120) + elif fmt == "zip": + resp = httpx.post(f"{scrapling_url}/parse", json={ + "html": html, + "download_images": True, + "output_format": "zip", + }, timeout=120) + + if resp.status_code == 200: + # Save ZIP + filename = os.path.splitext(os.path.basename(html_path))[0] + ".zip" + zip_path = os.path.join(output_dir, filename) + os.makedirs(output_dir, exist_ok=True) + with open(zip_path, "wb") as f: + f.write(resp.content) + return {"status": "ok", "path": zip_path, "size": len(resp.content)} + + if resp.status_code == 200: + return resp.json() + else: + return {"status": "error", "code": resp.status_code, "detail": resp.text} + + +def main(): + parser = argparse.ArgumentParser(description="Export WeChat article HTML to Markdown + Images") + parser.add_argument("files", nargs="+", help="HTML files to process") + parser.add_argument("--output", "-o", default="/tmp/wechat-articles", help="Output directory") + parser.add_argument("--format", "-f", choices=["disk", "zip"], 
default="disk", help="Output format") + parser.add_argument("--scrapling-url", default="http://localhost:8100", help="Scrapling service URL") + args = parser.parse_args() + + # Check service health + try: + health = httpx.get(f"{args.scrapling_url}/health", timeout=5) + if health.status_code != 200: + print(f"Error: Scrapling service at {args.scrapling_url} is not healthy") + sys.exit(1) + except Exception as e: + print(f"Error: Cannot connect to Scrapling service at {args.scrapling_url}: {e}") + sys.exit(1) + + print(f"Output directory: {args.output}") + print(f"Format: {args.format}") + print(f"Files to process: {len(args.files)}") + print() + + success = 0 + failed = 0 + + for i, filepath in enumerate(args.files): + if not os.path.exists(filepath): + print(f"[{i+1}/{len(args.files)}] ✗ File not found: {filepath}") + failed += 1 + continue + + result = export_article(filepath, args.output, args.scrapling_url, args.format) + + if result.get("status") == "ok": + title = result.get("title", os.path.basename(filepath)) + images = f"{result.get('image_count', '?')}/{result.get('total_images', '?')}" + print(f"[{i+1}/{len(args.files)}] ✓ {title[:60]} ({images} images)") + success += 1 + else: + print(f"[{i+1}/{len(args.files)}] ✗ {filepath}: {result.get('detail', 'Unknown error')[:100]}") + failed += 1 + + print(f"\nDone: {success} success, {failed} failed") + print(f"Output: {args.output}") + + +if __name__ == "__main__": + main() diff --git a/scrapling-service/main.py b/scrapling-service/main.py new file mode 100644 index 00000000..4addab3d --- /dev/null +++ b/scrapling-service/main.py @@ -0,0 +1,543 @@ +""" +WeChat Article Scrapling Service +--------------------------------- +A FastAPI microservice that uses Scrapling to parse WeChat article HTML, +extract content, download images, and produce Markdown with local image references. + +Modes: +1. POST /parse - Takes raw HTML, parses it, downloads images, returns markdown + images as ZIP +2. 
GET /fetch - Takes a WeChat article URL, fetches it (may hit captcha), parses, returns markdown ZIP +3. POST /parse-markdown - Takes raw HTML, returns just the markdown text (images as original URLs) +""" + +import asyncio +import hashlib +import io +import logging +import os +import re +import tempfile +import time +import zipfile +from pathlib import Path +from typing import Optional +from urllib.parse import urljoin, urlparse + +import httpx +from fastapi import FastAPI, HTTPException, Query +from fastapi.responses import StreamingResponse, JSONResponse +from markdownify import markdownify as md +from scrapling import Fetcher + +app = FastAPI(title="WeChat Article Scrapling Service", version="1.0.0") + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger("scrapling") + +# Image download timeout +IMAGE_TIMEOUT = 15 +# Max concurrent image downloads +MAX_CONCURRENT_IMAGES = 5 + + +def parse_wechat_html(html: str) -> dict: + """ + Use Scrapling to parse WeChat article HTML and extract structured content. + Returns dict with title, author, content_html, image_urls, publish_time. 
+ """ + from scrapling.parser import Adaptor + + page = Adaptor(html, auto_match=False) + + result = { + "title": "", + "author": "", + "account_name": "", + "content_html": "", + "image_urls": [], + "publish_time": "", + } + + # Extract title + title_el = page.find("#activity-name") or page.find(".rich_media_title") + if title_el: + result["title"] = title_el.text.strip() + else: + title_el = page.find("title") + if title_el: + result["title"] = title_el.text.strip() + + # Extract author + author_el = page.find("#js_author_name") or page.find(".rich_media_meta_text") + if author_el: + result["author"] = author_el.text.strip() + + # Extract account name + account_el = page.find("#js_name") or page.find(".account_nickname_inner") + if account_el: + result["account_name"] = account_el.text.strip() + + # Extract publish time + pub_time_el = page.find("#publish_time") + if pub_time_el: + result["publish_time"] = pub_time_el.text.strip() + + # Extract main content + content_el = page.find("#js_content") + if content_el: + content_html = content_el.html_content + + # Process images: convert data-src to src + content_html = re.sub( + r']*?)data-src="([^"]+)"', + r']*?)data-original="([^"]+)"', + lambda m: m.group(0) if 'src="' in m.group(0) else f']*?)data-src="([^"]+)"', + r']*?)data-original="([^"]+)"', + lambda m: m.group(0) if 'src="' in m.group(0) else f' str: + """ + Convert HTML content to Markdown, optionally replacing image URLs with local paths. 
+ """ + if image_map: + for orig_url, local_path in image_map.items(): + content_html = content_html.replace(orig_url, local_path) + + # Clean up the HTML before conversion + # Remove empty spans, divs etc + content_html = re.sub(r'<(span|div|section|p)\s+style="[^"]*">\s*', '', content_html) + + # Convert to markdown + markdown = md( + content_html, + heading_style="atx", + bullets="-", + strip=["script", "style", "iframe", "noscript"], + ) + + # Clean up excessive whitespace + markdown = re.sub(r'\n{3,}', '\n\n', markdown) + markdown = markdown.strip() + + return markdown + + +async def download_image(client: httpx.AsyncClient, url: str, semaphore: asyncio.Semaphore) -> tuple: + """Download a single image. Returns (url, bytes, content_type) or (url, None, None) on failure.""" + async with semaphore: + try: + # Clean URL + clean_url = url.split("&")[0] if "&" in url else url + + resp = await client.get(clean_url, timeout=IMAGE_TIMEOUT, follow_redirects=True) + if resp.status_code == 200: + content_type = resp.headers.get("content-type", "image/jpeg") + return (url, resp.content, content_type) + except Exception as e: + print(f"Failed to download image {url[:80]}: {e}") + + return (url, None, None) + + +async def download_images(image_urls: list) -> dict: + """ + Download all images concurrently. + Returns dict mapping original URL -> (bytes, extension). 
+ """ + results = {} + semaphore = asyncio.Semaphore(MAX_CONCURRENT_IMAGES) + + async with httpx.AsyncClient( + headers={ + "Referer": "https://mp.weixin.qq.com/", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + }, + follow_redirects=True, + ) as client: + tasks = [download_image(client, url, semaphore) for url in image_urls] + done = await asyncio.gather(*tasks) + + for url, data, content_type in done: + if data: + # Determine extension + ext = "jpg" + if content_type: + if "png" in content_type: + ext = "png" + elif "gif" in content_type: + ext = "gif" + elif "webp" in content_type: + ext = "webp" + elif "svg" in content_type: + ext = "svg" + + # Generate filename from URL hash + url_hash = hashlib.md5(url.encode()).hexdigest()[:12] + filename = f"{url_hash}.{ext}" + results[url] = (data, filename) + + return results + + +def create_markdown_zip(title: str, markdown: str, images: dict) -> io.BytesIO: + """ + Create a ZIP file containing: + - article.md (markdown with local image references) + - images/ directory with all downloaded images + """ + buf = io.BytesIO() + + # Sanitize title for directory name + safe_title = re.sub(r'[<>:"/\\|?*]', '_', title)[:80] or "article" + + with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf: + zf.writestr(f"{safe_title}/{safe_title}.md", markdown) + + for url, (data, filename) in images.items(): + zf.writestr(f"{safe_title}/images/{filename}", data) + + buf.seek(0) + return buf + + +def save_markdown_to_disk(output_dir: str, title: str, markdown: str, images: dict) -> str: + """ + Save markdown and images to disk. + Returns the path to the created directory. 
+ """ + safe_title = re.sub(r'[<>:"/\\|?*]', '_', title)[:80] or "article" + article_dir = os.path.join(output_dir, safe_title) + images_dir = os.path.join(article_dir, "images") + os.makedirs(images_dir, exist_ok=True) + + # Write markdown + md_path = os.path.join(article_dir, f"{safe_title}.md") + with open(md_path, "w", encoding="utf-8") as f: + f.write(markdown) + + # Write images + for url, (data, filename) in images.items(): + img_path = os.path.join(images_dir, filename) + with open(img_path, "wb") as f: + f.write(data) + + return article_dir + + +@app.get("/health") +async def health(): + return {"status": "ok", "service": "scrapling"} + + +@app.post("/parse") +async def parse_html(body: dict): + """ + Parse WeChat article HTML and return markdown with images as a ZIP. + + Request body: + { + "html": "", + "download_images": true, // optional, default true + "output_format": "zip" // "zip" or "json" + } + """ + html = body.get("html", "") + if not html: + raise HTTPException(status_code=400, detail="html field is required") + + download_imgs = body.get("download_images", True) + output_format = body.get("output_format", "zip") + + # Parse the HTML + parsed = parse_wechat_html(html) + + if not parsed["content_html"]: + html_len = len(html) + html_preview = html[:500].replace('\n', ' ') + logger.error( + f"parse 422: content_html empty. 
" + f"html_len={html_len}, title='{parsed.get('title', '')}', " + f"html_preview='{html_preview}'" + ) + raise HTTPException(status_code=422, detail="Could not extract article content from HTML") + + # Download images if requested + image_map = {} + downloaded_images = {} + + if download_imgs and parsed["image_urls"]: + downloaded_images = await download_images(parsed["image_urls"]) + # Build image map: original URL -> local relative path + for url, (data, filename) in downloaded_images.items(): + image_map[url] = f"./images/{filename}" + + # Convert to markdown + markdown = html_to_markdown(parsed["content_html"], image_map) + + # Add frontmatter + frontmatter = f"""--- +title: "{parsed['title']}" +author: "{parsed['author']}" +account: "{parsed['account_name']}" +date: "{parsed['publish_time']}" +--- + +""" + full_markdown = frontmatter + markdown + + if output_format == "json": + return JSONResponse({ + "title": parsed["title"], + "author": parsed["author"], + "account_name": parsed["account_name"], + "publish_time": parsed["publish_time"], + "markdown": full_markdown, + "image_count": len(downloaded_images), + "image_urls": parsed["image_urls"], + }) + + # Create ZIP + zip_buf = create_markdown_zip(parsed["title"], full_markdown, downloaded_images) + + safe_title = re.sub(r'[<>:"/\\|?*]', '_', parsed["title"])[:80] or "article" + # Use URL-encoded filename for non-ASCII compatibility + from urllib.parse import quote + encoded_title = quote(safe_title) + + return StreamingResponse( + zip_buf, + media_type="application/zip", + headers={ + "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_title}.zip" + } + ) + + +@app.post("/parse-to-disk") +async def parse_to_disk(body: dict): + """ + Parse WeChat article HTML and save markdown + images to disk. 
+ + Request body: + { + "html": "", + "output_dir": "/path/to/output" // optional, defaults to /tmp/wechat-articles + } + """ + html = body.get("html", "") + if not html: + raise HTTPException(status_code=400, detail="html field is required") + + output_dir = body.get("output_dir", "/tmp/wechat-articles") + + # Parse + parsed = parse_wechat_html(html) + if not parsed["content_html"]: + # Log diagnostic info for debugging + html_len = len(html) + html_preview = html[:500].replace('\n', ' ') + title_found = parsed.get("title", "") + logger.error( + f"parse-to-disk 422: content_html empty. " + f"html_len={html_len}, title='{title_found}', " + f"html_preview='{html_preview}'" + ) + raise HTTPException(status_code=422, detail="Could not extract article content") + + # Download images + downloaded_images = {} + image_map = {} + + if parsed["image_urls"]: + downloaded_images = await download_images(parsed["image_urls"]) + for url, (data, filename) in downloaded_images.items(): + image_map[url] = f"./images/{filename}" + + # Convert to markdown + markdown = html_to_markdown(parsed["content_html"], image_map) + + frontmatter = f"""--- +title: "{parsed['title']}" +author: "{parsed['author']}" +account: "{parsed['account_name']}" +date: "{parsed['publish_time']}" +--- + +""" + full_markdown = frontmatter + markdown + + # Save to disk + article_dir = save_markdown_to_disk(output_dir, parsed["title"], full_markdown, downloaded_images) + + return { + "status": "ok", + "title": parsed["title"], + "path": article_dir, + "markdown_file": os.path.join(article_dir, f"{re.sub(r'[<>:\"/\\\\|?*]', '_', parsed['title'])[:80]}.md"), + "image_count": len(downloaded_images), + "total_images": len(parsed["image_urls"]), + } + + +@app.get("/fetch") +async def fetch_article( + url: str = Query(..., description="WeChat article URL"), + output_format: str = Query("zip", description="Output format: zip or json"), +): + """ + Fetch a WeChat article by URL, parse it, and return markdown. 
+ Note: May fail due to WeChat captcha/anti-bot protection. + """ + try: + fetcher = Fetcher() + page = fetcher.get(url, headers={ + "Referer": "https://mp.weixin.qq.com/", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36" + }) + html = page.html_content + except Exception as e: + raise HTTPException(status_code=502, detail=f"Failed to fetch article: {e}") + + # Check if we hit a captcha/verification page + if 'captcha' in html.lower() or 'verify' in html.lower(): + raise HTTPException( + status_code=403, + detail="WeChat captcha/verification detected. Use /parse endpoint with pre-fetched HTML instead." + ) + + parsed = parse_wechat_html(html) + if not parsed["content_html"]: + raise HTTPException(status_code=422, detail="Could not extract article content") + + downloaded_images = await download_images(parsed["image_urls"]) + image_map = {url: f"./images/{fn}" for url, (_, fn) in downloaded_images.items()} + + markdown = html_to_markdown(parsed["content_html"], image_map) + frontmatter = f"""--- +title: "{parsed['title']}" +author: "{parsed['author']}" +account: "{parsed['account_name']}" +date: "{parsed['publish_time']}" +--- + +""" + full_markdown = frontmatter + markdown + + if output_format == "json": + return JSONResponse({ + "title": parsed["title"], + "markdown": full_markdown, + "image_count": len(downloaded_images), + }) + + zip_buf = create_markdown_zip(parsed["title"], full_markdown, downloaded_images) + safe_title = re.sub(r'[<>:"/\\|?*]', '_', parsed["title"])[:80] or "article" + from urllib.parse import quote + encoded_title = quote(safe_title) + + return StreamingResponse( + zip_buf, + media_type="application/zip", + headers={"Content-Disposition": f"attachment; filename*=UTF-8''{encoded_title}.zip"} + ) + + +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host="0.0.0.0", port=8100) diff --git a/scrapling-service/main_simple.py 
b/scrapling-service/main_simple.py new file mode 100644 index 00000000..ddf1f6da --- /dev/null +++ b/scrapling-service/main_simple.py @@ -0,0 +1,461 @@ +""" +WeChat Article Scrapling Service +--------------------------------- +A FastAPI microservice that uses Scrapling to parse WeChat article HTML, +extract content, download images, and produce Markdown with local image references. + +Modes: +1. POST /parse - Takes raw HTML, parses it, downloads images, returns markdown + images as ZIP +2. GET /fetch - Takes a WeChat article URL, fetches it (may hit captcha), parses, returns markdown ZIP +3. POST /parse-markdown - Takes raw HTML, returns just the markdown text (images as original URLs) +""" + +import asyncio +import hashlib +import io +import os +import re +import tempfile +import time +import zipfile +from pathlib import Path +from typing import Optional +from urllib.parse import urljoin, urlparse + +import httpx +from fastapi import FastAPI, HTTPException, Query +from fastapi.responses import StreamingResponse, JSONResponse +from markdownify import markdownify as md +from scrapling import Fetcher + +app = FastAPI(title="WeChat Article Scrapling Service", version="1.0.0") + +# Image download timeout +IMAGE_TIMEOUT = 15 +# Max concurrent image downloads +MAX_CONCURRENT_IMAGES = 5 + + +def parse_wechat_html(html: str) -> dict: + """ + Use Scrapling to parse WeChat article HTML and extract structured content. + Returns dict with title, author, content_html, image_urls, publish_time. 
+ """ + from scrapling.parser import Adaptor + + page = Adaptor(html, auto_match=False) + + result = { + "title": "", + "author": "", + "account_name": "", + "content_html": "", + "image_urls": [], + "publish_time": "", + } + + # Extract title + title_el = page.find("#activity-name") or page.find(".rich_media_title") + if title_el: + result["title"] = title_el.text.strip() + else: + title_el = page.find("title") + if title_el: + result["title"] = title_el.text.strip() + + # Extract author + author_el = page.find("#js_author_name") or page.find(".rich_media_meta_text") + if author_el: + result["author"] = author_el.text.strip() + + # Extract account name + account_el = page.find("#js_name") or page.find(".account_nickname_inner") + if account_el: + result["account_name"] = account_el.text.strip() + + # Extract publish time + pub_time_el = page.find("#publish_time") + if pub_time_el: + result["publish_time"] = pub_time_el.text.strip() + + # Extract main content + content_el = page.find("#js_content") + if content_el: + content_html = content_el.html_content + + # Process images: convert data-src to src + content_html = re.sub( + r']*?)data-src="([^"]+)"', + r']*?)data-original="([^"]+)"', + lambda m: m.group(0) if 'src="' in m.group(0) else f' str: + """ + Convert HTML content to Markdown, optionally replacing image URLs with local paths. 
+ """ + if image_map: + for orig_url, local_path in image_map.items(): + content_html = content_html.replace(orig_url, local_path) + + # Clean up the HTML before conversion + # Remove empty spans, divs etc + content_html = re.sub(r'<(span|div|section|p)\s+style="[^"]*">\s*', '', content_html) + + # Convert to markdown + markdown = md( + content_html, + heading_style="atx", + bullets="-", + strip=["script", "style", "iframe", "noscript"], + ) + + # Clean up excessive whitespace + markdown = re.sub(r'\n{3,}', '\n\n', markdown) + markdown = markdown.strip() + + return markdown + + +async def download_image(client: httpx.AsyncClient, url: str, semaphore: asyncio.Semaphore) -> tuple: + """Download a single image. Returns (url, bytes, content_type) or (url, None, None) on failure.""" + async with semaphore: + try: + # Clean URL + clean_url = url.split("&")[0] if "&" in url else url + + resp = await client.get(clean_url, timeout=IMAGE_TIMEOUT, follow_redirects=True) + if resp.status_code == 200: + content_type = resp.headers.get("content-type", "image/jpeg") + return (url, resp.content, content_type) + except Exception as e: + print(f"Failed to download image {url[:80]}: {e}") + + return (url, None, None) + + +async def download_images(image_urls: list) -> dict: + """ + Download all images concurrently. + Returns dict mapping original URL -> (bytes, extension). 
+ """ + results = {} + semaphore = asyncio.Semaphore(MAX_CONCURRENT_IMAGES) + + async with httpx.AsyncClient( + headers={ + "Referer": "https://mp.weixin.qq.com/", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + }, + follow_redirects=True, + ) as client: + tasks = [download_image(client, url, semaphore) for url in image_urls] + done = await asyncio.gather(*tasks) + + for url, data, content_type in done: + if data: + # Determine extension + ext = "jpg" + if content_type: + if "png" in content_type: + ext = "png" + elif "gif" in content_type: + ext = "gif" + elif "webp" in content_type: + ext = "webp" + elif "svg" in content_type: + ext = "svg" + + # Generate filename from URL hash + url_hash = hashlib.md5(url.encode()).hexdigest()[:12] + filename = f"{url_hash}.{ext}" + results[url] = (data, filename) + + return results + + +def create_markdown_zip(title: str, markdown: str, images: dict) -> io.BytesIO: + """ + Create a ZIP file containing: + - article.md (markdown with local image references) + - images/ directory with all downloaded images + """ + buf = io.BytesIO() + + # Sanitize title for directory name + safe_title = re.sub(r'[<>:"/\\|?*]', '_', title)[:80] or "article" + + with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf: + zf.writestr(f"{safe_title}/{safe_title}.md", markdown) + + for url, (data, filename) in images.items(): + zf.writestr(f"{safe_title}/images/{filename}", data) + + buf.seek(0) + return buf + + +def save_markdown_to_disk(output_dir: str, title: str, markdown: str, images: dict) -> str: + """ + Save markdown and images to disk. + Returns the path to the created directory. 
+ """ + safe_title = re.sub(r'[<>:"/\\|?*]', '_', title)[:80] or "article" + article_dir = os.path.join(output_dir, safe_title) + images_dir = os.path.join(article_dir, "images") + os.makedirs(images_dir, exist_ok=True) + + # Write markdown + md_path = os.path.join(article_dir, f"{safe_title}.md") + with open(md_path, "w", encoding="utf-8") as f: + f.write(markdown) + + # Write images + for url, (data, filename) in images.items(): + img_path = os.path.join(images_dir, filename) + with open(img_path, "wb") as f: + f.write(data) + + return article_dir + + +@app.get("/health") +async def health(): + return {"status": "ok", "service": "scrapling"} + + +@app.post("/parse") +async def parse_html(body: dict): + """ + Parse WeChat article HTML and return markdown with images as a ZIP. + + Request body: + { + "html": "", + "download_images": true, // optional, default true + "output_format": "zip" // "zip" or "json" + } + """ + html = body.get("html", "") + if not html: + raise HTTPException(status_code=400, detail="html field is required") + + download_imgs = body.get("download_images", True) + output_format = body.get("output_format", "zip") + + # Parse the HTML + parsed = parse_wechat_html(html) + + if not parsed["content_html"]: + raise HTTPException(status_code=422, detail="Could not extract article content from HTML") + + # Download images if requested + image_map = {} + downloaded_images = {} + + if download_imgs and parsed["image_urls"]: + downloaded_images = await download_images(parsed["image_urls"]) + # Build image map: original URL -> local relative path + for url, (data, filename) in downloaded_images.items(): + image_map[url] = f"./images/{filename}" + + # Convert to markdown + markdown = html_to_markdown(parsed["content_html"], image_map) + + # Add frontmatter + frontmatter = f"""--- +title: "{parsed['title']}" +author: "{parsed['author']}" +account: "{parsed['account_name']}" +date: "{parsed['publish_time']}" +--- + +""" + full_markdown = frontmatter + 
markdown + + if output_format == "json": + return JSONResponse({ + "title": parsed["title"], + "author": parsed["author"], + "account_name": parsed["account_name"], + "publish_time": parsed["publish_time"], + "markdown": full_markdown, + "image_count": len(downloaded_images), + "image_urls": parsed["image_urls"], + }) + + # Create ZIP + zip_buf = create_markdown_zip(parsed["title"], full_markdown, downloaded_images) + + safe_title = re.sub(r'[<>:"/\\|?*]', '_', parsed["title"])[:80] or "article" + # Use URL-encoded filename for non-ASCII compatibility + from urllib.parse import quote + encoded_title = quote(safe_title) + + return StreamingResponse( + zip_buf, + media_type="application/zip", + headers={ + "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_title}.zip" + } + ) + + +@app.post("/parse-to-disk") +async def parse_to_disk(body: dict): + """ + Parse WeChat article HTML and save markdown + images to disk. + + Request body: + { + "html": "", + "output_dir": "/path/to/output" // optional, defaults to /tmp/wechat-articles + } + """ + html = body.get("html", "") + if not html: + raise HTTPException(status_code=400, detail="html field is required") + + output_dir = body.get("output_dir", "/tmp/wechat-articles") + + # Parse + parsed = parse_wechat_html(html) + if not parsed["content_html"]: + raise HTTPException(status_code=422, detail="Could not extract article content") + + # Download images + downloaded_images = {} + image_map = {} + + if parsed["image_urls"]: + downloaded_images = await download_images(parsed["image_urls"]) + for url, (data, filename) in downloaded_images.items(): + image_map[url] = f"./images/{filename}" + + # Convert to markdown + markdown = html_to_markdown(parsed["content_html"], image_map) + + frontmatter = f"""--- +title: "{parsed['title']}" +author: "{parsed['author']}" +account: "{parsed['account_name']}" +date: "{parsed['publish_time']}" +--- + +""" + full_markdown = frontmatter + markdown + + # Save to disk + article_dir 
= save_markdown_to_disk(output_dir, parsed["title"], full_markdown, downloaded_images) + + return { + "status": "ok", + "title": parsed["title"], + "path": article_dir, + "markdown_file": os.path.join(article_dir, f"{re.sub(r'[<>:\"/\\\\|?*]', '_', parsed['title'])[:80]}.md"), + "image_count": len(downloaded_images), + "total_images": len(parsed["image_urls"]), + } + + +@app.get("/fetch") +async def fetch_article( + url: str = Query(..., description="WeChat article URL"), + output_format: str = Query("zip", description="Output format: zip or json"), +): + """ + Fetch a WeChat article by URL, parse it, and return markdown. + Note: May fail due to WeChat captcha/anti-bot protection. + """ + try: + fetcher = Fetcher() + page = fetcher.get(url, headers={ + "Referer": "https://mp.weixin.qq.com/", + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36" + }) + html = page.html_content + except Exception as e: + raise HTTPException(status_code=502, detail=f"Failed to fetch article: {e}") + + # Check if we hit a captcha/verification page + if 'captcha' in html.lower() or 'verify' in html.lower(): + raise HTTPException( + status_code=403, + detail="WeChat captcha/verification detected. Use /parse endpoint with pre-fetched HTML instead." 
+ ) + + parsed = parse_wechat_html(html) + if not parsed["content_html"]: + raise HTTPException(status_code=422, detail="Could not extract article content") + + downloaded_images = await download_images(parsed["image_urls"]) + image_map = {url: f"./images/{fn}" for url, (_, fn) in downloaded_images.items()} + + markdown = html_to_markdown(parsed["content_html"], image_map) + frontmatter = f"""--- +title: "{parsed['title']}" +author: "{parsed['author']}" +account: "{parsed['account_name']}" +date: "{parsed['publish_time']}" +--- + +""" + full_markdown = frontmatter + markdown + + if output_format == "json": + return JSONResponse({ + "title": parsed["title"], + "markdown": full_markdown, + "image_count": len(downloaded_images), + }) + + zip_buf = create_markdown_zip(parsed["title"], full_markdown, downloaded_images) + safe_title = re.sub(r'[<>:"/\\|?*]', '_', parsed["title"])[:80] or "article" + from urllib.parse import quote + encoded_title = quote(safe_title) + + return StreamingResponse( + zip_buf, + media_type="application/zip", + headers={"Content-Disposition": f"attachment; filename*=UTF-8''{encoded_title}.zip"} + ) + + +if __name__ == "__main__": + import uvicorn + uvicorn.run(app, host="0.0.0.0", port=8100) diff --git a/scrapling-service/requirements.txt b/scrapling-service/requirements.txt new file mode 100644 index 00000000..0d7c94d8 --- /dev/null +++ b/scrapling-service/requirements.txt @@ -0,0 +1,9 @@ +scrapling>=0.4.0 +fastapi>=0.100.0 +uvicorn>=0.20.0 +markdownify>=0.12.0 +httpx>=0.24.0 +Pillow>=10.0.0 +curl-cffi>=0.5.0 +playwright>=1.40.0 +browserforge>=1.1.0 diff --git a/server/api/public/v1/batch-download.post.ts b/server/api/public/v1/batch-download.post.ts new file mode 100644 index 00000000..bacc7388 --- /dev/null +++ b/server/api/public/v1/batch-download.post.ts @@ -0,0 +1,105 @@ +/** + * Batch download endpoint using Scrapling service. 
/**
 * Batch download endpoint using Scrapling service.
 *
 * POST /api/public/v1/batch-download
 *
 * Body:
 *   urls       – array of WeChat article URLs (required, max 50)
 *   format     – accepted for compatibility; currently not used by this handler
 *   output_dir – directory the Scrapling service saves to (optional); must be
 *                the base output directory or a path inside it
 *
 * For each URL this endpoint fetches the article HTML and posts it to the
 * Scrapling service's /parse-to-disk endpoint. The response is a JSON summary
 * (per-article results and errors), not a ZIP: files are written server-side.
 */

import path from 'node:path';

import { urlIsValidMpArticle } from '#shared/utils';
import { USER_AGENT } from '~/config';

const SCRAPLING_URL = process.env.SCRAPLING_SERVICE_URL || 'http://localhost:8100';

// Root under which callers may place output. Defaults to the directory the
// handler previously used when no output_dir was supplied.
// NOTE(review): SCRAPLING_OUTPUT_DIR is a new, optional override – confirm the name.
const OUTPUT_BASE_DIR = process.env.SCRAPLING_OUTPUT_DIR || '/tmp/wechat-articles';

const MAX_URLS_PER_REQUEST = 50;

/**
 * Resolve the requested output directory against OUTPUT_BASE_DIR.
 * Returns null when the request is not a string or escapes the base directory,
 * so a public caller cannot choose an arbitrary write location.
 */
function resolveOutputDir(requested: unknown): string | null {
  if (requested === undefined || requested === null || requested === '') {
    return OUTPUT_BASE_DIR;
  }
  if (typeof requested !== 'string') {
    return null;
  }
  const base = path.resolve(OUTPUT_BASE_DIR);
  const resolved = path.resolve(base, requested);
  const rel = path.relative(base, resolved);
  const escapesBase = rel === '..' || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel);
  return escapesBase ? null : resolved;
}

/** Trim and percent-decode one entry of `urls`; null when it is not a decodable string. */
function normalizeArticleUrl(raw: unknown): string | null {
  if (typeof raw !== 'string') {
    return null;
  }
  try {
    return decodeURIComponent(raw.trim());
  } catch {
    // decodeURIComponent throws URIError on malformed escape sequences.
    return null;
  }
}

/** Fetch one article and have the Scrapling service save it; throws on any failure. */
async function exportArticle(url: string, outputDir: string): Promise<Record<string, unknown>> {
  // 1. Fetch article HTML with proper headers to avoid WeChat anti-bot
  const articleResp = await fetch(url, {
    headers: {
      Referer: 'https://mp.weixin.qq.com/',
      Origin: 'https://mp.weixin.qq.com',
      'User-Agent': USER_AGENT,
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    },
    redirect: 'follow',
  });
  if (!articleResp.ok) {
    // Do not forward an HTTP error page to the parser as if it were an article.
    throw new Error(`Failed to fetch article: HTTP ${articleResp.status}`);
  }
  const rawHtml = await articleResp.text();

  // 2. Send to Scrapling service for parsing and saving to disk
  const scraplingResp = await fetch(`${SCRAPLING_URL}/parse-to-disk`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      html: rawHtml,
      output_dir: outputDir,
    }),
  });
  if (!scraplingResp.ok) {
    const errorText = await scraplingResp.text();
    throw new Error(`Scrapling error: ${errorText}`);
  }

  const data = (await scraplingResp.json()) as any;
  return {
    url,
    title: data.title,
    path: data.path,
    markdown_file: data.markdown_file,
    image_count: data.image_count,
    total_images: data.total_images,
  };
}

export default defineEventHandler(async event => {
  // readBody may yield nothing when the request carries no body.
  const body = (await readBody(event)) ?? {};

  if (!Array.isArray(body.urls) || body.urls.length === 0) {
    return {
      base_resp: {
        ret: -1,
        err_msg: 'urls数组不能为空',
      },
    };
  }

  if (body.urls.length > MAX_URLS_PER_REQUEST) {
    return {
      base_resp: {
        ret: -1,
        err_msg: '每次最多处理50篇文章',
      },
    };
  }

  const outputDir = resolveOutputDir(body.output_dir);
  if (outputDir === null) {
    return {
      base_resp: {
        ret: -1,
        err_msg: 'output_dir不合法',
      },
    };
  }

  const results: any[] = [];
  const errors: any[] = [];

  // Articles are processed one at a time; a failure is recorded for that URL
  // only and never aborts the rest of the batch.
  for (const rawUrl of body.urls) {
    const url = normalizeArticleUrl(rawUrl);

    if (url === null || !urlIsValidMpArticle(url)) {
      errors.push({ url: url ?? rawUrl, error: 'url不合法' });
      continue;
    }

    try {
      results.push(await exportArticle(url, outputDir));
    } catch (error: unknown) {
      errors.push({ url, error: error instanceof Error ? error.message : String(error) });
    }
  }

  return {
    base_resp: { ret: 0 },
    total: body.urls.length,
    success: results.length,
    failed: errors.length,
    results,
    errors,
    output_dir: outputDir,
  };
});
+ */ + +interface DownloadQuery { url: string; format: string; } +// Scrapling service URL – can be set via env or defaults to localhost +const SCRAPLING_URL = process.env.SCRAPLING_SERVICE_URL || 'http://localhost:8100'; + export default defineEventHandler(async event => { - const query = getQuery<SearchBizQuery>(event); + const query = getQuery<DownloadQuery>(event); if (!query.url) { return { base_resp: { @@ -30,15 +44,16 @@ export default defineEventHandler(async event => { } const format: string = (query.format || 'html').toLowerCase(); - if (!['html', 'markdown', 'text', 'json'].includes(format)) { + if (!['html', 'markdown', 'text', 'json', 'markdown-zip'].includes(format)) { return { base_resp: { ret: -1, - err_msg: '不支持的format', + err_msg: '不支持的format,可选: html, markdown, text, json, markdown-zip', }, }; } + // Fetch the raw HTML from WeChat const rawHtml = await fetch(url, { headers: { Referer: 'https://mp.weixin.qq.com/', @@ -47,6 +62,88 @@ export default defineEventHandler(async event => { }, }).then(res => res.text()); + // For markdown-zip: delegate to Scrapling service + if (format === 'markdown-zip') { + try { + const scraplingResp = await fetch(`${SCRAPLING_URL}/parse`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + html: rawHtml, + download_images: true, + output_format: 'zip', + }), + }); + + if (!scraplingResp.ok) { + const errorBody = await scraplingResp.text(); + return { + base_resp: { + ret: -2, + err_msg: `Scrapling service error: ${errorBody}`, + }, + }; + } + + // Stream the ZIP response back + const zipBuffer = await scraplingResp.arrayBuffer(); + const contentDisposition = scraplingResp.headers.get('content-disposition') || 'attachment; filename="article.zip"'; + + return new Response(zipBuffer, { + status: 200, + headers: { + 'Content-Type': 'application/zip', + 'Content-Disposition': contentDisposition, + }, + }); + } catch (error: any) { + return { + base_resp: { + ret: -3, + err_msg: 
`Failed to connect to Scrapling service at ${SCRAPLING_URL}: ${error.message}`, + }, + }; + } + } + + // For markdown: try Scrapling first, fallback to basic conversion + if (format === 'markdown') { + try { + const scraplingResp = await fetch(`${SCRAPLING_URL}/parse`, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + html: rawHtml, + download_images: false, + output_format: 'json', + }), + }); + + if (scraplingResp.ok) { + const data = await scraplingResp.json() as any; + if (data.markdown) { + return new Response(data.markdown, { + status: 200, + headers: { + 'Content-Type': 'text/markdown; charset=UTF-8', + }, + }); + } + } + } catch { + // Scrapling service unavailable, fallback to basic conversion + } + + // Fallback: basic TurndownService conversion + return new Response(new TurndownService().turndown(normalizeHtml(rawHtml, 'html')), { + status: 200, + headers: { + 'Content-Type': 'text/markdown; charset=UTF-8', + }, + }); + } + + // Original formats switch (format) { case 'html': return new Response(normalizeHtml(rawHtml, 'html'), { @@ -62,13 +159,6 @@ export default defineEventHandler(async event => { 'Content-Type': 'text/plain; charset=UTF-8', }, }); - case 'markdown': - return new Response(new TurndownService().turndown(normalizeHtml(rawHtml, 'html')), { - status: 200, - headers: { - 'Content-Type': 'text/markdown; charset=UTF-8', - }, - }); case 'json': return await parseCgiDataNew(rawHtml); default: