Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ dist

.nuxt
.nuxt-*
.output
# .output # temporarily allowed for local runtime build
.gen
# Yarn cache
.yarn/cache
yarn-error.log
Expand Down
20 changes: 20 additions & 0 deletions Dockerfile.runtime
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Runtime-only Dockerfile: uses the pre-built .output directory from the host.
# Nothing is compiled or installed inside the image.
FROM node:22-alpine

LABEL maintainer="findsource@proton.me" \
      description="wechat-article-exporter Docker Image (local build)"

WORKDIR /app

# Copy the pre-built output from the host and assign ownership at copy time.
# A later recursive chown would duplicate every copied file in a second layer.
COPY --chown=node:node .output ./

# Create the KV storage directory. Only /app and the freshly created
# directories still belong to root at this point, so a non-recursive chown is
# enough to leave the whole tree owned by the unprivileged user.
RUN mkdir -p .data/kv && chown node:node /app .data .data/kv

# Drop privileges before the server starts.
USER node
EXPOSE 3000

ENV NODE_ENV=production HOST=0.0.0.0 PORT=3000

# Exec form so node runs as PID 1 and receives SIGTERM from `docker stop`.
ENTRYPOINT ["node", "server/index.mjs"]
33 changes: 33 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
services:
  # Main Nuxt/Nitro application, built from the runtime-only Dockerfile.
  web:
    image: wechat-article-exporter-web:local
    build:
      context: .
      dockerfile: Dockerfile.runtime
    ports:
      - "3000:3000"
    environment:
      NODE_ENV: "production"
      HOST: "0.0.0.0"
      PORT: "3000"
      # The parser is reached through its compose service name.
      SCRAPLING_SERVICE_URL: "http://scrapling:8100"
    volumes:
      - ./data/kv:/app/.data/kv
      # Same host directory as the scrapling service mounts below.
      - ./data/articles:/tmp/wechat-articles
    depends_on:
      - scrapling
    restart: unless-stopped

  # Scrapling Python service for article parsing.
  scrapling:
    build:
      context: ./scrapling-service
      dockerfile: Dockerfile
    ports:
      - "8100:8100"
    environment:
      PYTHONUNBUFFERED: "1"
    volumes:
      - ./data/articles:/tmp/wechat-articles
    restart: unless-stopped
20 changes: 20 additions & 0 deletions scrapling-service/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
FROM python:3.12-slim

WORKDIR /app

# Install system dependencies for Scrapling. The apt package lists are removed
# in the same layer so they never end up in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
      gcc \
    && rm -rf /var/lib/apt/lists/*

# Copy the dependency manifest on its own first, so the pip layer is reused
# when only the application source changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY main.py .

# Create the output directory
RUN mkdir -p /tmp/wechat-articles

EXPOSE 8100

# Probe the service with the interpreter that is already in the image (the slim
# base ships no curl/wget). urlopen raises on connection failures and HTTP
# errors, so the probe exits non-zero in both cases.
# NOTE(review): /health is the endpoint export_articles.py checks before a
# batch run; confirm main.py serves it.
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
  CMD ["python", "-c", "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8100/health', timeout=3)"]

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8100"]
117 changes: 117 additions & 0 deletions scrapling-service/export_articles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#!/usr/bin/env python3
"""
WeChat Article Batch Export Script
===================================
Takes article HTML files and exports them to markdown + images
using the Scrapling service.

Usage:
# Export a single article HTML file
python3 export_articles.py /path/to/article.html

# Export multiple HTML files
python3 export_articles.py /path/to/*.html

# Specify output directory
python3 export_articles.py --output /path/to/output /path/to/*.html

# Use ZIP format (outputs individual ZIPs per article)
python3 export_articles.py --format zip /path/to/*.html

Requirements:
- httpx installed (pip install httpx)
- Scrapling service running (default: http://localhost:8100, override with --scrapling-url)
"""

import argparse
import os
import sys
import json

try:
import httpx
except ImportError:
print("Error: httpx not installed. Run: pip install httpx")
sys.exit(1)


def export_article(html_path: str, output_dir: str, scrapling_url: str, fmt: str = "disk") -> dict:
    """Export a single article HTML file through the Scrapling service.

    Args:
        html_path: Path of the HTML file to send; it is read as UTF-8.
        output_dir: Directory passed to the service for ``disk`` exports, or
            the local directory the ZIP is written to for ``zip`` exports.
        scrapling_url: Base URL of the Scrapling service, without a trailing slash.
        fmt: ``"disk"`` posts to ``/parse-to-disk`` and returns the service's
            JSON reply; ``"zip"`` posts to ``/parse`` and saves the response
            body as ``<html basename>.zip`` inside ``output_dir``.

    Returns:
        For a successful ``disk`` export, the decoded JSON reply of the service.
        For a successful ``zip`` export,
        ``{"status": "ok", "path": ..., "size": ...}``.
        On any failure, ``{"status": "error", "code": ..., "detail": ...}``,
        where ``code`` is the HTTP status, or ``None`` when no response was
        received or the failure was local.

    Raises:
        ValueError: If ``fmt`` is neither ``"disk"`` nor ``"zip"``.
    """
    # Validate up front: an unknown format used to fall through both branches
    # and fail later with an UnboundLocalError on ``resp``.
    if fmt not in ("disk", "zip"):
        raise ValueError(f"Unsupported format: {fmt!r} (expected 'disk' or 'zip')")

    try:
        with open(html_path, "r", encoding="utf-8") as f:
            html = f.read()
    except (OSError, UnicodeDecodeError) as exc:
        return {"status": "error", "code": None, "detail": f"Cannot read {html_path}: {exc}"}

    if fmt == "disk":
        endpoint = "parse-to-disk"
        payload = {
            "html": html,
            "output_dir": output_dir,
        }
    else:
        endpoint = "parse"
        payload = {
            "html": html,
            "download_images": True,
            "output_format": "zip",
        }

    try:
        resp = httpx.post(f"{scrapling_url}/{endpoint}", json=payload, timeout=120)
    except httpx.HTTPError as exc:
        # Timeouts and connection errors are reported per file so that one bad
        # request does not abort a whole batch.
        return {"status": "error", "code": None, "detail": f"Request failed: {exc}"}

    if resp.status_code != 200:
        return {"status": "error", "code": resp.status_code, "detail": resp.text}

    if fmt == "zip":
        # Save ZIP next to the other outputs, named after the source HTML file.
        filename = os.path.splitext(os.path.basename(html_path))[0] + ".zip"
        zip_path = os.path.join(output_dir, filename)
        try:
            os.makedirs(output_dir, exist_ok=True)
            with open(zip_path, "wb") as f:
                f.write(resp.content)
        except OSError as exc:
            return {"status": "error", "code": resp.status_code, "detail": f"Cannot write {zip_path}: {exc}"}
        return {"status": "ok", "path": zip_path, "size": len(resp.content)}

    try:
        return resp.json()
    except ValueError as exc:
        # A 200 reply whose body is not valid JSON.
        return {"status": "error", "code": resp.status_code, "detail": f"Invalid JSON response: {exc}"}


def main():
    """Command-line entry point: export every given HTML file via the Scrapling service.

    Parses the arguments, checks that ``<scrapling-url>/health`` answers with
    HTTP 200 (exiting with status 1 otherwise), then calls ``export_article``
    once per file and prints a per-file result line followed by a summary.
    A failure on one file is counted and reported; it never stops the
    remaining files from being processed.
    """
    parser = argparse.ArgumentParser(description="Export WeChat article HTML to Markdown + Images")
    parser.add_argument("files", nargs="+", help="HTML files to process")
    parser.add_argument("--output", "-o", default="/tmp/wechat-articles", help="Output directory")
    parser.add_argument("--format", "-f", choices=["disk", "zip"], default="disk", help="Output format")
    parser.add_argument("--scrapling-url", default="http://localhost:8100", help="Scrapling service URL")
    args = parser.parse_args()

    # Check service health before doing any work.
    try:
        health = httpx.get(f"{args.scrapling_url}/health", timeout=5)
        if health.status_code != 200:
            print(f"Error: Scrapling service at {args.scrapling_url} is not healthy")
            sys.exit(1)
    except Exception as e:
        print(f"Error: Cannot connect to Scrapling service at {args.scrapling_url}: {e}")
        sys.exit(1)

    print(f"Output directory: {args.output}")
    print(f"Format: {args.format}")
    print(f"Files to process: {len(args.files)}")
    print()

    success = 0
    failed = 0
    total = len(args.files)

    for i, filepath in enumerate(args.files):
        prefix = f"[{i+1}/{total}]"

        if not os.path.exists(filepath):
            print(f"{prefix} ✗ File not found: {filepath}")
            failed += 1
            continue

        # Keep the batch going even if a single export raises unexpectedly.
        try:
            result = export_article(filepath, args.output, args.scrapling_url, args.format)
        except Exception as e:
            result = {"status": "error", "detail": f"{type(e).__name__}: {e}"}

        # The service reply is decoded JSON and is not guaranteed to be an object.
        if not isinstance(result, dict):
            result = {"status": "error", "detail": f"Unexpected response: {result!r}"}

        if result.get("status") == "ok":
            # Coerce to str so a null or non-string title cannot break the slice.
            title = str(result.get("title") or os.path.basename(filepath))
            images = f"{result.get('image_count', '?')}/{result.get('total_images', '?')}"
            print(f"{prefix} ✓ {title[:60]} ({images} images)")
            success += 1
        else:
            # ``detail`` may be missing, null, or structured; always slice a string.
            detail = str(result.get("detail") or "Unknown error")
            print(f"{prefix} ✗ {filepath}: {detail[:100]}")
            failed += 1

    print(f"\nDone: {success} success, {failed} failed")
    print(f"Output: {args.output}")


if __name__ == "__main__":
main()
Loading