Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,18 @@ COPY superscrape/ superscrape/
COPY api/ api/

# Install Python deps (non-editable for Docker)
RUN pip install --no-cache-dir .
RUN pip install --no-cache-dir ".[api]"

# Download Camoufox browser binary
RUN python -c "from camoufox.sync_api import Camoufox; print('Camoufox binary ready')" || true
# Download Camoufox browser binary (FF135 from daijro/camoufox releases)
# Also install Playwright system deps for the bundled Firefox
RUN python -m camoufox fetch && \
python -c "import camoufox, glob, os; d=os.path.dirname(camoufox.__file__); print('Binary:', glob.glob(f'{d}/**/firefox*', recursive=True)[:3])"

EXPOSE 8001

HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \
CMD curl -f http://localhost:8001/health || exit 1

# xvfb-run provides virtual display for headless Camoufox
CMD xvfb-run --auto-servernum --server-args="-screen 0 1280x720x24" \
uvicorn api.main:app --host 0.0.0.0 --port 8001
# Camoufox headless='virtual' manages its own Xvfb internally.
# Do NOT wrap with xvfb-run — dual Xvfb causes display conflicts.
CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "8001"]
4 changes: 4 additions & 0 deletions api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

from api.routes.benchmarks import router as benchmarks_router
from api.routes.exports import router as exports_router
from api.routes.jobs import router as jobs_router
from api.routes.listing import router as listing_router
from api.routes.monitor import router as monitor_router
from api.routes.reports import router as reports_router
from api.routes.uploads import router as uploads_router
Expand Down Expand Up @@ -61,10 +63,12 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]:

# ── Routers ───────────────────────────────────────────────────────────────────
app.include_router(jobs_router)
app.include_router(listing_router)
app.include_router(reports_router)
app.include_router(uploads_router)
app.include_router(exports_router)
app.include_router(monitor_router)
app.include_router(benchmarks_router)


@app.get("/health", tags=["meta"])
Expand Down
124 changes: 124 additions & 0 deletions api/reports/html_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -514,12 +514,132 @@ def _build_lifestyle_section(
# ── Main Builder ──────────────────────────────────────────────────────────────


def _build_benchmark_section(
    benchmark: object,
    report: "CategoryVisualReport",
    lang: str = "en",
) -> str:
    """Build the Category Benchmark comparison section.

    The section is a single card containing, in order: four KPI stat cards,
    the first three entries of ``missing_slots_ranking`` as "opportunity"
    tiles, paired bars comparing the report's image-type distribution with
    the benchmark's, and (only when present) the price-tier distribution.

    Args:
        benchmark: Either a dict or an object exposing ``model_dump()``;
            anything else is treated as "no benchmark".
        report: Object whose ``image_type_distribution`` mapping feeds the
            "Your Competitors" bars. A falsy report renders those bars as 0%.
        lang: ``"zh"`` selects the Chinese labels; any other value uses
            the English ones.

    Returns:
        The section's HTML, or ``""`` when there is no benchmark data or its
        ``total_products`` is missing/zero.
    """
    # Function-scope import so the block does not depend on file-level imports.
    from html import escape

    if isinstance(benchmark, dict):
        b = benchmark
    elif hasattr(benchmark, "model_dump"):
        b = benchmark.model_dump()
    else:
        b = {}
    if not b or not b.get("total_products"):
        return ""

    def num(key: str) -> float:
        # A key that is present but None must not reach a numeric format spec.
        return b.get(key) or 0

    def mapping(key: str) -> dict:
        # Same reasoning as num(): tolerate an explicit None value.
        return b.get(key) or {}

    def text(value: object) -> str:
        # Slot / image-type / tier names come from data, not from this module,
        # so they are escaped before being placed into markup.
        return escape(str(value))

    def bar_width(pct: float) -> str:
        # Bars occupy at most 80% of the row; clamp so bad data cannot
        # produce a negative or overflowing CSS width.
        return f"{max(0, min(pct, 100)) * 0.8:.1f}"

    is_zh = lang == "zh"
    title = "品类基准线 (80K 图片数据)" if is_zh else "Category Benchmark (80K Image Dataset)"
    vs_label = "你的竞品" if is_zh else "Your Competitors"
    bench_label = "品类基准" if is_zh else "Category Benchmark"
    missing_word = "缺少" if is_zh else "missing"

    # ── Killer stat cards ──
    kpis = [
        (f"{num('total_products'):,}", "基准产品数" if is_zh else "Benchmark Products"),
        (f"{num('total_images'):,}", "基准图片数" if is_zh else "Benchmark Images"),
        (f"{num('aplus_adoption_rate'):.0f}%", "有A+内容" if is_zh else "Have A+ Content"),
        (f"{num('quality_avg'):.1f}/5", "平均图片质量" if is_zh else "Avg Image Quality"),
    ]
    card_parts = ['<div class="kpi-grid">']
    for value, caption in kpis:
        card_parts.append(
            f'''<div class="kpi-card">
  <div class="kpi-value">{value}</div>
  <div class="kpi-label">{caption}</div>
</div>'''
        )
    card_parts.append("</div>")
    cards_html = "".join(card_parts)

    # ── Missing slots (opportunities) ──
    opp_title = "最大机会点" if is_zh else "Biggest Opportunities"
    opp_parts = [
        f'<h3 style="font-size:13px;font-weight:600;margin:16px 0 10px;color:#374151;">{opp_title}</h3>',
        '<div style="display:flex;gap:10px;flex-wrap:wrap;">',
    ]
    # Only the first three entries are shown, in the mapping's own order.
    for slot, pct in list(mapping("missing_slots_ranking").items())[:3]:
        pct = pct or 0
        opp_parts.append(
            f'''<div style="flex:1;min-width:180px;background:#FEF3C7;border:1px solid #FDE68A;border-radius:8px;padding:12px;">
  <div style="font-size:22px;font-weight:700;color:#D97706;">{pct:.0f}%</div>
  <div style="font-size:11px;color:#92400E;margin-top:2px;">{missing_word} <strong>{text(slot)}</strong></div>
</div>'''
        )
    opp_parts.append("</div>")
    opp_html = "".join(opp_parts)

    # ── Image type comparison bars ──
    comp_title = "图片类型对比" if is_zh else "Image Type Comparison"
    bench_dist = mapping("image_type_distribution")
    report_dist = (getattr(report, "image_type_distribution", None) or {}) if report else {}

    bar_parts = [
        f'<h3 style="font-size:13px;font-weight:600;margin:20px 0 10px;color:#374151;">{comp_title}</h3>',
        f'''<div style="display:flex;gap:6px;font-size:10px;margin-bottom:8px;color:#6B7280;">
  <span style="display:inline-block;width:12px;height:12px;background:#2563EB;border-radius:2px;"></span> {vs_label}
  <span style="margin-left:12px;display:inline-block;width:12px;height:12px;background:#D1D5DB;border-radius:2px;"></span> {bench_label}
</div>''',
    ]
    # Union of both key sets so a type present on only one side still gets a row.
    for img_type in sorted(set(bench_dist) | set(report_dist)):
        yours = report_dist.get(img_type) or 0
        theirs = bench_dist.get(img_type) or 0
        delta = yours - theirs
        if delta > 0:
            delta_color, delta_sign = "#10B981", "+"
        elif delta < 0:
            delta_color, delta_sign = "#EF4444", ""
        else:
            delta_color, delta_sign = "#6B7280", ""

        bar_parts.append(
            f'''<div style="margin-bottom:6px;">
  <div style="display:flex;align-items:center;gap:6px;margin-bottom:2px;">
    <span style="width:90px;font-size:11px;font-weight:500;color:#374151;text-align:right;">{text(img_type)}</span>
    <div style="flex:1;display:flex;align-items:center;gap:4px;">
      <div style="height:6px;background:#2563EB;border-radius:3px;width:{bar_width(yours)}%;"></div>
      <span style="font-size:10px;color:#2563EB;">{yours:.0f}%</span>
    </div>
  </div>
  <div style="display:flex;align-items:center;gap:6px;">
    <span style="width:90px;"></span>
    <div style="flex:1;display:flex;align-items:center;gap:4px;">
      <div style="height:6px;background:#D1D5DB;border-radius:3px;width:{bar_width(theirs)}%;"></div>
      <span style="font-size:10px;color:#6B7280;">{theirs:.0f}%</span>
      <span style="font-size:10px;font-weight:600;color:{delta_color};">{delta_sign}{delta:.0f}%</span>
    </div>
  </div>
</div>'''
        )
    bars_html = "".join(bar_parts)

    # ── Price tier distribution ──
    price_dist = mapping("price_tier_distribution")
    price_html = ""
    if price_dist:
        price_title = "价格定位分布" if is_zh else "Price Tier Distribution"
        tier_colors = {"budget": "#10B981", "mid_range": "#2563EB", "premium": "#7C3AED", "luxury": "#D97706"}
        price_parts = [
            f'<h3 style="font-size:13px;font-weight:600;margin:20px 0 10px;color:#374151;">{price_title}</h3>',
            '<div style="display:flex;gap:8px;">',
        ]
        for tier, pct in price_dist.items():
            color = tier_colors.get(tier, "#6B7280")  # unknown tiers fall back to grey
            tier_name = text(str(tier).replace("_", " ").title())
            price_parts.append(
                f'''<div style="flex:1;text-align:center;background:#F9FAFB;border:1px solid #E5E7EB;border-radius:8px;padding:10px;">
  <div style="font-size:20px;font-weight:700;color:{color};">{pct or 0:.0f}%</div>
  <div style="font-size:10px;color:#6B7280;margin-top:2px;">{tier_name}</div>
</div>'''
            )
        price_parts.append("</div>")
        price_html = "".join(price_parts)

    return f'''
<div class="card section-break">
  <h2 style="display:flex;align-items:center;gap:8px;">
    <span style="font-size:18px;">📊</span> {title}
  </h2>
  {cards_html}
  {opp_html}
  {bars_html}
  {price_html}
</div>
'''


def build_html_report(
report: "CategoryVisualReport",
products: list["ScrapedItem"],
analyses: list["ImageAnalysis"],
lifestyle_images: list[str],
language: Language = Language.en,
benchmark: object | None = None,
) -> str:
"""Build a self-contained, bilingual HTML report.

Expand All @@ -545,6 +665,7 @@ def build_html_report(
rec_section = _build_recommendations(report, lbl)
product_grid = _build_product_grid(report, products, lbl, platform)
lifestyle_section = _build_lifestyle_section(lifestyle_images, lbl)
benchmark_section = _build_benchmark_section(benchmark, report, lang) if benchmark else ""

css = _get_css()

Expand Down Expand Up @@ -587,6 +708,9 @@ def build_html_report(
<!-- Distribution Charts -->
{dist_section}

<!-- Category Benchmark -->
{benchmark_section}

<!-- Recommendations -->
{rec_section}

Expand Down
48 changes: 48 additions & 0 deletions api/routes/benchmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""Benchmark API routes — category-level visual intelligence from 80K images."""

from __future__ import annotations

from fastapi import APIRouter, Query

from api.services.benchmark_service import (
get_benchmark,
get_categories,
get_reverse_prompts,
get_top_gaps,
)

router = APIRouter(prefix="/benchmarks", tags=["benchmarks"])


@router.get("/categories")
async def list_categories() -> list[dict]:
    """Return every available benchmark category with its product/image counts."""
    categories = get_categories()
    return categories


@router.get("/{category}")
async def category_benchmark(category: str) -> dict:
    """Return the full benchmark for a category (looked up by keyword).

    The benchmark object returned by ``get_benchmark`` is serialised with
    ``model_dump()``. When nothing matches, a dict with a single ``error``
    message is returned instead.
    """
    found = get_benchmark(category)
    if found is not None:
        return found.model_dump()
    # NOTE(review): a miss is reported as a normal response carrying an
    # "error" key, not as an HTTP error status — confirm clients rely on this
    # before switching to HTTPException(404).
    return {"error": f"No benchmark data found for '{category}'"}


@router.get("/{category}/top-gaps")
async def category_top_gaps(
    category: str,
    limit: int = Query(default=5, ge=1, le=20),
) -> list[dict]:
    """Return the biggest opportunity gaps for a category.

    ``limit`` is constrained to 1–20 by the query declaration and is forwarded
    unchanged to ``get_top_gaps``.
    """
    gaps = get_top_gaps(category, limit=limit)
    return gaps


@router.get("/{category}/reverse-prompts")
async def category_reverse_prompts(
    category: str,
    type: str = Query(default="", description="Filter by image type (e.g. lifestyle, model)"),
    limit: int = Query(default=10, ge=1, le=50),
) -> list[dict]:
    """Return reference reverse prompts for a category.

    ``type`` is passed to ``get_reverse_prompts`` as ``image_type`` (empty
    string by default) and ``limit`` is constrained to 1–50 by the query
    declaration.
    """
    # NOTE(review): the parameter shadows the ``type`` builtin; it is kept
    # because it is the public query-string name.
    image_type = type
    prompts = get_reverse_prompts(category, image_type=image_type, limit=limit)
    return prompts
4 changes: 4 additions & 0 deletions api/routes/exports.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,10 @@ async def export_job(job_id: str) -> StreamingResponse:
if hasattr(data, "listing_text") and data.listing_text:
zf.writestr("listing_text.json", json.dumps(data.listing_text, indent=2, ensure_ascii=False))

# Benchmark data
if hasattr(data, "benchmark") and data.benchmark:
zf.writestr("benchmark.json", json.dumps(data.benchmark, indent=2, ensure_ascii=False))

buf.seek(0)

keyword = ""
Expand Down
1 change: 1 addition & 0 deletions api/routes/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ async def get_job_data(job_id: str) -> dict:
"seasonal_alerts": data.seasonal_alerts,
"review_insights": data.review_insights,
"story_arc": data.story_arc,
"benchmark": data.benchmark,
}


Expand Down
78 changes: 78 additions & 0 deletions api/routes/listing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""Listing import routes: import pre-generated listing-gen projects."""

from __future__ import annotations

import json
import logging
import os
import shutil
from pathlib import Path

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

logger = logging.getLogger(__name__)
router = APIRouter(prefix="/listing", tags=["listing"])

_REPORTS_DIR = Path(os.environ.get("REPORTS_DIR", "/tmp/superscrape_reports"))


# Request body for the listing import endpoint.
class ImportRequest(BaseModel):
    # Path of the listing-gen project directory to import; the import route
    # wraps it in pathlib.Path and reads brief.json and output/ beneath it.
    project_dir: str


def _read_optional_json(path: Path) -> dict:
    """Return the parsed JSON at *path*, or ``{}`` when the file is absent.

    Raises:
        HTTPException: 422 when the file exists but cannot be read or parsed,
            so a malformed project does not surface as an unhandled 500.
    """
    if not path.exists():
        return {}
    try:
        return json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        raise HTTPException(
            status_code=422,
            detail=f"Could not read {path.name}: {exc}",
        ) from exc


@router.post("/jobs/{job_id}/import")
async def import_listing_project(job_id: str, req: ImportRequest) -> dict:
    """Import an existing listing-gen project into a superscrape job.

    Reads ``brief.json`` from the project directory plus
    ``listing_text.json`` and ``compliance_report.json`` from its ``output``
    sub-directory (each optional), and copies every ``output/slot_*.png`` into
    ``REPORTS_DIR/{job_id}/images/``.

    Args:
        job_id: Job identifier; used as a single directory name under the
            reports directory.
        req: Request body carrying ``project_dir``.

    Returns:
        A dict with ``job_id``, ``brief``, ``listing_text``, ``compliance``,
        ``images`` (filename + ``/reports/...`` path per copied image) and
        ``images_count``.

    Raises:
        HTTPException: 400 when ``job_id`` is not a plain path component,
            404 when ``project_dir`` is not an existing directory, 422 when
            one of the JSON files exists but cannot be read or parsed.
    """
    # job_id is joined into a filesystem path below; reject anything that
    # could step outside REPORTS_DIR (e.g. "..", or a value with separators).
    if job_id in {"", ".", ".."} or Path(job_id).name != job_id:
        raise HTTPException(status_code=400, detail=f"Invalid job id: {job_id}")

    # NOTE(review): project_dir is caller-supplied and is not confined to any
    # base directory, so this endpoint can read JSON/PNG files from anywhere
    # the process can access — consider restricting it to an allow-listed root.
    src = Path(req.project_dir)
    if not src.is_dir():
        raise HTTPException(status_code=404, detail=f"Project directory not found: {req.project_dir}")

    output_dir = src / "output"

    brief = _read_optional_json(src / "brief.json")
    listing_text = _read_optional_json(output_dir / "listing_text.json")
    compliance = _read_optional_json(output_dir / "compliance_report.json")

    # Copy generated images to REPORTS_DIR/{job_id}/images/
    images_dest = _REPORTS_DIR / job_id / "images"
    images_dest.mkdir(parents=True, exist_ok=True)

    image_paths: list[dict[str, str]] = []
    if output_dir.is_dir():
        for img_file in sorted(output_dir.glob("slot_*.png")):
            if not img_file.is_file():
                # A directory that happens to match the pattern cannot be copied.
                continue
            shutil.copy2(img_file, images_dest / img_file.name)
            image_paths.append({
                "filename": img_file.name,
                "path": f"/reports/{job_id}/images/{img_file.name}",
            })

    logger.info("Imported listing project %s → job %s (%d images)", req.project_dir, job_id, len(image_paths))

    return {
        "job_id": job_id,
        "brief": brief,
        "listing_text": listing_text,
        "compliance": compliance,
        "images": image_paths,
        "images_count": len(image_paths),
    }
Loading
Loading