diff --git a/Dockerfile b/Dockerfile
index 5ec52ae..5e16884 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -32,16 +32,18 @@ COPY superscrape/ superscrape/
COPY api/ api/
# Install Python deps (non-editable for Docker)
-RUN pip install --no-cache-dir .
+RUN pip install --no-cache-dir ".[api]"
-# Download Camoufox browser binary
-RUN python -c "from camoufox.sync_api import Camoufox; print('Camoufox binary ready')" || true
+# Download Camoufox browser binary (FF135 from daijro/camoufox releases)
+# Also install Playwright system deps for the bundled Firefox
+RUN python -m camoufox fetch && \
+ python -c "import camoufox, glob, os; d=os.path.dirname(camoufox.__file__); print('Binary:', glob.glob(f'{d}/**/firefox*', recursive=True)[:3])"
EXPOSE 8001
HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \
CMD curl -f http://localhost:8001/health || exit 1
-# xvfb-run provides virtual display for headless Camoufox
-CMD xvfb-run --auto-servernum --server-args="-screen 0 1280x720x24" \
- uvicorn api.main:app --host 0.0.0.0 --port 8001
+# Camoufox headless='virtual' manages its own Xvfb internally.
+# Do NOT wrap with xvfb-run — dual Xvfb causes display conflicts.
+CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "8001"]
diff --git a/api/main.py b/api/main.py
index d75fe71..6eb66e1 100644
--- a/api/main.py
+++ b/api/main.py
@@ -14,8 +14,10 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
+from api.routes.benchmarks import router as benchmarks_router
from api.routes.exports import router as exports_router
from api.routes.jobs import router as jobs_router
+from api.routes.listing import router as listing_router
from api.routes.monitor import router as monitor_router
from api.routes.reports import router as reports_router
from api.routes.uploads import router as uploads_router
@@ -61,10 +63,12 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]:
# ── Routers ───────────────────────────────────────────────────────────────────
app.include_router(jobs_router)
+app.include_router(listing_router)
app.include_router(reports_router)
app.include_router(uploads_router)
app.include_router(exports_router)
app.include_router(monitor_router)
+app.include_router(benchmarks_router)
@app.get("/health", tags=["meta"])
diff --git a/api/reports/html_builder.py b/api/reports/html_builder.py
index 643be24..7e28422 100644
--- a/api/reports/html_builder.py
+++ b/api/reports/html_builder.py
@@ -514,12 +514,132 @@ def _build_lifestyle_section(
# ── Main Builder ──────────────────────────────────────────────────────────────
+def _build_benchmark_section(
+ benchmark: object,
+ report: "CategoryVisualReport",
+ lang: str = "en",
+) -> str:
+ """Build the Category Benchmark comparison section."""
+ b = benchmark if isinstance(benchmark, dict) else (benchmark.model_dump() if hasattr(benchmark, "model_dump") else {})
+ if not b or not b.get("total_products"):
+ return ""
+
+ is_zh = lang == "zh"
+ title = "品类基准线 (80K 图片数据)" if is_zh else "Category Benchmark (80K Image Dataset)"
+ vs_label = "你的竞品" if is_zh else "Your Competitors"
+ bench_label = "品类基准" if is_zh else "Category Benchmark"
+
+ # ── Killer stat cards ──
+ missing_slots = b.get("missing_slots_ranking", {})
+ top_missing = list(missing_slots.items())[:3]
+ aplus_rate = b.get("aplus_adoption_rate", 0)
+ aplus_score = b.get("aplus_avg_score", 0)
+ quality_avg = b.get("quality_avg", 0)
+
+    cards_html = '<div class="benchmark-cards">'
+    cards_html += f'''
+        <div class="stat-card">
+            <div class="stat-value">{b.get("total_products", 0):,}</div>
+            <div class="stat-label">{"基准产品数" if is_zh else "Benchmark Products"}</div>
+        </div>'''
+    cards_html += f'''
+        <div class="stat-card">
+            <div class="stat-value">{b.get("total_images", 0):,}</div>
+            <div class="stat-label">{"基准图片数" if is_zh else "Benchmark Images"}</div>
+        </div>'''
+    cards_html += f'''
+        <div class="stat-card">
+            <div class="stat-value">{aplus_rate:.0f}%</div>
+            <div class="stat-label">{"有A+内容" if is_zh else "Have A+ Content"}</div>
+        </div>'''
+    cards_html += f'''
+        <div class="stat-card">
+            <div class="stat-value">{quality_avg:.1f}/5</div>
+            <div class="stat-label">{"平均图片质量" if is_zh else "Avg Image Quality"}</div>
+        </div>'''
+    cards_html += '</div>'
+
+    # ── Missing slots (opportunities) ──
+    opp_title = "最大机会点" if is_zh else "Biggest Opportunities"
+    opp_html = f'<h3>{opp_title}</h3>'
+    opp_html += '<div class="benchmark-cards">'
+    for slot, pct in top_missing:
+        label = f"{pct:.0f}% {'缺少' if is_zh else 'missing'} {slot}"
+        opp_html += f'''
+        <div class="stat-card" title="{label}">
+            <div class="stat-value">{pct:.0f}%</div>
+            <div class="stat-label">{"缺少" if is_zh else "missing"} {slot}</div>
+        </div>'''
+    opp_html += '</div>'
+
+    # ── Image type comparison bars ──
+    comp_title = "图片类型对比" if is_zh else "Image Type Comparison"
+    bench_dist = b.get("image_type_distribution", {})
+    report_dist = report.image_type_distribution if report else {}
+
+    bars_html = f'<h3>{comp_title}</h3>'
+    bars_html += f'''
+    <div class="bar-legend">
+        <span class="legend-yours">{vs_label}</span>
+        <span class="legend-bench">{bench_label}</span>
+    </div>'''
+
+    all_types = sorted(set(list(bench_dist.keys()) + list(report_dist.keys())))
+    for img_type in all_types:
+        yours = report_dist.get(img_type, 0)
+        theirs = bench_dist.get(img_type, 0)
+        delta = yours - theirs
+        delta_color = "#10B981" if delta > 0 else "#EF4444" if delta < 0 else "#6B7280"
+        delta_sign = "+" if delta > 0 else ""
+
+        bars_html += f'''
+        <div class="bar-row">
+            <div class="bar-label">{img_type}</div>
+            <div class="bar-track">
+                <div class="bar-yours" style="width: {yours:.0f}%"></div>
+                <div class="bar-bench" style="width: {theirs:.0f}%"></div>
+            </div>
+            <span class="bar-value">{theirs:.0f}%</span>
+            <span class="bar-delta" style="color: {delta_color}">{delta_sign}{delta:.0f}%</span>
+        </div>'''
+
+    # ── Price tier distribution ──
+    price_dist = b.get("price_tier_distribution", {})
+    price_html = ""
+    if price_dist:
+        price_title = "价格定位分布" if is_zh else "Price Tier Distribution"
+        price_html = f'<h3>{price_title}</h3>'
+        price_html += '<div class="benchmark-cards">'
+        tier_colors = {"budget": "#10B981", "mid_range": "#2563EB", "premium": "#7C3AED", "luxury": "#D97706"}
+        for tier, pct in price_dist.items():
+            color = tier_colors.get(tier, "#6B7280")
+            price_html += f'''
+            <div class="stat-card" style="border-color: {color}">
+                <div class="stat-value" style="color: {color}">{pct:.0f}%</div>
+                <div class="stat-label">{tier.replace("_", " ").title()}</div>
+            </div>'''
+        price_html += '</div>'
+
+    return f'''
+    <section class="benchmark-section">
+        <h2>📊 {title}</h2>
+        {cards_html}
+        {opp_html}
+        {bars_html}
+        {price_html}
+    </section>
+    '''
+
+
def build_html_report(
report: "CategoryVisualReport",
products: list["ScrapedItem"],
analyses: list["ImageAnalysis"],
lifestyle_images: list[str],
language: Language = Language.en,
+ benchmark: object | None = None,
) -> str:
"""Build a self-contained, bilingual HTML report.
@@ -545,6 +665,7 @@ def build_html_report(
rec_section = _build_recommendations(report, lbl)
product_grid = _build_product_grid(report, products, lbl, platform)
lifestyle_section = _build_lifestyle_section(lifestyle_images, lbl)
+ benchmark_section = _build_benchmark_section(benchmark, report, lang) if benchmark else ""
css = _get_css()
@@ -587,6 +708,9 @@ def build_html_report(
{dist_section}
+
+ {benchmark_section}
+
{rec_section}
diff --git a/api/routes/benchmarks.py b/api/routes/benchmarks.py
new file mode 100644
index 0000000..9b9e3b7
--- /dev/null
+++ b/api/routes/benchmarks.py
@@ -0,0 +1,48 @@
+"""Benchmark API routes — category-level visual intelligence from 80K images."""
+
+from __future__ import annotations
+
+from fastapi import APIRouter, HTTPException, Query
+
+from api.services.benchmark_service import (
+    get_benchmark,
+    get_categories,
+    get_reverse_prompts,
+    get_top_gaps,
+)
+
+router = APIRouter(prefix="/benchmarks", tags=["benchmarks"])
+
+
+@router.get("/categories")
+async def list_categories() -> list[dict]:
+    """List all available benchmark categories with product/image counts."""
+    return get_categories()
+
+
+@router.get("/{category}")
+async def category_benchmark(category: str) -> dict:
+    """Get full benchmark data for a category (matched via keyword)."""
+    bench = get_benchmark(category)
+    if bench is None:
+        raise HTTPException(status_code=404, detail=f"No benchmark data found for '{category}'")
+    return bench.model_dump()
+
+
+@router.get("/{category}/top-gaps")
+async def category_top_gaps(
+ category: str,
+ limit: int = Query(default=5, ge=1, le=20),
+) -> list[dict]:
+ """Get the biggest opportunity gaps for a category."""
+ return get_top_gaps(category, limit=limit)
+
+
+@router.get("/{category}/reverse-prompts")
+async def category_reverse_prompts(
+    category: str,
+    image_type: str = Query(default="", alias="type", description="Filter by image type (e.g. lifestyle, model)"),
+    limit: int = Query(default=10, ge=1, le=50),
+) -> list[dict]:
+    """Get reference reverse prompts for a category."""
+    return get_reverse_prompts(category, image_type=image_type, limit=limit)
diff --git a/api/routes/exports.py b/api/routes/exports.py
index f79382c..be1f1c1 100644
--- a/api/routes/exports.py
+++ b/api/routes/exports.py
@@ -81,6 +81,10 @@ async def export_job(job_id: str) -> StreamingResponse:
if hasattr(data, "listing_text") and data.listing_text:
zf.writestr("listing_text.json", json.dumps(data.listing_text, indent=2, ensure_ascii=False))
+ # Benchmark data
+ if hasattr(data, "benchmark") and data.benchmark:
+ zf.writestr("benchmark.json", json.dumps(data.benchmark, indent=2, ensure_ascii=False))
+
buf.seek(0)
keyword = ""
diff --git a/api/routes/jobs.py b/api/routes/jobs.py
index 4d392bc..935bc1f 100644
--- a/api/routes/jobs.py
+++ b/api/routes/jobs.py
@@ -80,6 +80,7 @@ async def get_job_data(job_id: str) -> dict:
"seasonal_alerts": data.seasonal_alerts,
"review_insights": data.review_insights,
"story_arc": data.story_arc,
+ "benchmark": data.benchmark,
}
diff --git a/api/routes/listing.py b/api/routes/listing.py
new file mode 100644
index 0000000..8d92646
--- /dev/null
+++ b/api/routes/listing.py
@@ -0,0 +1,78 @@
+"""Listing import routes: import pre-generated listing-gen projects."""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import shutil
+from pathlib import Path
+
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+
+logger = logging.getLogger(__name__)
+router = APIRouter(prefix="/listing", tags=["listing"])
+
+_REPORTS_DIR = Path(os.environ.get("REPORTS_DIR", "/tmp/superscrape_reports"))
+
+
+class ImportRequest(BaseModel):
+ project_dir: str
+
+
+@router.post("/jobs/{job_id}/import")
+async def import_listing_project(job_id: str, req: ImportRequest) -> dict:
+ """Import an existing listing-gen project into a superscrape job.
+
+ Reads brief.json, listing_text.json, and copies generated images
+ so they're accessible via the standard reports/static routes.
+ """
+ src = Path(req.project_dir)
+ if not src.exists():
+ raise HTTPException(status_code=404, detail=f"Project directory not found: {req.project_dir}")
+
+ output_dir = src / "output"
+
+ # Read brief.json
+ brief: dict = {}
+ brief_path = src / "brief.json"
+ if brief_path.exists():
+ brief = json.loads(brief_path.read_text(encoding="utf-8"))
+
+ # Read listing_text.json
+ listing_text: dict = {}
+ lt_path = output_dir / "listing_text.json"
+ if lt_path.exists():
+ listing_text = json.loads(lt_path.read_text(encoding="utf-8"))
+
+ # Read compliance_report.json
+ compliance: dict = {}
+ cr_path = output_dir / "compliance_report.json"
+ if cr_path.exists():
+ compliance = json.loads(cr_path.read_text(encoding="utf-8"))
+
+ # Copy generated images to REPORTS_DIR/{job_id}/images/
+ images_dest = _REPORTS_DIR / job_id / "images"
+ images_dest.mkdir(parents=True, exist_ok=True)
+
+ image_paths: list[dict[str, str]] = []
+ if output_dir.exists():
+ for img_file in sorted(output_dir.glob("slot_*.png")):
+ dst = images_dest / img_file.name
+ shutil.copy2(img_file, dst)
+ image_paths.append({
+ "filename": img_file.name,
+ "path": f"/reports/{job_id}/images/{img_file.name}",
+ })
+
+ logger.info("Imported listing project %s → job %s (%d images)", req.project_dir, job_id, len(image_paths))
+
+ return {
+ "job_id": job_id,
+ "brief": brief,
+ "listing_text": listing_text,
+ "compliance": compliance,
+ "images": image_paths,
+ "images_count": len(image_paths),
+ }
diff --git a/api/services/benchmark_service.py b/api/services/benchmark_service.py
new file mode 100644
index 0000000..494fded
--- /dev/null
+++ b/api/services/benchmark_service.py
@@ -0,0 +1,364 @@
+"""Category benchmark service — loads 80K pre-analyzed clothing image data.
+
+Provides category-level baselines (image types, angles, backgrounds, missing
+slots, A+ adoption, price tiers) so that per-job scrape results can be
+compared against the full dataset of 4,000+ products / 80,000+ images.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from collections import Counter, defaultdict
+from pathlib import Path
+
+from superscrape.output.models import BenchmarkData
+
+logger = logging.getLogger(__name__)
+
+_DATA_DIR = Path(__file__).resolve().parent.parent.parent / "projects" / "clothing_images"
+
+# Pre-aggregated benchmarks keyed by L2 category name (e.g. "Active", "Jeans")
+_benchmarks: dict[str, BenchmarkData] = {}
+# Global (all-category) benchmark
+_global_benchmark: BenchmarkData | None = None
+_loaded = False
+
+
+def _pct(count: int, total: int) -> float:
+ """Return percentage rounded to 1 decimal, safe against zero division."""
+ return round(count / total * 100, 1) if total else 0.0
+
+
+def _load_data() -> None:
+ """Load and pre-aggregate the 80K dataset into per-category benchmarks."""
+ global _benchmarks, _global_benchmark, _loaded
+ if _loaded:
+ return
+
+ products_path = _DATA_DIR / "all_products.json"
+ images_path = _DATA_DIR / "image_analysis.jsonl"
+ product_analysis_path = _DATA_DIR / "product_analysis.jsonl"
+ aplus_path = _DATA_DIR / "aplus_analysis.jsonl"
+
+ if not products_path.exists():
+ logger.warning("Benchmark data not found at %s — skipping", _DATA_DIR)
+ _loaded = True
+ return
+
+ # ── Step 1: Load product metadata (ASIN → category mapping) ──────────
+ logger.info("Loading benchmark products from %s ...", products_path)
+ with open(products_path) as f:
+ raw_products = json.load(f)
+
+ asin_to_cat: dict[str, str] = {} # asin → L2 category
+ asin_to_catpath: dict[str, str] = {}
+ cat_product_counts: Counter[str] = Counter()
+
+ for p in raw_products:
+ asin = p.get("asin", "")
+ cat_path = p.get("category_path", "")
+ parts = cat_path.split(" > ")
+ l2 = parts[1] if len(parts) >= 2 else "Other"
+ asin_to_cat[asin] = l2
+ asin_to_catpath[asin] = cat_path
+ cat_product_counts[l2] += 1
+
+ total_products = len(raw_products)
+ logger.info("Loaded %d products across %d categories", total_products, len(cat_product_counts))
+
+ # ── Step 2: Aggregate image analysis per category ────────────────────
+ # Per-category accumulators
+ cat_img_count: Counter[str] = Counter()
+ cat_img_type: dict[str, Counter] = defaultdict(Counter)
+ cat_angle: dict[str, Counter] = defaultdict(Counter)
+ cat_bg: dict[str, Counter] = defaultdict(Counter)
+ cat_has_person: Counter[str] = Counter() # images with model category
+ cat_has_text: Counter[str] = Counter()
+ cat_quality_sum: dict[str, float] = defaultdict(float)
+ cat_style_tags: dict[str, Counter] = defaultdict(Counter)
+ cat_reverse_prompts: dict[str, list] = defaultdict(list)
+
+ if images_path.exists():
+ logger.info("Loading image analyses from %s ...", images_path)
+ with open(images_path) as f:
+ for line in f:
+ try:
+ img = json.loads(line)
+ except json.JSONDecodeError:
+ continue
+
+ asin = img.get("asin", "")
+ cat = asin_to_cat.get(asin, "Other")
+
+ cat_img_count[cat] += 1
+ cat_img_count["__all__"] += 1
+
+ # Image type (category field in image_analysis = image type)
+ img_type = img.get("category", "unknown")
+ cat_img_type[cat][img_type] += 1
+ cat_img_type["__all__"][img_type] += 1
+
+ # Camera angle
+ comp = img.get("composition", {})
+ angle = comp.get("camera_angle", "unknown")
+ cat_angle[cat][angle] += 1
+ cat_angle["__all__"][angle] += 1
+
+ # Background
+ bg = comp.get("background", "unknown")
+ cat_bg[cat][bg] += 1
+ cat_bg["__all__"][bg] += 1
+
+ # Person/model detection (image type == "model")
+ if img_type == "model":
+ cat_has_person[cat] += 1
+ cat_has_person["__all__"] += 1
+
+ # Text overlay
+ text_info = img.get("text_overlay", {})
+ if text_info.get("has_text"):
+ cat_has_text[cat] += 1
+ cat_has_text["__all__"] += 1
+
+ # Quality score
+ qs = img.get("quality_score", 0)
+ cat_quality_sum[cat] += qs
+ cat_quality_sum["__all__"] += qs
+
+ # Style tags
+ for tag in img.get("style_tags", []):
+ cat_style_tags[cat][tag] += 1
+ cat_style_tags["__all__"][tag] += 1
+
+ # Reverse prompts (keep top ones per category, limit memory)
+ rp = img.get("reverse_prompt", "")
+ if rp and len(cat_reverse_prompts[cat]) < 50:
+ cat_reverse_prompts[cat].append({
+ "prompt": rp,
+ "type": img_type,
+ "url": img.get("url", ""),
+ })
+
+ total_images = cat_img_count["__all__"]
+ logger.info("Loaded %d image analyses", total_images)
+ else:
+ total_images = 0
+
+ # ── Step 3: Load product analysis (missing slots, A+, price tiers) ───
+ cat_missing: dict[str, Counter] = defaultdict(Counter)
+ cat_aplus_has: Counter[str] = Counter()
+ cat_aplus_score_sum: dict[str, float] = defaultdict(float)
+ cat_aplus_count: Counter[str] = Counter()
+ cat_price_tier: dict[str, Counter] = defaultdict(Counter)
+ cat_analysis_count: Counter[str] = Counter()
+
+ if product_analysis_path.exists():
+ logger.info("Loading product analyses from %s ...", product_analysis_path)
+ with open(product_analysis_path) as f:
+ for line in f:
+ try:
+ pa = json.loads(line)
+ except json.JSONDecodeError:
+ continue
+
+ asin = pa.get("asin", "")
+ cat = asin_to_cat.get(asin, "Other")
+ cat_analysis_count[cat] += 1
+ cat_analysis_count["__all__"] += 1
+
+ # Missing slots
+ missing = pa.get("main_sequence", {}).get("missing_slots", [])
+ for slot in missing:
+ cat_missing[cat][slot] += 1
+ cat_missing["__all__"][slot] += 1
+
+ # A+ content
+ aplus = pa.get("aplus_content", {})
+ if aplus.get("has_aplus"):
+ cat_aplus_has[cat] += 1
+ cat_aplus_has["__all__"] += 1
+ score = aplus.get("aplus_score", 0)
+ if score:
+ cat_aplus_score_sum[cat] += score
+ cat_aplus_score_sum["__all__"] += score
+ cat_aplus_count[cat] += 1
+ cat_aplus_count["__all__"] += 1
+
+ # Price tier
+ tier = pa.get("brand_positioning", {}).get("price_tier", "unknown")
+ cat_price_tier[cat][tier] += 1
+ cat_price_tier["__all__"][tier] += 1
+
+ # ── Step 4: Build BenchmarkData per category ─────────────────────────
+ def _build_benchmark(cat_key: str, display_name: str) -> BenchmarkData:
+ n_imgs = cat_img_count.get(cat_key, 0)
+ n_prods = cat_product_counts.get(cat_key, 0) if cat_key != "__all__" else total_products
+ n_analysis = cat_analysis_count.get(cat_key, 0)
+
+ # Convert counters to percentage dicts
+ def _counter_to_pct(counter: Counter, total: int) -> dict[str, float]:
+ return {k: _pct(v, total) for k, v in counter.most_common()}
+
+ # Missing slots as percentage of analyzed products
+ missing_pct = {k: _pct(v, n_analysis) for k, v in cat_missing.get(cat_key, Counter()).most_common()}
+
+ # Top style tags
+ top_tags = [tag for tag, _ in cat_style_tags.get(cat_key, Counter()).most_common(10)]
+
+ # Quality avg
+ q_avg = round(cat_quality_sum.get(cat_key, 0) / n_imgs, 2) if n_imgs else 0.0
+
+ # A+ stats
+ aplus_rate = _pct(cat_aplus_has.get(cat_key, 0), n_analysis) if n_analysis else 0.0
+ aplus_avg = round(
+ cat_aplus_score_sum.get(cat_key, 0) / cat_aplus_count.get(cat_key, 1), 1
+ ) if cat_aplus_count.get(cat_key, 0) else 0.0
+
+ return BenchmarkData(
+ category=display_name,
+ matched_categories=[cat_key] if cat_key != "__all__" else list(cat_product_counts.keys()),
+ total_products=n_prods,
+ total_images=n_imgs,
+ image_type_distribution=_counter_to_pct(cat_img_type.get(cat_key, Counter()), n_imgs),
+ angle_distribution=_counter_to_pct(cat_angle.get(cat_key, Counter()), n_imgs),
+ background_distribution=_counter_to_pct(cat_bg.get(cat_key, Counter()), n_imgs),
+ has_person_ratio=_pct(cat_has_person.get(cat_key, 0), n_imgs),
+ has_text_ratio=_pct(cat_has_text.get(cat_key, 0), n_imgs),
+ top_style_tags=top_tags,
+ quality_avg=q_avg,
+ missing_slots_ranking=missing_pct,
+ aplus_adoption_rate=aplus_rate,
+ aplus_avg_score=aplus_avg,
+ price_tier_distribution=_counter_to_pct(cat_price_tier.get(cat_key, Counter()), n_analysis),
+ top_reverse_prompts=cat_reverse_prompts.get(cat_key, [])[:10],
+ )
+
+ # Build per-category benchmarks
+ for cat_name in cat_product_counts:
+ _benchmarks[cat_name] = _build_benchmark(cat_name, cat_name)
+
+ # Build global benchmark
+ _global_benchmark = _build_benchmark("__all__", "All Clothing")
+ _loaded = True
+
+ logger.info(
+ "Benchmark service ready: %d categories, %d products, %d images",
+ len(_benchmarks), total_products, total_images,
+ )
+
+
+def get_categories() -> list[dict]:
+ """Return list of available benchmark categories with product counts."""
+ _load_data()
+ return [
+ {"category": b.category, "total_products": b.total_products, "total_images": b.total_images}
+ for b in sorted(_benchmarks.values(), key=lambda b: b.total_products, reverse=True)
+ ]
+
+
+def get_benchmark(keyword: str) -> BenchmarkData | None:
+ """Match a search keyword to the most relevant category benchmark.
+
+ Matching strategy:
+ 1. Exact L2 category name match (case-insensitive)
+ 2. Keyword substring match against category names
+ 3. Fall back to global (all-category) benchmark
+ """
+ _load_data()
+ if not _benchmarks:
+ return None
+
+ kw_lower = keyword.lower()
+
+ # Exact match
+ for cat_name, bench in _benchmarks.items():
+ if cat_name.lower() == kw_lower:
+ return bench
+
+ # Substring / keyword match — score each category
+ kw_words = set(kw_lower.split())
+ best_score = 0
+ best_cat = None
+
+ _KEYWORD_MAP: dict[str, list[str]] = {
+ "Active": ["active", "activewear", "sport", "athletic", "workout", "gym", "running", "yoga", "legging", "shorts"],
+ "Jeans": ["jean", "jeans", "denim"],
+ "Fashion Hoodies & Sweatshirts": ["hoodie", "hoodies", "sweatshirt", "pullover", "fleece"],
+ "Baby Boys": ["baby", "boy", "infant", "toddler", "onesie"],
+ "Baby Girls": ["baby", "girl", "infant", "toddler"],
+ "Bodysuits": ["bodysuit", "romper", "jumpsuit"],
+ "Jackets & Coats": ["jacket", "coat", "parka", "windbreaker", "bomber", "puffer"],
+ "Underwear": ["underwear", "boxer", "brief", "panties"],
+ "Dresses": ["dress", "gown", "sundress", "maxi", "midi"],
+ "Socks & Hosiery": ["sock", "socks", "hosiery", "stocking"],
+ "Clothing Sets": ["set", "outfit", "matching"],
+ "Socks & Tights": ["tights", "leotard"],
+ "Swim": ["swim", "swimsuit", "swimwear", "bikini", "trunk"],
+ "Lingerie, Sleep & Lounge": ["lingerie", "pajama", "sleepwear", "lounge", "nightgown", "robe"],
+ "Suits & Sport Coats": ["suit", "blazer", "sport coat", "formal"],
+ "Coats, Jackets & Vests": ["vest", "gilet", "waistcoat"],
+ "Swimsuits & Cover Ups": ["cover up", "sarong", "beach"],
+ "Suiting & Blazers": ["suiting", "blazer"],
+ "Overalls": ["overall", "dungaree", "bib"],
+ }
+
+ for cat_name, bench in _benchmarks.items():
+ score = 0
+ cat_words = set(cat_name.lower().split())
+
+ # Direct word overlap
+ overlap = kw_words & cat_words
+ score += len(overlap) * 3
+
+ # Keyword map matching
+ mapped_words = _KEYWORD_MAP.get(cat_name, [])
+ for w in kw_words:
+ if w in mapped_words:
+ score += 2
+ # Partial match (e.g. "shirt" in "sweatshirt")
+ for mw in mapped_words:
+ if w in mw or mw in w:
+ score += 1
+
+ # Substring of category name
+ if kw_lower in cat_name.lower():
+ score += 3
+
+ if score > best_score:
+ best_score = score
+ best_cat = cat_name
+
+ if best_cat and best_score >= 2:
+ return _benchmarks[best_cat]
+
+ # Fall back to global benchmark for weak/no matches
+ return _global_benchmark
+
+
+def get_top_gaps(keyword: str, limit: int = 5) -> list[dict]:
+ """Return the top opportunity gaps for a category."""
+ bench = get_benchmark(keyword)
+ if not bench:
+ return []
+
+ gaps: list[dict] = []
+ for slot, pct in list(bench.missing_slots_ranking.items())[:limit]:
+ gaps.append({
+ "slot": slot,
+ "missing_pct": pct,
+ "insight": f"{pct}% of products in this category are missing '{slot}' — adding one puts you ahead of the majority.",
+ })
+ return gaps
+
+
+def get_reverse_prompts(keyword: str, image_type: str = "", limit: int = 10) -> list[dict]:
+ """Return reference reverse prompts for a category, optionally filtered by image type."""
+ bench = get_benchmark(keyword)
+ if not bench:
+ return []
+
+ prompts = bench.top_reverse_prompts
+ if image_type:
+ prompts = [p for p in prompts if p.get("type", "") == image_type]
+ return prompts[:limit]
diff --git a/api/services/job_runner.py b/api/services/job_runner.py
index 61ab6f9..8179730 100644
--- a/api/services/job_runner.py
+++ b/api/services/job_runner.py
@@ -121,6 +121,20 @@ def _run_pipeline_sync(job_id: str, req: JobRequest) -> None:
platform=pval,
)
+ # ── Step 3.5: Load category benchmark ────────────────────────────
+ benchmark = None
+ try:
+ from api.services.benchmark_service import get_benchmark
+ benchmark = get_benchmark(keyword)
+ if benchmark:
+ _emit(
+ job_id, PipelineStep.analyzing,
+ f"Loaded category benchmark ({benchmark.total_products} products, {benchmark.total_images} images)",
+ 73, platform=pval,
+ )
+ except Exception as exc:
+ logger.warning("Benchmark loading failed (non-fatal): %s", exc)
+
# ── Step 4: Report ─────────────────────────────────────────────────
_emit(job_id, PipelineStep.building_report, "Building category report...", 88, platform=pval)
report = aggregate_report(keyword, products, analyses, platform=platform.value)
@@ -141,6 +155,7 @@ def _run_pipeline_sync(job_id: str, req: JobRequest) -> None:
analyses=analyses,
lifestyle_images=lifestyle_images,
language=req.language,
+ benchmark=benchmark,
)
html_path.write_text(html_content, encoding="utf-8")
@@ -167,10 +182,10 @@ def _run_pipeline_sync(job_id: str, req: JobRequest) -> None:
from api.services.strategy_gen import generate_ab_tests, generate_action_plan
review_insights = analyze_reviews(products)
- action_plan = generate_action_plan(report, products)
- ab_tests = generate_ab_tests(report, products)
+ action_plan = generate_action_plan(report, products, benchmark=benchmark)
+ ab_tests = generate_ab_tests(report, products, benchmark=benchmark)
seasonal_alerts = get_seasonal_alerts(keyword)
- story_arc = design_story_arc(report, products, review_insights)
+ story_arc = design_story_arc(report, products, review_insights, benchmark=benchmark)
logger.info(
"Strategy gen done: %d phases, %d tests, %d alerts, %d pain points, story arc=%s",
len(action_plan), len(ab_tests), len(seasonal_alerts),
@@ -193,6 +208,7 @@ def _run_pipeline_sync(job_id: str, req: JobRequest) -> None:
seasonal_alerts=seasonal_alerts,
review_insights=review_insights,
story_arc=story_arc,
+ benchmark=benchmark.model_dump() if benchmark else None,
)
report_url = f"/reports/{job_id}/html"
diff --git a/api/services/job_store.py b/api/services/job_store.py
index 06ddd65..a25048b 100644
--- a/api/services/job_store.py
+++ b/api/services/job_store.py
@@ -30,7 +30,7 @@
class JobData:
"""Container for pipeline output data (products, analyses, report, strategy)."""
- __slots__ = ("products", "analyses", "report", "action_plan", "ab_tests", "seasonal_alerts", "review_insights", "story_arc")
+ __slots__ = ("products", "analyses", "report", "action_plan", "ab_tests", "seasonal_alerts", "review_insights", "story_arc", "benchmark")
def __init__(
self,
@@ -42,6 +42,7 @@ def __init__(
seasonal_alerts: list | None = None,
review_insights: dict | None = None,
story_arc: dict | None = None,
+ benchmark: dict | None = None,
) -> None:
self.products = products
self.analyses = analyses
@@ -51,6 +52,7 @@ def __init__(
self.seasonal_alerts = seasonal_alerts or []
self.review_insights = review_insights or {}
self.story_arc = story_arc or {}
+ self.benchmark = benchmark or {}
def _serialize_report(report: object) -> dict | None:
@@ -106,6 +108,7 @@ def _load_persisted(self) -> None:
seasonal_alerts=data_raw.get("seasonal_alerts", []),
review_insights=data_raw.get("review_insights", {}),
story_arc=data_raw.get("story_arc", {}),
+ benchmark=data_raw.get("benchmark", {}),
)
loaded += 1
except Exception as e:
@@ -153,6 +156,7 @@ def _persist_data(self, job_id: str) -> None:
"seasonal_alerts": data.seasonal_alerts,
"review_insights": data.review_insights,
"story_arc": data.story_arc,
+ "benchmark": data.benchmark,
}
path = data_dir / f"{job_id}.json"
@@ -266,6 +270,7 @@ def save_data(
seasonal_alerts: list | None = None,
review_insights: dict | None = None,
story_arc: dict | None = None,
+ benchmark: dict | None = None,
) -> None:
"""Store structured pipeline output for later retrieval via /data endpoint."""
with self._lock:
@@ -278,6 +283,7 @@ def save_data(
seasonal_alerts=seasonal_alerts,
review_insights=review_insights,
story_arc=story_arc,
+ benchmark=benchmark,
)
self._persist_data(job_id)
diff --git a/api/services/listing_service.py b/api/services/listing_service.py
index f1ddb72..bac9597 100644
--- a/api/services/listing_service.py
+++ b/api/services/listing_service.py
@@ -20,13 +20,14 @@
from superscrape.listing_gen.category_config import get_category_config
from superscrape.listing_gen.models import (
BrandConfig,
+ ColorVariant,
ImageSlotSpec,
ProjectConfig,
SceneRequirement,
SlotMethod,
SlotType,
)
-from superscrape.output.models import CategoryVisualReport, ScrapedItem
+from superscrape.output.models import BenchmarkData, CategoryVisualReport, ScrapedItem
logger = logging.getLogger(__name__)
@@ -226,6 +227,7 @@ def auto_generate_config(
name=brand_name,
colors=brand_colors or [],
),
+ color_variants=[ColorVariant(name="default", photos=photo_map)] if photo_map else [],
main_images=slots,
scenes=scenes,
size_data=None, # Can be enhanced later
@@ -282,6 +284,7 @@ def generate_listing_text(
products: list[ScrapedItem],
report: CategoryVisualReport,
brand_name: str = "",
+ benchmark: BenchmarkData | None = None,
) -> dict:
"""Generate Amazon listing text (title, bullets, description) using GPT-5.2.
@@ -335,6 +338,15 @@ def generate_listing_text(
user_parts.append(f"\nPrice range: {price_range}")
user_parts.append(f"\nHigh-frequency keywords: {', '.join(common_keywords[:15])}")
user_parts.append(f"\nVisual insights: {report.has_person_ratio}% show models, {report.has_text_ratio}% use text overlays")
+
+ if benchmark and benchmark.total_products > 0:
+ user_parts.append(f"\nCategory benchmark ({benchmark.total_products:,} products):")
+ if benchmark.top_style_tags:
+ user_parts.append(f" Dominant styles: {', '.join(benchmark.top_style_tags[:6])}")
+ if benchmark.price_tier_distribution:
+ tiers = [f"{t}: {p}%" for t, p in list(benchmark.price_tier_distribution.items())[:3]]
+ user_parts.append(f" Price tiers: {', '.join(tiers)}")
+
+    user_parts.append("\nGenerate the complete Amazon listing text. Return ONLY JSON.")
try:
diff --git a/api/services/story_arc.py b/api/services/story_arc.py
index f22a9bd..9e265cb 100644
--- a/api/services/story_arc.py
+++ b/api/services/story_arc.py
@@ -12,7 +12,7 @@
from openai import OpenAI
-from superscrape.output.models import CategoryVisualReport, ScrapedItem
+from superscrape.output.models import BenchmarkData, CategoryVisualReport, ScrapedItem
logger = logging.getLogger(__name__)
@@ -66,6 +66,7 @@ def design_story_arc(
report: CategoryVisualReport,
products: list[ScrapedItem],
review_insights: dict | None = None,
+ benchmark: BenchmarkData | None = None,
) -> dict:
"""Design a 7-image story arc based on competitive intelligence.
@@ -119,6 +120,19 @@ def design_story_arc(
for p in products[:5]:
parts.append(f" - {p.title[:70]}... | {p.price} | {p.rating}★")
+ # Category benchmark (80K image dataset)
+ if benchmark and benchmark.total_products > 0:
+ parts.append(f"\n--- CATEGORY BENCHMARK ({benchmark.total_products:,} products / {benchmark.total_images:,} images) ---")
+ if benchmark.missing_slots_ranking:
+ parts.append("Most commonly missing image slots:")
+ for slot, pct in list(benchmark.missing_slots_ranking.items())[:5]:
+ parts.append(f" {slot}: {pct}% of products are missing this")
+ parts.append(f"A+ adoption: {benchmark.aplus_adoption_rate}%, avg score: {benchmark.aplus_avg_score}/9")
+ parts.append(f"Quality avg: {benchmark.quality_avg}/5")
+ if benchmark.top_style_tags:
+ parts.append(f"Category style: {', '.join(benchmark.top_style_tags[:6])}")
+ parts.append("IMPORTANT: Use missing slot data to prioritize which images to include. If 87% are missing size_chart, recommend it.")
+
context = "\n".join(parts)
try:
diff --git a/api/services/strategy_gen.py b/api/services/strategy_gen.py
index bf720e6..ce602b0 100644
--- a/api/services/strategy_gen.py
+++ b/api/services/strategy_gen.py
@@ -8,7 +8,7 @@
from openai import OpenAI
-from superscrape.output.models import CategoryVisualReport, ScrapedItem
+from superscrape.output.models import BenchmarkData, CategoryVisualReport, ScrapedItem
logger = logging.getLogger(__name__)
@@ -42,6 +42,7 @@ def _get_client() -> OpenAI:
def _build_context(
report: CategoryVisualReport,
products: list[ScrapedItem],
+ benchmark: BenchmarkData | None = None,
) -> str:
"""Build a concise context string from the report data."""
parts: list[str] = []
@@ -81,15 +82,42 @@ def _build_context(
for rec in report.recommendations:
parts.append(f" - {rec}")
+ # Category benchmark from 80K image dataset
+ if benchmark and benchmark.total_products > 0:
+ parts.append(f"\n--- CATEGORY BENCHMARK (from {benchmark.total_products:,} products / {benchmark.total_images:,} images) ---")
+ parts.append(f"Category: {benchmark.category}")
+
+ if benchmark.missing_slots_ranking:
+ parts.append("\nMost commonly missing image slots (% of products missing it):")
+ for slot, pct in list(benchmark.missing_slots_ranking.items())[:5]:
+ parts.append(f" {slot}: {pct}% missing")
+
+ parts.append(f"\nBenchmark quality average: {benchmark.quality_avg}/5")
+ parts.append(f"A+ Content adoption: {benchmark.aplus_adoption_rate}%")
+ if benchmark.aplus_avg_score:
+ parts.append(f"A+ average score: {benchmark.aplus_avg_score}/9")
+
+ if benchmark.price_tier_distribution:
+ parts.append("\nPrice tier distribution:")
+ for tier, pct in benchmark.price_tier_distribution.items():
+ parts.append(f" {tier}: {pct}%")
+
+ if benchmark.top_style_tags:
+ parts.append(f"\nTop style tags: {', '.join(benchmark.top_style_tags[:8])}")
+
+ parts.append("\nUSE THIS BENCHMARK DATA to make specific, data-backed recommendations.")
+ parts.append("Reference exact percentages and compare the user's competitors against the full category.")
+
return "\n".join(parts)
def generate_action_plan(
report: CategoryVisualReport,
products: list[ScrapedItem],
+ benchmark: BenchmarkData | None = None,
) -> list[dict]:
"""Generate a dynamic action plan based on competitive intelligence."""
- context = _build_context(report, products)
+ context = _build_context(report, products, benchmark=benchmark)
try:
resp = _get_client().chat.completions.create(
@@ -124,9 +152,10 @@ def generate_action_plan(
def generate_ab_tests(
report: CategoryVisualReport,
products: list[ScrapedItem],
+ benchmark: BenchmarkData | None = None,
) -> list[dict]:
"""Generate A/B test suggestions based on competitive intelligence."""
- context = _build_context(report, products)
+ context = _build_context(report, products, benchmark=benchmark)
try:
resp = _get_client().chat.completions.create(
diff --git a/docker-compose.yml b/docker-compose.yml
index 22d8dd0..8d7ec7e 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -10,6 +10,8 @@ services:
environment:
- OPENAI_API_KEY=${OPENAI_API_KEY}
- BYTEPLUSES_API_KEY=${BYTEPLUSES_API_KEY:-}
+ - PROXY_URL=${PROXY_URL:-}
+ - CAMOUFOX_PATH=${CAMOUFOX_PATH:-}
- REPORTS_DIR=/app/reports
volumes:
- reports:/app/reports
diff --git a/frontend/app/api/proxy/[...path]/route.ts b/frontend/app/api/proxy/[...path]/route.ts
index 96fb676..ace2162 100644
--- a/frontend/app/api/proxy/[...path]/route.ts
+++ b/frontend/app/api/proxy/[...path]/route.ts
@@ -87,15 +87,22 @@ export async function POST(request: NextRequest, context: RouteContext) {
const url = buildBackendUrl(pathParts, request.nextUrl.searchParams);
let body: BodyInit | null = null;
+ const headers: Record<string, string> = {};
const contentType = request.headers.get("content-type") ?? "";
- if (contentType.includes("application/json")) {
+
+ if (contentType.includes("multipart/form-data")) {
+ // Stream multipart body directly to preserve boundary
+ body = request.body;
+ headers["content-type"] = contentType;
+ } else if (contentType.includes("application/json")) {
body = await request.text();
+ headers["content-type"] = "application/json";
}
try {
const backendRes = await fetch(url, {
method: "POST",
- headers: { "Content-Type": "application/json" },
+ headers,
body: body ?? undefined,
cache: "no-store",
});
diff --git a/frontend/components/DataDashboard.tsx b/frontend/components/DataDashboard.tsx
index 332e3ac..03263f8 100644
--- a/frontend/components/DataDashboard.tsx
+++ b/frontend/components/DataDashboard.tsx
@@ -75,6 +75,23 @@ interface SeasonalAlert {
days_until: number;
}
+interface BenchmarkData {
+ category: string;
+ total_products: number;
+ total_images: number;
+ image_type_distribution: Record<string, number>;
+ angle_distribution: Record<string, number>;
+ background_distribution: Record<string, number>;
+ has_person_ratio: number;
+ has_text_ratio: number;
+ top_style_tags: string[];
+ quality_avg: number;
+ missing_slots_ranking: Record<string, number>;
+ aplus_adoption_rate: number;
+ aplus_avg_score: number;
+ price_tier_distribution: Record<string, number>;
+}
+
interface JobData {
products: Array<{ title: string; price: string; rating: number; reviews_count: number; reviews?: Array<{ text: string; stars: number }> }>;
analyses: Array<{ image_type: string; selling_point_angle: string; info_hierarchy: string; text_coverage_pct: number }>;
@@ -84,12 +101,14 @@ interface JobData {
seasonal_alerts: SeasonalAlert[];
review_insights: ReviewInsights;
story_arc: StoryArc;
+ benchmark: BenchmarkData | null;
}
-type Tab = "overview" | "reviews" | "story_arc" | "action_plan" | "ab_tests";
+type Tab = "overview" | "benchmark" | "reviews" | "story_arc" | "action_plan" | "ab_tests";
const TAB_LABELS: Record<Tab, string> = {
overview: "Overview",
+ benchmark: "Benchmark",
reviews: "Review Insights",
story_arc: "7-Image Strategy",
action_plan: "Action Plan",
@@ -166,7 +185,9 @@ export default function DataDashboard({ jobId }: { jobId: string }) {
{/* Tab navigation */}
- {(Object.keys(TAB_LABELS) as Tab[]).map((t) => (
+ {(Object.keys(TAB_LABELS) as Tab[]).filter((t) =>
+ t !== "benchmark" || (data.benchmark && data.benchmark.total_products > 0)
+ ).map((t) => (