diff --git a/Dockerfile b/Dockerfile index 5ec52ae..5e16884 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,16 +32,18 @@ COPY superscrape/ superscrape/ COPY api/ api/ # Install Python deps (non-editable for Docker) -RUN pip install --no-cache-dir . +RUN pip install --no-cache-dir ".[api]" -# Download Camoufox browser binary -RUN python -c "from camoufox.sync_api import Camoufox; print('Camoufox binary ready')" || true +# Download Camoufox browser binary (FF135 from daijro/camoufox releases) +# Then print any bundled firefox* paths so the build log shows what was found +RUN python -m camoufox fetch && \ + python -c "import camoufox, glob, os; d=os.path.dirname(camoufox.__file__); print('Binary:', glob.glob(f'{d}/**/firefox*', recursive=True)[:3])" EXPOSE 8001 HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \ CMD curl -f http://localhost:8001/health || exit 1 -# xvfb-run provides virtual display for headless Camoufox -CMD xvfb-run --auto-servernum --server-args="-screen 0 1280x720x24" \ - uvicorn api.main:app --host 0.0.0.0 --port 8001 +# Camoufox headless='virtual' manages its own Xvfb internally. +# Do NOT wrap with xvfb-run — dual Xvfb causes display conflicts. 
+CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "8001"] diff --git a/api/main.py b/api/main.py index d75fe71..6eb66e1 100644 --- a/api/main.py +++ b/api/main.py @@ -14,8 +14,10 @@ from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware +from api.routes.benchmarks import router as benchmarks_router from api.routes.exports import router as exports_router from api.routes.jobs import router as jobs_router +from api.routes.listing import router as listing_router from api.routes.monitor import router as monitor_router from api.routes.reports import router as reports_router from api.routes.uploads import router as uploads_router @@ -61,10 +63,12 @@ async def lifespan(app: FastAPI) -> AsyncIterator[None]: # ── Routers ─────────────────────────────────────────────────────────────────── app.include_router(jobs_router) +app.include_router(listing_router) app.include_router(reports_router) app.include_router(uploads_router) app.include_router(exports_router) app.include_router(monitor_router) +app.include_router(benchmarks_router) @app.get("/health", tags=["meta"]) diff --git a/api/reports/html_builder.py b/api/reports/html_builder.py index 643be24..7e28422 100644 --- a/api/reports/html_builder.py +++ b/api/reports/html_builder.py @@ -514,12 +514,132 @@ def _build_lifestyle_section( # ── Main Builder ────────────────────────────────────────────────────────────── +def _build_benchmark_section( + benchmark: object, + report: "CategoryVisualReport", + lang: str = "en", +) -> str: + """Build the Category Benchmark comparison section.""" + b = benchmark if isinstance(benchmark, dict) else (benchmark.model_dump() if hasattr(benchmark, "model_dump") else {}) + if not b or not b.get("total_products"): + return "" + + is_zh = lang == "zh" + title = "品类基准线 (80K 图片数据)" if is_zh else "Category Benchmark (80K Image Dataset)" + vs_label = "你的竞品" if is_zh else "Your Competitors" + bench_label = "品类基准" if is_zh else "Category Benchmark" + + # ── Killer 
stat cards ── + missing_slots = b.get("missing_slots_ranking", {}) + top_missing = list(missing_slots.items())[:3] + aplus_rate = b.get("aplus_adoption_rate", 0) + aplus_score = b.get("aplus_avg_score", 0) + quality_avg = b.get("quality_avg", 0) + + cards_html = '
' + cards_html += f'''
+
{b.get("total_products", 0):,}
+
{"基准产品数" if is_zh else "Benchmark Products"}
+
''' + cards_html += f'''
+
{b.get("total_images", 0):,}
+
{"基准图片数" if is_zh else "Benchmark Images"}
+
''' + cards_html += f'''
+
{aplus_rate:.0f}%
+
{"有A+内容" if is_zh else "Have A+ Content"}
+
''' + cards_html += f'''
+
{quality_avg:.1f}/5
+
{"平均图片质量" if is_zh else "Avg Image Quality"}
+
''' + cards_html += '
' + + # ── Missing slots (opportunities) ── + opp_title = "最大机会点" if is_zh else "Biggest Opportunities" + opp_html = f'

{opp_title}

' + opp_html += '
' + for slot, pct in top_missing: + label = f"{pct:.0f}% {'缺少' if is_zh else 'missing'} {slot}" + opp_html += f'''
+
{pct:.0f}%
+
{"缺少" if is_zh else "missing"} {slot}
+
''' + opp_html += '
' + + # ── Image type comparison bars ── + comp_title = "图片类型对比" if is_zh else "Image Type Comparison" + bench_dist = b.get("image_type_distribution", {}) + report_dist = report.image_type_distribution if report else {} + + bars_html = f'

{comp_title}

' + bars_html += f'''
+ {vs_label} + {bench_label} +
''' + + all_types = sorted(set(list(bench_dist.keys()) + list(report_dist.keys()))) + for img_type in all_types: + yours = report_dist.get(img_type, 0) + theirs = bench_dist.get(img_type, 0) + delta = yours - theirs + delta_color = "#10B981" if delta > 0 else "#EF4444" if delta < 0 else "#6B7280" + delta_sign = "+" if delta > 0 else "" + + bars_html += f'''
+
+ {img_type} +
+
+ {yours:.0f}% +
+
+
+ +
+
+ {theirs:.0f}% + {delta_sign}{delta:.0f}% +
+
+
''' + + # ── Price tier distribution ── + price_dist = b.get("price_tier_distribution", {}) + price_html = "" + if price_dist: + price_title = "价格定位分布" if is_zh else "Price Tier Distribution" + price_html = f'

{price_title}

' + price_html += '
' + tier_colors = {"budget": "#10B981", "mid_range": "#2563EB", "premium": "#7C3AED", "luxury": "#D97706"} + for tier, pct in price_dist.items(): + color = tier_colors.get(tier, "#6B7280") + price_html += f'''
+
{pct:.0f}%
+
{tier.replace("_", " ").title()}
+
''' + price_html += '
' + + return f''' +
+

+ 📊 {title} +

+ {cards_html} + {opp_html} + {bars_html} + {price_html} +
+ ''' + + def build_html_report( report: "CategoryVisualReport", products: list["ScrapedItem"], analyses: list["ImageAnalysis"], lifestyle_images: list[str], language: Language = Language.en, + benchmark: object | None = None, ) -> str: """Build a self-contained, bilingual HTML report. @@ -545,6 +665,7 @@ def build_html_report( rec_section = _build_recommendations(report, lbl) product_grid = _build_product_grid(report, products, lbl, platform) lifestyle_section = _build_lifestyle_section(lifestyle_images, lbl) + benchmark_section = _build_benchmark_section(benchmark, report, lang) if benchmark else "" css = _get_css() @@ -587,6 +708,9 @@ def build_html_report( {dist_section} + + {benchmark_section} + {rec_section} diff --git a/api/routes/benchmarks.py b/api/routes/benchmarks.py new file mode 100644 index 0000000..9b9e3b7 --- /dev/null +++ b/api/routes/benchmarks.py @@ -0,0 +1,48 @@ +"""Benchmark API routes — category-level visual intelligence from 80K images.""" + +from __future__ import annotations + +from fastapi import APIRouter, Query + +from api.services.benchmark_service import ( + get_benchmark, + get_categories, + get_reverse_prompts, + get_top_gaps, +) + +router = APIRouter(prefix="/benchmarks", tags=["benchmarks"]) + + +@router.get("/categories") +async def list_categories() -> list[dict]: + """List all available benchmark categories with product/image counts.""" + return get_categories() + + +@router.get("/{category}") +async def category_benchmark(category: str) -> dict: + """Get full benchmark data for a category (matched via keyword).""" + bench = get_benchmark(category) + if bench is None: + return {"error": f"No benchmark data found for '{category}'"} + return bench.model_dump() + + +@router.get("/{category}/top-gaps") +async def category_top_gaps( + category: str, + limit: int = Query(default=5, ge=1, le=20), +) -> list[dict]: + """Get the biggest opportunity gaps for a category.""" + return get_top_gaps(category, limit=limit) + + 
+@router.get("/{category}/reverse-prompts") +async def category_reverse_prompts( + category: str, + type: str = Query(default="", description="Filter by image type (e.g. lifestyle, model)"), + limit: int = Query(default=10, ge=1, le=50), +) -> list[dict]: + """Get reference reverse prompts for a category.""" + return get_reverse_prompts(category, image_type=type, limit=limit) diff --git a/api/routes/exports.py b/api/routes/exports.py index f79382c..be1f1c1 100644 --- a/api/routes/exports.py +++ b/api/routes/exports.py @@ -81,6 +81,10 @@ async def export_job(job_id: str) -> StreamingResponse: if hasattr(data, "listing_text") and data.listing_text: zf.writestr("listing_text.json", json.dumps(data.listing_text, indent=2, ensure_ascii=False)) + # Benchmark data + if hasattr(data, "benchmark") and data.benchmark: + zf.writestr("benchmark.json", json.dumps(data.benchmark, indent=2, ensure_ascii=False)) + buf.seek(0) keyword = "" diff --git a/api/routes/jobs.py b/api/routes/jobs.py index 4d392bc..935bc1f 100644 --- a/api/routes/jobs.py +++ b/api/routes/jobs.py @@ -80,6 +80,7 @@ async def get_job_data(job_id: str) -> dict: "seasonal_alerts": data.seasonal_alerts, "review_insights": data.review_insights, "story_arc": data.story_arc, + "benchmark": data.benchmark, } diff --git a/api/routes/listing.py b/api/routes/listing.py new file mode 100644 index 0000000..8d92646 --- /dev/null +++ b/api/routes/listing.py
@@ -0,0 +1,89 @@
+"""Listing import routes: import pre-generated listing-gen projects."""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import shutil
+from pathlib import Path
+
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel
+
+logger = logging.getLogger(__name__)
+router = APIRouter(prefix="/listing", tags=["listing"])
+
+_REPORTS_DIR = Path(os.environ.get("REPORTS_DIR", "/tmp/superscrape_reports"))
+
+
+class ImportRequest(BaseModel):
+    project_dir: str
+
+
+def _read_json(path: Path) -> dict:
+    """Return the parsed JSON stored at *path*, or ``{}`` if the file is absent.
+
+    Raises HTTPException(422) when the file exists but cannot be decoded, so a
+    corrupt project surfaces as a client error instead of an unhandled 500.
+    """
+    if not path.exists():
+        return {}
+    try:
+        return json.loads(path.read_text(encoding="utf-8"))
+    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
+        raise HTTPException(status_code=422, detail=f"Invalid JSON in {path.name}: {exc}") from exc
+
+
+@router.post("/jobs/{job_id}/import")
+async def import_listing_project(job_id: str, req: ImportRequest) -> dict:
+    """Import an existing listing-gen project into a superscrape job.
+
+    Reads brief.json, listing_text.json, and compliance_report.json, and
+    copies generated images so they're accessible via the standard
+    reports/static routes.
+
+    Raises:
+        HTTPException: 400 if ``job_id`` is not a plain directory name,
+            404 if ``project_dir`` is not an existing directory,
+            422 if one of the JSON files cannot be parsed.
+    """
+    # job_id becomes a directory name under REPORTS_DIR; reject anything that
+    # could resolve outside it (".", "..", or embedded path separators).
+    if job_id in {".", ".."} or Path(job_id).name != job_id:
+        raise HTTPException(status_code=400, detail=f"Invalid job id: {job_id}")
+
+    src = Path(req.project_dir)
+    if not src.is_dir():
+        raise HTTPException(status_code=404, detail=f"Project directory not found: {req.project_dir}")
+
+    output_dir = src / "output"
+
+    brief = _read_json(src / "brief.json")
+    listing_text = _read_json(output_dir / "listing_text.json")
+    compliance = _read_json(output_dir / "compliance_report.json")
+
+    # Copy generated images to REPORTS_DIR/{job_id}/images/
+    images_dest = _REPORTS_DIR / job_id / "images"
+    images_dest.mkdir(parents=True, exist_ok=True)
+
+    image_paths: list[dict[str, str]] = []
+    if output_dir.exists():
+        for img_file in sorted(output_dir.glob("slot_*.png")):
+            dst = images_dest / img_file.name
+            shutil.copy2(img_file, dst)
+            image_paths.append({
+                "filename": img_file.name,
+                "path": f"/reports/{job_id}/images/{img_file.name}",
+            })
+
+    logger.info("Imported listing project %s → job %s (%d images)", req.project_dir, job_id, len(image_paths))
+
+    return {
+        "job_id": job_id,
+        "brief": brief,
+        "listing_text": listing_text,
+        "compliance": compliance,
+        "images": image_paths,
+        "images_count": len(image_paths),
+    }
diff --git a/api/services/benchmark_service.py b/api/services/benchmark_service.py new file mode 100644 index 0000000..494fded --- /dev/null +++ b/api/services/benchmark_service.py @@ -0,0 +1,364 @@ +"""Category benchmark service — loads 80K 
pre-analyzed clothing image data. + +Provides category-level baselines (image types, angles, backgrounds, missing +slots, A+ adoption, price tiers) so that per-job scrape results can be +compared against the full dataset of 4,000+ products / 80,000+ images. +""" + +from __future__ import annotations + +import json +import logging +from collections import Counter, defaultdict +from pathlib import Path + +from superscrape.output.models import BenchmarkData + +logger = logging.getLogger(__name__) + +_DATA_DIR = Path(__file__).resolve().parent.parent.parent / "projects" / "clothing_images" + +# Pre-aggregated benchmarks keyed by L2 category name (e.g. "Active", "Jeans") +_benchmarks: dict[str, BenchmarkData] = {} +# Global (all-category) benchmark +_global_benchmark: BenchmarkData | None = None +_loaded = False + + +def _pct(count: int, total: int) -> float: + """Return percentage rounded to 1 decimal, safe against zero division.""" + return round(count / total * 100, 1) if total else 0.0 + + +def _load_data() -> None: + """Load and pre-aggregate the 80K dataset into per-category benchmarks.""" + global _benchmarks, _global_benchmark, _loaded + if _loaded: + return + + products_path = _DATA_DIR / "all_products.json" + images_path = _DATA_DIR / "image_analysis.jsonl" + product_analysis_path = _DATA_DIR / "product_analysis.jsonl" + aplus_path = _DATA_DIR / "aplus_analysis.jsonl" + + if not products_path.exists(): + logger.warning("Benchmark data not found at %s — skipping", _DATA_DIR) + _loaded = True + return + + # ── Step 1: Load product metadata (ASIN → category mapping) ────────── + logger.info("Loading benchmark products from %s ...", products_path) + with open(products_path) as f: + raw_products = json.load(f) + + asin_to_cat: dict[str, str] = {} # asin → L2 category + asin_to_catpath: dict[str, str] = {} + cat_product_counts: Counter[str] = Counter() + + for p in raw_products: + asin = p.get("asin", "") + cat_path = p.get("category_path", "") + parts = 
cat_path.split(" > ") + l2 = parts[1] if len(parts) >= 2 else "Other" + asin_to_cat[asin] = l2 + asin_to_catpath[asin] = cat_path + cat_product_counts[l2] += 1 + + total_products = len(raw_products) + logger.info("Loaded %d products across %d categories", total_products, len(cat_product_counts)) + + # ── Step 2: Aggregate image analysis per category ──────────────────── + # Per-category accumulators + cat_img_count: Counter[str] = Counter() + cat_img_type: dict[str, Counter] = defaultdict(Counter) + cat_angle: dict[str, Counter] = defaultdict(Counter) + cat_bg: dict[str, Counter] = defaultdict(Counter) + cat_has_person: Counter[str] = Counter() # images with model category + cat_has_text: Counter[str] = Counter() + cat_quality_sum: dict[str, float] = defaultdict(float) + cat_style_tags: dict[str, Counter] = defaultdict(Counter) + cat_reverse_prompts: dict[str, list] = defaultdict(list) + + if images_path.exists(): + logger.info("Loading image analyses from %s ...", images_path) + with open(images_path) as f: + for line in f: + try: + img = json.loads(line) + except json.JSONDecodeError: + continue + + asin = img.get("asin", "") + cat = asin_to_cat.get(asin, "Other") + + cat_img_count[cat] += 1 + cat_img_count["__all__"] += 1 + + # Image type (category field in image_analysis = image type) + img_type = img.get("category", "unknown") + cat_img_type[cat][img_type] += 1 + cat_img_type["__all__"][img_type] += 1 + + # Camera angle + comp = img.get("composition", {}) + angle = comp.get("camera_angle", "unknown") + cat_angle[cat][angle] += 1 + cat_angle["__all__"][angle] += 1 + + # Background + bg = comp.get("background", "unknown") + cat_bg[cat][bg] += 1 + cat_bg["__all__"][bg] += 1 + + # Person/model detection (image type == "model") + if img_type == "model": + cat_has_person[cat] += 1 + cat_has_person["__all__"] += 1 + + # Text overlay + text_info = img.get("text_overlay", {}) + if text_info.get("has_text"): + cat_has_text[cat] += 1 + cat_has_text["__all__"] += 1 + + # 
Quality score + qs = img.get("quality_score", 0) + cat_quality_sum[cat] += qs + cat_quality_sum["__all__"] += qs + + # Style tags + for tag in img.get("style_tags", []): + cat_style_tags[cat][tag] += 1 + cat_style_tags["__all__"][tag] += 1 + + # Reverse prompts (keep top ones per category, limit memory) + rp = img.get("reverse_prompt", "") + if rp and len(cat_reverse_prompts[cat]) < 50: + cat_reverse_prompts[cat].append({ + "prompt": rp, + "type": img_type, + "url": img.get("url", ""), + }) + + total_images = cat_img_count["__all__"] + logger.info("Loaded %d image analyses", total_images) + else: + total_images = 0 + + # ── Step 3: Load product analysis (missing slots, A+, price tiers) ─── + cat_missing: dict[str, Counter] = defaultdict(Counter) + cat_aplus_has: Counter[str] = Counter() + cat_aplus_score_sum: dict[str, float] = defaultdict(float) + cat_aplus_count: Counter[str] = Counter() + cat_price_tier: dict[str, Counter] = defaultdict(Counter) + cat_analysis_count: Counter[str] = Counter() + + if product_analysis_path.exists(): + logger.info("Loading product analyses from %s ...", product_analysis_path) + with open(product_analysis_path) as f: + for line in f: + try: + pa = json.loads(line) + except json.JSONDecodeError: + continue + + asin = pa.get("asin", "") + cat = asin_to_cat.get(asin, "Other") + cat_analysis_count[cat] += 1 + cat_analysis_count["__all__"] += 1 + + # Missing slots + missing = pa.get("main_sequence", {}).get("missing_slots", []) + for slot in missing: + cat_missing[cat][slot] += 1 + cat_missing["__all__"][slot] += 1 + + # A+ content + aplus = pa.get("aplus_content", {}) + if aplus.get("has_aplus"): + cat_aplus_has[cat] += 1 + cat_aplus_has["__all__"] += 1 + score = aplus.get("aplus_score", 0) + if score: + cat_aplus_score_sum[cat] += score + cat_aplus_score_sum["__all__"] += score + cat_aplus_count[cat] += 1 + cat_aplus_count["__all__"] += 1 + + # Price tier + tier = pa.get("brand_positioning", {}).get("price_tier", "unknown") + 
cat_price_tier[cat][tier] += 1 + cat_price_tier["__all__"][tier] += 1 + + # ── Step 4: Build BenchmarkData per category ───────────────────────── + def _build_benchmark(cat_key: str, display_name: str) -> BenchmarkData: + n_imgs = cat_img_count.get(cat_key, 0) + n_prods = cat_product_counts.get(cat_key, 0) if cat_key != "__all__" else total_products + n_analysis = cat_analysis_count.get(cat_key, 0) + + # Convert counters to percentage dicts + def _counter_to_pct(counter: Counter, total: int) -> dict[str, float]: + return {k: _pct(v, total) for k, v in counter.most_common()} + + # Missing slots as percentage of analyzed products + missing_pct = {k: _pct(v, n_analysis) for k, v in cat_missing.get(cat_key, Counter()).most_common()} + + # Top style tags + top_tags = [tag for tag, _ in cat_style_tags.get(cat_key, Counter()).most_common(10)] + + # Quality avg + q_avg = round(cat_quality_sum.get(cat_key, 0) / n_imgs, 2) if n_imgs else 0.0 + + # A+ stats + aplus_rate = _pct(cat_aplus_has.get(cat_key, 0), n_analysis) if n_analysis else 0.0 + aplus_avg = round( + cat_aplus_score_sum.get(cat_key, 0) / cat_aplus_count.get(cat_key, 1), 1 + ) if cat_aplus_count.get(cat_key, 0) else 0.0 + + return BenchmarkData( + category=display_name, + matched_categories=[cat_key] if cat_key != "__all__" else list(cat_product_counts.keys()), + total_products=n_prods, + total_images=n_imgs, + image_type_distribution=_counter_to_pct(cat_img_type.get(cat_key, Counter()), n_imgs), + angle_distribution=_counter_to_pct(cat_angle.get(cat_key, Counter()), n_imgs), + background_distribution=_counter_to_pct(cat_bg.get(cat_key, Counter()), n_imgs), + has_person_ratio=_pct(cat_has_person.get(cat_key, 0), n_imgs), + has_text_ratio=_pct(cat_has_text.get(cat_key, 0), n_imgs), + top_style_tags=top_tags, + quality_avg=q_avg, + missing_slots_ranking=missing_pct, + aplus_adoption_rate=aplus_rate, + aplus_avg_score=aplus_avg, + price_tier_distribution=_counter_to_pct(cat_price_tier.get(cat_key, Counter()), 
n_analysis), + top_reverse_prompts=cat_reverse_prompts.get(cat_key, [])[:10], + ) + + # Build per-category benchmarks + for cat_name in cat_product_counts: + _benchmarks[cat_name] = _build_benchmark(cat_name, cat_name) + + # Build global benchmark + _global_benchmark = _build_benchmark("__all__", "All Clothing") + _loaded = True + + logger.info( + "Benchmark service ready: %d categories, %d products, %d images", + len(_benchmarks), total_products, total_images, + ) + + +def get_categories() -> list[dict[str, int]]: + """Return list of available benchmark categories with product counts.""" + _load_data() + return [ + {"category": b.category, "total_products": b.total_products, "total_images": b.total_images} + for b in sorted(_benchmarks.values(), key=lambda b: b.total_products, reverse=True) + ] + + +def get_benchmark(keyword: str) -> BenchmarkData | None: + """Match a search keyword to the most relevant category benchmark. + + Matching strategy: + 1. Exact L2 category name match (case-insensitive) + 2. Keyword substring match against category names + 3. 
Fall back to global (all-category) benchmark + """ + _load_data() + if not _benchmarks: + return None + + kw_lower = keyword.lower() + + # Exact match + for cat_name, bench in _benchmarks.items(): + if cat_name.lower() == kw_lower: + return bench + + # Substring / keyword match — score each category + kw_words = set(kw_lower.split()) + best_score = 0 + best_cat = None + + _KEYWORD_MAP: dict[str, list[str]] = { + "Active": ["active", "activewear", "sport", "athletic", "workout", "gym", "running", "yoga", "legging", "shorts"], + "Jeans": ["jean", "jeans", "denim"], + "Fashion Hoodies & Sweatshirts": ["hoodie", "hoodies", "sweatshirt", "pullover", "fleece"], + "Baby Boys": ["baby", "boy", "infant", "toddler", "onesie"], + "Baby Girls": ["baby", "girl", "infant", "toddler"], + "Bodysuits": ["bodysuit", "romper", "jumpsuit"], + "Jackets & Coats": ["jacket", "coat", "parka", "windbreaker", "bomber", "puffer"], + "Underwear": ["underwear", "boxer", "brief", "panties"], + "Dresses": ["dress", "gown", "sundress", "maxi", "midi"], + "Socks & Hosiery": ["sock", "socks", "hosiery", "stocking"], + "Clothing Sets": ["set", "outfit", "matching"], + "Socks & Tights": ["tights", "leotard"], + "Swim": ["swim", "swimsuit", "swimwear", "bikini", "trunk"], + "Lingerie, Sleep & Lounge": ["lingerie", "pajama", "sleepwear", "lounge", "nightgown", "robe"], + "Suits & Sport Coats": ["suit", "blazer", "sport coat", "formal"], + "Coats, Jackets & Vests": ["vest", "gilet", "waistcoat"], + "Swimsuits & Cover Ups": ["cover up", "sarong", "beach"], + "Suiting & Blazers": ["suiting", "blazer"], + "Overalls": ["overall", "dungaree", "bib"], + } + + for cat_name, bench in _benchmarks.items(): + score = 0 + cat_words = set(cat_name.lower().split()) + + # Direct word overlap + overlap = kw_words & cat_words + score += len(overlap) * 3 + + # Keyword map matching + mapped_words = _KEYWORD_MAP.get(cat_name, []) + for w in kw_words: + if w in mapped_words: + score += 2 + # Partial match (e.g. 
"shirt" in "sweatshirt") + for mw in mapped_words: + if w in mw or mw in w: + score += 1 + + # Substring of category name + if kw_lower in cat_name.lower(): + score += 3 + + if score > best_score: + best_score = score + best_cat = cat_name + + if best_cat and best_score >= 2: + return _benchmarks[best_cat] + + # Fall back to global benchmark for weak/no matches + return _global_benchmark + + +def get_top_gaps(keyword: str, limit: int = 5) -> list[dict]: + """Return the top opportunity gaps for a category.""" + bench = get_benchmark(keyword) + if not bench: + return [] + + gaps: list[dict] = [] + for slot, pct in list(bench.missing_slots_ranking.items())[:limit]: + gaps.append({ + "slot": slot, + "missing_pct": pct, + "insight": f"{pct}% of products in this category are missing '{slot}' — adding one puts you ahead of the majority.", + }) + return gaps + + +def get_reverse_prompts(keyword: str, image_type: str = "", limit: int = 10) -> list[dict]: + """Return reference reverse prompts for a category, optionally filtered by image type.""" + bench = get_benchmark(keyword) + if not bench: + return [] + + prompts = bench.top_reverse_prompts + if image_type: + prompts = [p for p in prompts if p.get("type", "") == image_type] + return prompts[:limit] diff --git a/api/services/job_runner.py b/api/services/job_runner.py index 61ab6f9..8179730 100644 --- a/api/services/job_runner.py +++ b/api/services/job_runner.py @@ -121,6 +121,20 @@ def _run_pipeline_sync(job_id: str, req: JobRequest) -> None: platform=pval, ) + # ── Step 3.5: Load category benchmark ──────────────────────────── + benchmark = None + try: + from api.services.benchmark_service import get_benchmark + benchmark = get_benchmark(keyword) + if benchmark: + _emit( + job_id, PipelineStep.analyzing, + f"Loaded category benchmark ({benchmark.total_products} products, {benchmark.total_images} images)", + 73, platform=pval, + ) + except Exception as exc: + logger.warning("Benchmark loading failed (non-fatal): %s", exc) 
+ # ── Step 4: Report ───────────────────────────────────────────────── _emit(job_id, PipelineStep.building_report, "Building category report...", 88, platform=pval) report = aggregate_report(keyword, products, analyses, platform=platform.value) @@ -141,6 +155,7 @@ def _run_pipeline_sync(job_id: str, req: JobRequest) -> None: analyses=analyses, lifestyle_images=lifestyle_images, language=req.language, + benchmark=benchmark, ) html_path.write_text(html_content, encoding="utf-8") @@ -167,10 +182,10 @@ def _run_pipeline_sync(job_id: str, req: JobRequest) -> None: from api.services.strategy_gen import generate_ab_tests, generate_action_plan review_insights = analyze_reviews(products) - action_plan = generate_action_plan(report, products) - ab_tests = generate_ab_tests(report, products) + action_plan = generate_action_plan(report, products, benchmark=benchmark) + ab_tests = generate_ab_tests(report, products, benchmark=benchmark) seasonal_alerts = get_seasonal_alerts(keyword) - story_arc = design_story_arc(report, products, review_insights) + story_arc = design_story_arc(report, products, review_insights, benchmark=benchmark) logger.info( "Strategy gen done: %d phases, %d tests, %d alerts, %d pain points, story arc=%s", len(action_plan), len(ab_tests), len(seasonal_alerts), @@ -193,6 +208,7 @@ def _run_pipeline_sync(job_id: str, req: JobRequest) -> None: seasonal_alerts=seasonal_alerts, review_insights=review_insights, story_arc=story_arc, + benchmark=benchmark.model_dump() if benchmark else None, ) report_url = f"/reports/{job_id}/html" diff --git a/api/services/job_store.py b/api/services/job_store.py index 06ddd65..a25048b 100644 --- a/api/services/job_store.py +++ b/api/services/job_store.py @@ -30,7 +30,7 @@ class JobData: """Container for pipeline output data (products, analyses, report, strategy).""" - __slots__ = ("products", "analyses", "report", "action_plan", "ab_tests", "seasonal_alerts", "review_insights", "story_arc") + __slots__ = ("products", "analyses", 
"report", "action_plan", "ab_tests", "seasonal_alerts", "review_insights", "story_arc", "benchmark") def __init__( self, @@ -42,6 +42,7 @@ def __init__( seasonal_alerts: list | None = None, review_insights: dict | None = None, story_arc: dict | None = None, + benchmark: dict | None = None, ) -> None: self.products = products self.analyses = analyses @@ -51,6 +52,7 @@ def __init__( self.seasonal_alerts = seasonal_alerts or [] self.review_insights = review_insights or {} self.story_arc = story_arc or {} + self.benchmark = benchmark or {} def _serialize_report(report: object) -> dict | None: @@ -106,6 +108,7 @@ def _load_persisted(self) -> None: seasonal_alerts=data_raw.get("seasonal_alerts", []), review_insights=data_raw.get("review_insights", {}), story_arc=data_raw.get("story_arc", {}), + benchmark=data_raw.get("benchmark", {}), ) loaded += 1 except Exception as e: @@ -153,6 +156,7 @@ def _persist_data(self, job_id: str) -> None: "seasonal_alerts": data.seasonal_alerts, "review_insights": data.review_insights, "story_arc": data.story_arc, + "benchmark": data.benchmark, } path = data_dir / f"{job_id}.json" @@ -266,6 +270,7 @@ def save_data( seasonal_alerts: list | None = None, review_insights: dict | None = None, story_arc: dict | None = None, + benchmark: dict | None = None, ) -> None: """Store structured pipeline output for later retrieval via /data endpoint.""" with self._lock: @@ -278,6 +283,7 @@ def save_data( seasonal_alerts=seasonal_alerts, review_insights=review_insights, story_arc=story_arc, + benchmark=benchmark, ) self._persist_data(job_id) diff --git a/api/services/listing_service.py b/api/services/listing_service.py index f1ddb72..bac9597 100644 --- a/api/services/listing_service.py +++ b/api/services/listing_service.py @@ -20,13 +20,14 @@ from superscrape.listing_gen.category_config import get_category_config from superscrape.listing_gen.models import ( BrandConfig, + ColorVariant, ImageSlotSpec, ProjectConfig, SceneRequirement, SlotMethod, SlotType, ) 
-from superscrape.output.models import CategoryVisualReport, ScrapedItem +from superscrape.output.models import BenchmarkData, CategoryVisualReport, ScrapedItem logger = logging.getLogger(__name__) @@ -226,6 +227,7 @@ def auto_generate_config( name=brand_name, colors=brand_colors or [], ), + color_variants=[ColorVariant(name="default", photos=photo_map)] if photo_map else [], main_images=slots, scenes=scenes, size_data=None, # Can be enhanced later @@ -282,6 +284,7 @@ def generate_listing_text( products: list[ScrapedItem], report: CategoryVisualReport, brand_name: str = "", + benchmark: BenchmarkData | None = None, ) -> dict: """Generate Amazon listing text (title, bullets, description) using GPT-5.2. @@ -335,6 +338,15 @@ def generate_listing_text( user_parts.append(f"\nPrice range: {price_range}") user_parts.append(f"\nHigh-frequency keywords: {', '.join(common_keywords[:15])}") user_parts.append(f"\nVisual insights: {report.has_person_ratio}% show models, {report.has_text_ratio}% use text overlays") + + if benchmark and benchmark.total_products > 0: + user_parts.append(f"\nCategory benchmark ({benchmark.total_products:,} products):") + if benchmark.top_style_tags: + user_parts.append(f" Dominant styles: {', '.join(benchmark.top_style_tags[:6])}") + if benchmark.price_tier_distribution: + tiers = [f"{t}: {p}%" for t, p in list(benchmark.price_tier_distribution.items())[:3]] + user_parts.append(f" Price tiers: {', '.join(tiers)}") + user_parts.append(f"\nGenerate the complete Amazon listing text. 
Return ONLY JSON.") try: diff --git a/api/services/story_arc.py b/api/services/story_arc.py index f22a9bd..9e265cb 100644 --- a/api/services/story_arc.py +++ b/api/services/story_arc.py @@ -12,7 +12,7 @@ from openai import OpenAI -from superscrape.output.models import CategoryVisualReport, ScrapedItem +from superscrape.output.models import BenchmarkData, CategoryVisualReport, ScrapedItem logger = logging.getLogger(__name__) @@ -66,6 +66,7 @@ def design_story_arc( report: CategoryVisualReport, products: list[ScrapedItem], review_insights: dict | None = None, + benchmark: BenchmarkData | None = None, ) -> dict: """Design a 7-image story arc based on competitive intelligence. @@ -119,6 +120,19 @@ def design_story_arc( for p in products[:5]: parts.append(f" - {p.title[:70]}... | {p.price} | {p.rating}★") + # Category benchmark (80K image dataset) + if benchmark and benchmark.total_products > 0: + parts.append(f"\n--- CATEGORY BENCHMARK ({benchmark.total_products:,} products / {benchmark.total_images:,} images) ---") + if benchmark.missing_slots_ranking: + parts.append("Most commonly missing image slots:") + for slot, pct in list(benchmark.missing_slots_ranking.items())[:5]: + parts.append(f" {slot}: {pct}% of products are missing this") + parts.append(f"A+ adoption: {benchmark.aplus_adoption_rate}%, avg score: {benchmark.aplus_avg_score}/9") + parts.append(f"Quality avg: {benchmark.quality_avg}/5") + if benchmark.top_style_tags: + parts.append(f"Category style: {', '.join(benchmark.top_style_tags[:6])}") + parts.append("IMPORTANT: Use missing slot data to prioritize which images to include. 
If 87% are missing size_chart, recommend it.") + context = "\n".join(parts) try: diff --git a/api/services/strategy_gen.py b/api/services/strategy_gen.py index bf720e6..ce602b0 100644 --- a/api/services/strategy_gen.py +++ b/api/services/strategy_gen.py @@ -8,7 +8,7 @@ from openai import OpenAI -from superscrape.output.models import CategoryVisualReport, ScrapedItem +from superscrape.output.models import BenchmarkData, CategoryVisualReport, ScrapedItem logger = logging.getLogger(__name__) @@ -42,6 +42,7 @@ def _get_client() -> OpenAI: def _build_context( report: CategoryVisualReport, products: list[ScrapedItem], + benchmark: BenchmarkData | None = None, ) -> str: """Build a concise context string from the report data.""" parts: list[str] = [] @@ -81,15 +82,42 @@ def _build_context( for rec in report.recommendations: parts.append(f" - {rec}") + # Category benchmark from 80K image dataset + if benchmark and benchmark.total_products > 0: + parts.append(f"\n--- CATEGORY BENCHMARK (from {benchmark.total_products:,} products / {benchmark.total_images:,} images) ---") + parts.append(f"Category: {benchmark.category}") + + if benchmark.missing_slots_ranking: + parts.append("\nMost commonly missing image slots (% of products missing it):") + for slot, pct in list(benchmark.missing_slots_ranking.items())[:5]: + parts.append(f" {slot}: {pct}% missing") + + parts.append(f"\nBenchmark quality average: {benchmark.quality_avg}/5") + parts.append(f"A+ Content adoption: {benchmark.aplus_adoption_rate}%") + if benchmark.aplus_avg_score: + parts.append(f"A+ average score: {benchmark.aplus_avg_score}/9") + + if benchmark.price_tier_distribution: + parts.append("\nPrice tier distribution:") + for tier, pct in benchmark.price_tier_distribution.items(): + parts.append(f" {tier}: {pct}%") + + if benchmark.top_style_tags: + parts.append(f"\nTop style tags: {', '.join(benchmark.top_style_tags[:8])}") + + parts.append("\nUSE THIS BENCHMARK DATA to make specific, data-backed recommendations.") 
+ parts.append("Reference exact percentages and compare the user's competitors against the full category.") + return "\n".join(parts) def generate_action_plan( report: CategoryVisualReport, products: list[ScrapedItem], + benchmark: BenchmarkData | None = None, ) -> list[dict]: """Generate a dynamic action plan based on competitive intelligence.""" - context = _build_context(report, products) + context = _build_context(report, products, benchmark=benchmark) try: resp = _get_client().chat.completions.create( @@ -124,9 +152,10 @@ def generate_action_plan( def generate_ab_tests( report: CategoryVisualReport, products: list[ScrapedItem], + benchmark: BenchmarkData | None = None, ) -> list[dict]: """Generate A/B test suggestions based on competitive intelligence.""" - context = _build_context(report, products) + context = _build_context(report, products, benchmark=benchmark) try: resp = _get_client().chat.completions.create( diff --git a/docker-compose.yml b/docker-compose.yml index 22d8dd0..8d7ec7e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -10,6 +10,8 @@ services: environment: - OPENAI_API_KEY=${OPENAI_API_KEY} - BYTEPLUSES_API_KEY=${BYTEPLUSES_API_KEY:-} + - PROXY_URL=${PROXY_URL:-} + - CAMOUFOX_PATH=${CAMOUFOX_PATH:-} - REPORTS_DIR=/app/reports volumes: - reports:/app/reports diff --git a/frontend/app/api/proxy/[...path]/route.ts b/frontend/app/api/proxy/[...path]/route.ts index 96fb676..ace2162 100644 --- a/frontend/app/api/proxy/[...path]/route.ts +++ b/frontend/app/api/proxy/[...path]/route.ts @@ -87,15 +87,22 @@ export async function POST(request: NextRequest, context: RouteContext) { const url = buildBackendUrl(pathParts, request.nextUrl.searchParams); let body: BodyInit | null = null; + const headers: Record = {}; const contentType = request.headers.get("content-type") ?? 
""; - if (contentType.includes("application/json")) { + + if (contentType.includes("multipart/form-data")) { + // Stream multipart body directly to preserve boundary + body = request.body; + headers["content-type"] = contentType; + } else if (contentType.includes("application/json")) { body = await request.text(); + headers["content-type"] = "application/json"; } try { const backendRes = await fetch(url, { method: "POST", - headers: { "Content-Type": "application/json" }, + headers, body: body ?? undefined, cache: "no-store", }); diff --git a/frontend/components/DataDashboard.tsx b/frontend/components/DataDashboard.tsx index 332e3ac..03263f8 100644 --- a/frontend/components/DataDashboard.tsx +++ b/frontend/components/DataDashboard.tsx @@ -75,6 +75,23 @@ interface SeasonalAlert { days_until: number; } +interface BenchmarkData { + category: string; + total_products: number; + total_images: number; + image_type_distribution: Record; + angle_distribution: Record; + background_distribution: Record; + has_person_ratio: number; + has_text_ratio: number; + top_style_tags: string[]; + quality_avg: number; + missing_slots_ranking: Record; + aplus_adoption_rate: number; + aplus_avg_score: number; + price_tier_distribution: Record; +} + interface JobData { products: Array<{ title: string; price: string; rating: number; reviews_count: number; reviews?: Array<{ text: string; stars: number }> }>; analyses: Array<{ image_type: string; selling_point_angle: string; info_hierarchy: string; text_coverage_pct: number }>; @@ -84,12 +101,14 @@ interface JobData { seasonal_alerts: SeasonalAlert[]; review_insights: ReviewInsights; story_arc: StoryArc; + benchmark: BenchmarkData | null; } -type Tab = "overview" | "reviews" | "story_arc" | "action_plan" | "ab_tests"; +type Tab = "overview" | "benchmark" | "reviews" | "story_arc" | "action_plan" | "ab_tests"; const TAB_LABELS: Record = { overview: "Overview", + benchmark: "Benchmark", reviews: "Review Insights", story_arc: "7-Image Strategy", 
action_plan: "Action Plan", @@ -166,7 +185,9 @@ export default function DataDashboard({ jobId }: { jobId: string }) { {/* Tab navigation */}
- {(Object.keys(TAB_LABELS) as Tab[]).map((t) => ( + {(Object.keys(TAB_LABELS) as Tab[]).filter((t) => + t !== "benchmark" || (data.benchmark && data.benchmark.total_products > 0) + ).map((t) => (