From 71bed084c824c0909614cdc9b00c9056611011fa Mon Sep 17 00:00:00 2001 From: Eclair Ji Li Date: Fri, 29 May 2026 17:20:23 +0800 Subject: [PATCH 1/2] Add SG grocery product-search domain skills (Shopee, Lazada, FairPrice) Field-tested playbooks for searching and extracting frozen ramen/udon SKUs from the three Singapore grocery sites, captured during a competitor survey: - shopee/product-search.md: login wall, blocked search API, lazy-rendered grid, innerText price-line parser, IPC/timeout recovery ladder - lazada/product-search.md: price-anchored DOM walk, large-js()-returns-None payload cap, PDP spec-grid parser, card-price != RSP, RedMart = Lazada - fairprice/product-search.md: clean /product/ anchors, richest PDP (origin + storage fields), fuzzy-search filtering Also adds reusable Lazada/Shopee list + PDP extractors to agent_helpers.py. Co-Authored-By: Claude Opus 4.7 --- agent-workspace/agent_helpers.py | 141 ++++++++++++- .../domain-skills/fairprice/product-search.md | 151 +++++++++++++ .../domain-skills/lazada/product-search.md | 199 ++++++++++++++++++ .../domain-skills/shopee/product-search.md | 176 ++++++++++++++++ 4 files changed, 664 insertions(+), 3 deletions(-) create mode 100644 agent-workspace/domain-skills/fairprice/product-search.md create mode 100644 agent-workspace/domain-skills/lazada/product-search.md create mode 100644 agent-workspace/domain-skills/shopee/product-search.md diff --git a/agent-workspace/agent_helpers.py b/agent-workspace/agent_helpers.py index 2d493c17..414e4a04 100644 --- a/agent-workspace/agent_helpers.py +++ b/agent-workspace/agent_helpers.py @@ -1,7 +1,142 @@ """Agent-editable browser helpers. -Add task-specific browser primitives here. Core helpers from browser_harness.helpers -load this file when BH_AGENT_WORKSPACE points at this directory, or when this -repo's default agent-workspace exists. +Task-specific extractors for the frozen ramen/udon competitor survey. 
+Keeping JS here as plain strings avoids shell/heredoc escaping problems. """ +import json + +def _eval(expr): + from browser_harness.helpers import js + return js(expr) + + +def laz_list(limit=60): + """Lazada: price-anchored card extraction. Walk up from each price node + to a card container, grab its title. Small payload (filtered in JS).""" + expr = r""" + (function(){ + var out=[]; var seen={}; + var all=document.querySelectorAll('*'); + for (var i=0;i6) break; + } + } + if (title.length<6) continue; + var key=title.slice(0,60); + if (seen[key]) continue; seen[key]=1; + var link=''; + var la=card.querySelector('a[href*=".html"]'); + if (la) link=la.href.split('?')[0]; + out.push({name:title.slice(0,130), price:t, href:link}); + if (out.length>=LIMIT) break; + } + return JSON.stringify(out); + })() + """.replace("LIMIT", str(limit)) + r = _eval(expr) + return json.loads(r) if r else [] + + +def shopee_list(): + """Return list of {name, price, href} for a Shopee search page.""" + expr = r""" + (function(){ + var out=[]; var seen={}; + var items=document.querySelectorAll('li.shopee-search-item-result__item, .shopee-search-item-result__item'); + if(!items.length){ + var anchors=document.querySelectorAll('a'); + for (var i=0;i4) out.push({name:txt.slice(0,140), price:'', href:key}); + } + return JSON.stringify(out); + } + for (var i=0;i 20: + pass + # title: the heading repeats right after breadcrumb; take the longest + # line that looks like a product title + def nextval(i): + for j in range(i + 1, min(i + 5, len(lines))): + if lines[j].strip(): + return lines[j].strip() + return "" + for i, l in enumerate(lines): + if l.startswith("Brand:"): + d["brand"] = l.replace("Brand:", "").split("More ")[0].strip() + if l == "Pack Size": + d["pack_size"] = nextval(i) + if l == "Net Weight" and not d["pack_size"]: + d["pack_size"] = nextval(i) + if l == "Place of Origin": + d["origin"] = nextval(i) + if l == "Product Type": + d["ptype"] = nextval(i) + if l.startswith("Sold 
by"): + d["soldby"] = l.replace("Sold by", "").strip() + # price: the $x.xx just before the first "Add to cart" (skip $60 banner) + cut = txt.find("Add to cart") + seg = txt[:cut] if cut > 0 else txt + prices = re.findall(r"\$[\d,]+\.\d{2}", seg) + prices = [p for p in prices if p != "$60.00"] + if prices: + d["price"] = prices[-1] + elif re.search(r"\$[\d,]+\.\d{2}", txt): + d["price"] = re.search(r"\$[\d,]+\.\d{2}", txt).group(0) + # title: last segment of the breadcrumb line (tab-joined) + for l in lines: + if "\t" in l and ("Groceries" in l or "Frozen" in l): + d["title"] = l.split("\t")[-1].strip() + d["breadcrumb"] = " > ".join(l.split("\t")) + break + if not d["title"]: + cands = [l for l in lines[:30] if len(l) > 15 and any(c.isalpha() for c in l) + and "FEEDBACK" not in l and "delivery" not in l.lower()] + if cands: + d["title"] = max(cands, key=len) + return d diff --git a/agent-workspace/domain-skills/fairprice/product-search.md b/agent-workspace/domain-skills/fairprice/product-search.md new file mode 100644 index 00000000..0271bd5e --- /dev/null +++ b/agent-workspace/domain-skills/fairprice/product-search.md @@ -0,0 +1,151 @@ +# FairPrice (fairprice.com.sg) — Product Search & Data Extraction + +Field-tested against www.fairprice.com.sg on 2026-05-29 via a focused recon pass +(a few frozen ramen/udon searches + one PDP) during a competitor price survey. Coverage of +edge cases is lighter than the Shopee/Lazada skills, but the core search → extract → PDP +flow below is confirmed working. FairPrice SG is the **easiest of the three SG grocery +sites**: no login wall, no bot/traffic wall, no age popup, and a light page (~50 anchors, +~1–2k chars of text). + +## Navigation + +### Search URL (no login needed) +```python +goto_url("https://www.fairprice.com.sg/search?query=frozen%20ramen") # spaces = %20 (or +) +wait(6) # light page; results render in ~5-6s. wait_for_load() also tends to work here. 
+``` +- Title becomes `Results For | FairPrice`; body shows `Results for ""`. +- No delivery address/postal code is required to see prices (the "Enter your address" / + "Fees may apply" prompts do **not** block results). +- **Search is fuzzy** — a "frozen udon" query returned a shrimp-fritter ("Bakwan Udang"). + Always filter results by name relevance. + +### Product detail URL +Pattern: `/product/-`, e.g. +`/product/shimadaya-frozen-shoyu-ramen-noodles-with-soup-stock-frozen-477g-90016285` +(some house-brand items are slug-only, e.g. `/product/chicken-collagen-ramen---frozen`). +```python +goto_url("https://www.fairprice.com.sg/product/shimadaya-frozen-shoyu-ramen-noodles-with-soup-stock-frozen-477g-90016285") +wait(5) +``` + +## Search results extraction + +Unlike Lazada, FairPrice result cards use **clean `/product/` anchors**, so anchor on those +and climb to the card that contains the price. Each card's innerText is conveniently rich: +`$ By Add to cart` (or `... Out of stock`). + +```python +import json +expr = r""" +(function(){ + var out=[], seen={}; + var as=document.querySelectorAll('a[href*="/product/"]'); + for(var i=0;i=40) break; + } + return JSON.stringify(out); +})() +""" +items = json.loads(js(expr) or "[]") +``` + +Field notes: +- **`price`**: take the **first** `$x.xx` in the card text — that's the selling price. + FairPrice has no "$60 free-delivery" banner (the Lazada trap), so first-match is safe. +- **Card text begins with the price**, then the name. To get a clean name, strip the leading + price token: `name = re.sub(r'^\$[\d,]+\.\d{2}\s*', '', card)` then cut at ` By ` / + ` Add to cart` / ` Out of stock`. +- **Pack size is inline** in the card (e.g. `477 G`, `150 G`, `2 x 150 G`) — already captured + in the card text, no PDP needed for size. +- Result sets are **small and low-noise** (8 for "frozen ramen", 4 for "frozen udon") — far + cleaner than Shopee. 
No pagination/"load more" surfaced for these queries; treat the grid + as a single set. + +## Product detail page (PDP) — richest of the three sites + +PDP innerText carries labelled fields, each **value on the line after its label** (except +`Brand:` which is inline): + +``` +Home Frozen Frozen Food Ready Meals <- breadcrumb +$8.80 <- price (first $ on page) +Shimadaya Frozen Shoyu Ramen Noodles with Soup Stock <- name +477 G <- pack size +Brand:Shimadaya +Sold by: +Kirei Japanese Food Supply <- marketplace seller +KEY INFORMATION + +COUNTRY/PLACE OF ORIGIN +Japan +STORAGE INFORMATION +Keep frozen at or below -15 degree C ... <- confirms frozen vs chilled +``` + +Parser: +```python +import re +txt = js("document.body.innerText") or "" +lines=[l.strip() for l in txt.split("\n")] +def after(label): + for i,l in enumerate(lines): + if l==label: + for j in range(i+1, min(i+4,len(lines))): + if lines[j].strip(): return lines[j].strip() + return "" +d={} +m=re.search(r"\$[\d,]+\.\d{2}", txt); d["price"]=m.group(0) if m else "" +for l in lines: + if l.startswith("Brand:"): d["brand"]=l.split("Brand:")[1].strip() +d["origin"] = after("COUNTRY/PLACE OF ORIGIN") +d["storage"] = after("STORAGE INFORMATION") # tells frozen vs chilled +d["desc"] = after("KEY INFORMATION") +d["soldby"] = after("Sold by:") +# name/packsize: the two non-empty lines right after the price line +``` + +Fields you can reliably pull: **price, name, pack size, Brand, Sold-by seller, +COUNTRY/PLACE OF ORIGIN, STORAGE INFORMATION (frozen/chilled), KEY INFORMATION +(description/flavour/portions)**. This is the only one of the three sites that exposes a +storage line — useful to confirm a product is genuinely frozen. + +## Coverage note + +FairPrice carries SKUs **not** on Lazada (e.g. "Shimadaya Reito Tenobe Masari Frozen Udon", +"Kirei Premium Shoyu Tonkotsu Ramen") and overlaps on others (Little Totler, Shimadaya +Shoyu, Kirei, Daisho). 
Use it as a **complementary source** alongside Lazada/RedMart — +don't assume Lazada coverage is a superset. + +## Tab hygiene + +Open once, reuse with `goto_url`, close when done — don't `new_tab` per search. +```python +tid = new_tab("https://www.fairprice.com.sg/search?query=frozen%20ramen"); wait(6) +# reuse with goto_url for each subsequent query / PDP ... +close_tab(tid) +# end-of-task sweep: +for t in list_tabs(include_chrome=False): + if "fairprice.com.sg" in (t.get("url") or ""): close_tab(t["targetId"]) +``` + +## Gotchas (field-tested) + +- **No login / bot / age wall** — easiest of the three SG sites; anonymous search works. +- **Light, fast page** — `wait(5-6)` is plenty; no full-page-screenshot or scroll-hang issues like Shopee. +- **`/product/` anchors are clean** — anchor on them directly (no price-walk hack needed, unlike Lazada). +- **Card text starts with the price** — strip the leading `$x.xx` to get the name; pack size is inline in the card. +- **First `$` on a PDP IS the price** — no delivery-banner trap (unlike Lazada's `$60.00`). +- **PDP labels → value on the next line**; `Brand:` is inline. `STORAGE INFORMATION` confirms frozen vs chilled. +- **Fuzzy search** — irrelevant items can appear (e.g. "Udang" matched "udon"); filter by name. +- **Marketplace sellers** ("Sold by: ...") — many Japanese frozen items are 3rd-party marketplace listings, not FairPrice house stock. +- **Unique catalogue** — has SKUs absent from Lazada; treat as complementary, not redundant. +- **Reuse one tab, close when done.** diff --git a/agent-workspace/domain-skills/lazada/product-search.md b/agent-workspace/domain-skills/lazada/product-search.md new file mode 100644 index 00000000..ed5dc24a --- /dev/null +++ b/agent-workspace/domain-skills/lazada/product-search.md @@ -0,0 +1,199 @@ +# Lazada (lazada.sg) & RedMart — Product Search & Data Extraction + +Field-tested against www.lazada.sg on 2026-05-29 (frozen ramen/udon competitor survey). 
+Unlike Shopee, Lazada SG search works **without login** and is far more reliable. The two +things that wasted calls were (1) large `js()` payloads silently returning `None`, and +(2) product links not following one clean pattern — both solved below. + +RedMart lives inside Lazada: `redmart.lazada.sg` redirects to `www.lazada.sg`, RedMart +items appear in normal Lazada search (with a RedMart badge/tab), and clicking one lands on +a `www.lazada.sg/products/...html` PDP with RedMart delivery UI. Treat RedMart as Lazada. + +## Navigation + +### Search URL (primary entry point — no login needed) +```python +goto_url("https://www.lazada.sg/catalog/?q=frozen+ramen") # spaces = + +wait(4) +``` +- `/catalog/?q=X` frequently 302s to `/tag/X/?q=X&catalog_redirect_tag=true`. That's normal — + results still load. Confirm with `page_info()["url"]`. +- Product detail URL pattern: `https://www.lazada.sg/products/pdp-i.html` + (sometimes `...-i-s.html`). + +### Load the whole result grid before extracting +Lazada lazy-loads cards on scroll. `window.scrollTo` works here (unlike Shopee), so step +through a few offsets: +```python +for y in (400, 1200, 2200, 3200): + js("window.scrollTo(0, %d)" % y); wait(0.6) +``` + +## Two traps that cost real time + +### Trap 1 — large `js()` results come back as `None` (IPC size cap) +`js("JSON.stringify(Array.from(document.querySelectorAll('a')).map(...))")` over **all** +anchors (1000+ on a Lazada page) returns `None`/empty: the response exceeds the IPC +`recv(1<<16)` framing and is dropped. **Always filter and cap inside the JS** so the +returned payload is small (≤ ~60 short items). Small `js()` reads (a `.length`, one field) +work fine; it's only the big serialisations that fail. + +### Trap 2 — product links have no single clean selector +On search/tag pages, `/products/` anchors are mostly the 2 sponsored ads; the real result +cards use varying markup, so href-filtering misses them. 
**Anchor on the price instead.** +Find leaf nodes whose text is exactly a price, climb to the enclosing card, read its title +link. This reliably yielded ~39 products per page: + +```python +import json +expr = r""" +(function(){ + var out=[], seen={}; + var all=document.querySelectorAll('*'); + for (var i=0;i6) break; } + } + if (title.length<6) continue; + var key=title.slice(0,60); if(seen[key]) continue; seen[key]=1; + var la=card.querySelector('a[href*=".html"]'); if(la) link=la.href.split('?')[0]; + out.push({name:title.slice(0,130), price:t, href:link}); + if (out.length>=60) break; // cap payload — see Trap 1 + } + return JSON.stringify(out); +})() +""" +items = json.loads(js(expr) or "[]") +``` + +Note the JS escapes (`\$`, `\d`, `\s`) survive a `cat <<'PY'` heredoc because the delimiter +is quoted. If you instead keep the extractor in `agent-workspace/agent_helpers.py`, you +avoid all heredoc-escaping risk entirely (recommended for repeated runs). + +## Single-invocation rule (avoids stale-tab `None`) + +Each `browser-harness < <- tab-joined breadcrumb + +Brand:<Brand>More Frozen from <Brand> +Pack Size + +477 g +Place of Origin + +Japan +Product Type + +Frozen +$8.80 <- real price (the $60.00 above is the "free delivery" banner) +Add to cart +... 
+Sold by <seller> +``` + +Parser: +```python +import re +lines=[l.strip() for l in txt.split("\n")] +def nextval(i): # value sits 1-4 lines below its label (blanks between) + for j in range(i+1, min(i+5,len(lines))): + if lines[j].strip(): return lines[j].strip() + return "" +d={"title":"","brand":"","pack_size":"","origin":"","ptype":"","price":"","soldby":""} +for i,l in enumerate(lines): + if l.startswith("Brand:"): d["brand"]=l.replace("Brand:","").split("More ")[0].strip() + if l=="Pack Size": d["pack_size"]=nextval(i) + if l=="Net Weight" and not d["pack_size"]: d["pack_size"]=nextval(i) + if l=="Place of Origin": d["origin"]=nextval(i) + if l=="Product Type": d["ptype"]=nextval(i) + if l.startswith("Sold by"): d["soldby"]=l.replace("Sold by","").strip() +# price: the $x.xx just before "Add to cart", skipping the $60.00 free-delivery banner +cut=txt.find("Add to cart"); seg=txt[:cut] if cut>0 else txt +ps=[p for p in re.findall(r"\$[\d,]+\.\d{2}", seg) if p!="$60.00"] +if ps: d["price"]=ps[-1] +# title: last segment of the tab-joined breadcrumb line +for l in lines: + if "\t" in l and ("Groceries" in l or "Frozen" in l): + d["title"]=l.split("\t")[-1].strip(); break +``` + +Fields available: Brand, Pack Size (or Net Weight), Place of Origin, Product Type +(e.g. "Frozen"), price, Sold-by seller, breadcrumb category. RedMart-fulfilled listings +(sellers like "Kirei Japanese Food Supply", "Soon Seng Huat") reliably populate the grid; +so do most marketplace grocery sellers once you scroll far enough. + +## Price gotcha: card price ≠ RSP + +Search-result cards may show a **voucher/promo** price (e.g. $6.80) while the PDP shows the +true RSP (e.g. $7.90). For an accurate survey, open the PDP and read its price for RSP, and +record the card price as the promo. The "Platinum save $X / Coin save $X" lines in card +text are loyalty deltas, not the price — ignore them. 
+ +## Age-verification popup + +Some queries trigger a "you must be at least 21 years" modal that blocks the grid. Dismiss +by button text (don't click pixels — layout shifts): +```python +js("Array.from(document.querySelectorAll('button')).find(b=>b.innerText==='Over 21')?.click()") +wait(1) +``` + +## Tab hygiene + +Reuse one tab; don't `new_tab` per search. Close when done. +```python +tid = new_tab("https://www.lazada.sg/catalog/?q=frozen+ramen"); wait(4) +# ... reuse with goto_url for each subsequent query ... +close_tab(tid) +# end-of-task sweep: +for t in list_tabs(include_chrome=False): + if "lazada.sg" in t["url"]: close_tab(t["targetId"]) +``` + +## Exhaustiveness tip + +One keyword misses SKUs. Run several phrasings and union by product name, e.g. for ramen: +`"frozen ramen"`, `"frozen ramen noodle soup"`, `"japanese frozen ramen"`; for udon: +`"frozen udon"`, `"frozen udon noodle soup"`, `"sanuki udon frozen"`. Lazada/RedMart had +far deeper frozen Japanese-noodle coverage than Shopee — it's the primary source for this +category. + +## Gotchas (field-tested) + +- **No login wall** — anonymous catalog search works (the opposite of Shopee). +- **Big `js()` serialisations return `None`** — filter + cap (≤~60 items) inside the JS. +- **Price-anchored extraction** beats href/selector hunting on result pages. +- **One invocation per nav+extract** — separate calls re-attach and may return `None`. +- **PDP spec values are 1-4 lines below the label** (blank lines between) — scan forward. +- **First `$` on a PDP is the `$60.00` delivery banner** — take the price before "Add to cart". +- **Card price may be a voucher price**, not RSP — confirm RSP on the PDP. +- **`/catalog/` 302s to `/tag/`** — expected, results still load. +- **Age popup** on some queries — click the "Over 21" button by text. +- **RedMart = Lazada** — same search and PDP; `redmart.lazada.sg` redirects to `www.lazada.sg`. +- **Reuse one tab, close when done** — avoid tab pile-up. 
diff --git a/agent-workspace/domain-skills/shopee/product-search.md b/agent-workspace/domain-skills/shopee/product-search.md new file mode 100644 index 00000000..ec36b701 --- /dev/null +++ b/agent-workspace/domain-skills/shopee/product-search.md @@ -0,0 +1,176 @@ +# Shopee (shopee.sg) — Product Search & Data Extraction + +Field-tested against shopee.sg on 2026-05-29 using a **logged-in** Chrome session +(competitor price survey for frozen ramen/udon). Shopee SG is heavy, lazy-rendered, +and bot-defended. The patterns below are what actually survived; the naive Playwright +reflexes (wait_for_load, full screenshot, JS scroll, search API) all fail here. + +## Prerequisites — you MUST be logged in + +Anonymous sessions are blocked. Navigating to any `shopee.sg/...` URL while logged out +redirects to a traffic-verification wall: + +``` +https://shopee.sg/verify/traffic/error?...&is_logged_in=false&...&type=4 +title: "Shopee Singapore | Cheaper, Faster On Shopee" +body : "Page Unavailable — Looks like you're not logged in yet." +``` + +There is no programmatic bypass. If you land on `/verify/traffic/error`, stop and ask the +user to log into Shopee in their Chrome, then retry. Detect it early: + +```python +def shopee_blocked(): + return "/verify/traffic/error" in page_info()["url"] +``` + +The search API is also blocked — do **not** waste a call on it: +```python +# http_get("https://shopee.sg/api/v4/search/search_items?keyword=...") -> HTTP 403 +``` +All extraction must go through the rendered DOM in the logged-in browser. + +## Tab hygiene (do this — Shopee tabs pile up fast) + +Open Shopee **once** with `new_tab`, then reuse that one tab with `goto_url` for every +subsequent search. Close it when the task is done. Do NOT call `new_tab` per search — that +is how you end up with dozens of orphan tabs. 
+ +```python +# first search of the session +tid = new_tab("https://shopee.sg/search?keyword=frozen%20ramen") +wait(8) + +# every later search: reuse the SAME tab +goto_url("https://shopee.sg/search?keyword=frozen%20udon") +wait(8) + +# when finished with Shopee entirely +close_tab(tid) # or close_tab() to close the current tab +``` + +If you spawned strays during exploration, sweep them at the end: +```python +for t in list_tabs(include_chrome=False): + if "shopee.sg" in t["url"]: + close_tab(t["targetId"]) +``` + +## Navigation + +### Search URL (only reliable entry point) +```python +goto_url("https://shopee.sg/search?keyword=frozen%20ramen%20soup") # spaces = %20 (or +) +wait(8) # NOT wait_for_load() — see Gotchas +``` +- Pagination: append `&page=N` (0-indexed). The UI shows a `1/N` page counter. +- Sort: the UI exposes Relevance / Latest / Top Sales / Price tabs; relevance (default) is fine + for surveys. Sorting via URL params is unreliable — click the tab if you must. + +### Product detail page +Product URLs carry the id pattern `...-i.<shopId>.<itemId>`: +```python +goto_url("https://shopee.sg/product-name-i.123456.7890123") +wait(7) +``` + +## The three things that WILL bite you + +### 1. `wait_for_load()` times out — use a fixed `wait()` +Shopee's main thread stays busy long after `readyState=complete`, so the +`js("document.readyState")` poll inside `wait_for_load()` raises a CDP timeout. +Use a hard `wait(7)`–`wait(9)` after navigation instead. + +### 2. Full-page screenshots time out — use `full=False` +The results page is very tall; `capture_screenshot()` (full page) exceeds the IPC +deadline. Always pass `full=False` for a viewport-only grab: +```python +img = capture_screenshot(full=False) +``` + +### 3. The result grid lazy-renders, and `window.scrollTo` can hang +After navigation the SEARCH FILTER sidebar appears but the product grid is blank until +the viewport scrolls. 
`js("window.scrollTo(0, N)")` itself sometimes times out because +the main thread is blocked. The reliable nudge is a CDP **keyboard End** event, then wait: + +```python +cdp("Input.dispatchKeyEvent", type="keyDown", key="End", windowsVirtualKeyCode=35) +cdp("Input.dispatchKeyEvent", type="keyUp", key="End", windowsVirtualKeyCode=35) +wait(3) +``` + +A blank grid shows `len(js("document.body.innerText"))` ≈ 150–200. After the grid +renders it jumps to ~3000+. If a `js()`/screenshot call raises `TimeoutError`, the page +was mid-render — just retry the read in a **separate** call (the page keeps loading in +the background; do not restart the daemon for this). + +## Search results extraction — parse innerText, not selectors + +The DOM cards (`li.shopee-search-item-result__item`) render lazily and querySelector often +returns 0 right after load. The robust method is parsing `document.body.innerText`, where +each result is laid out as **separate lines** with the `$` on its own line: + +``` +<product name> +$ +12.90 +-15% (optional discount line) +4.9 (rating, optional) +2k+ sold (optional) +2 Days (delivery, optional) +SG +Find Similar +``` + +Parser (find the `$` line, take the line before as name, the line after as price): +```python +import re +txt = js("document.body.innerText") or "" +lines = [l.strip() for l in txt.split("\n") if l.strip()] +out, seen = [], set() +for i, l in enumerate(lines): + if l == "$" and i+1 < len(lines) and re.match(r"^[\d,]+\.\d{2}$", lines[i+1]): + name = lines[i-1] + if len(name) > 8 and "sold" not in name and name not in seen: + seen.add(name) + out.append(("$" + lines[i+1], name)) +``` + +Results start just after the line `Search result for '<keyword>'` and the `1/N` counter; +everything before that (trending keywords, filter labels) is chrome — the `len>8` and +`"sold" not in name` guards drop most of it. 
+ +Selector fallback (only works once cards are in view — scroll first): +```python +js("document.querySelectorAll('li.shopee-search-item-result__item').length") +``` + +## Relevance filtering (Shopee search is noisy) + +A query like "frozen ramen" returns mostly **instant cup/packet noodles, soup-base +concentrates, and restaurant catering bundles** — not frozen retail packs. Filter by +keyword in the product name and sanity-check the price band. In this survey, genuine +frozen retail ramen/udon was scarce on Shopee; the real catalogue depth was on +Lazada/RedMart. Don't assume an empty/odd result set is a bug — Shopee genuinely may not +stock the SKU. + +## Daemon / IPC recovery + +Heavy Shopee pages occasionally throw `TimeoutError: timed out` from the IPC layer. +Escalate gently: +1. Retry the same read in a new `browser-harness` call (cheapest; usually enough). +2. `ensure_real_tab()` then retry, if the session looks detached. +3. `restart_daemon()` only as a last resort — on Chrome 144+ it re-triggers the + "Allow remote debugging?" popup, which the user must click again. Avoid mid-task. + +## Gotchas (field-tested) + +- **Logged-out = hard wall** at `/verify/traffic/error` (`type=4`). No bypass; ask the user to log in. +- **Search API → 403.** Browser DOM only. +- **`wait_for_load()` times out.** Use `wait(7-9)`. +- **`capture_screenshot()` (full page) times out.** Use `capture_screenshot(full=False)`. +- **`window.scrollTo` can hang.** Nudge with a CDP `End` key event, then `wait(3)`. +- **Prices are split text nodes.** In innerText the `$` and the number are on separate lines — parse accordingly; `querySelector('[class*=price]')` is brittle. +- **Cards render lazily.** querySelector count is 0 until the grid scrolls into view. +- **Reuse one tab.** `goto_url` for repeat searches; `close_tab()` when done. Never `new_tab` per query. +- **Noisy results.** Instant noodles / soup bases / catering dominate "frozen" queries — filter by name. 
From 9994b3563147cc8de942c706b0fcbe57b9299f2c Mon Sep 17 00:00:00 2001 From: Eclair Ji Li <eclairjili@ajisin.local> Date: Thu, 4 Jun 2026 13:29:54 +0800 Subject: [PATCH 2/2] Remove dead-code stub in laz_detail() The breadcrumb-scan loop body was just `pass` and did nothing; the real breadcrumb/title extraction happens lower in the function. No behavior change. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> --- agent-workspace/agent_helpers.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/agent-workspace/agent_helpers.py b/agent-workspace/agent_helpers.py index 414e4a04..e74d13ca 100644 --- a/agent-workspace/agent_helpers.py +++ b/agent-workspace/agent_helpers.py @@ -95,10 +95,6 @@ def laz_detail(): d = {"title": "", "brand": "", "pack_size": "", "origin": "", "ptype": "", "price": "", "soldby": "", "breadcrumb": ""} import re - # breadcrumb line contains tab-joined path ending in the title - for l in lines: - if "Groceries" in l and "\t" not in l and len(l) > 20: - pass # title: the heading repeats right after breadcrumb; take the longest # line that looks like a product title def nextval(i):