From 34f550b874d2f4b6c435fab8b81c9b19a43b6920 Mon Sep 17 00:00:00 2001 From: Yilin Jing Date: Fri, 27 Feb 2026 20:50:41 +0800 Subject: [PATCH 1/3] =?UTF-8?q?perf(#26):=20render=5Fjs=3Dfalse=20(5?= =?UTF-8?q?=E2=86=921=20credit),=20session=5Fid=20per=20category,=20ai=5Fs?= =?UTF-8?q?elector,=20usage=20monitoring?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- backend/internal/service/cron.go | 33 +++++++-- .../internal/service/kickstarter_scraping.go | 40 ++++++++--- .../internal/service/scrapingbee_client.go | 69 +++++++++++++++---- 3 files changed, 116 insertions(+), 26 deletions(-) diff --git a/backend/internal/service/cron.go b/backend/internal/service/cron.go index 022afa6..286e381 100644 --- a/backend/internal/service/cron.go +++ b/backend/internal/service/cron.go @@ -1,8 +1,10 @@ package service import ( + "context" "fmt" "log" + "math/rand" "os" "strconv" "time" @@ -48,6 +50,27 @@ func (s *CronService) Start() { }) s.scheduler.Start() log.Println("Cron scheduler started (02:00 UTC daily)") + + // Log credit balance at startup so we know headroom before the first crawl. 
+ go func() { + ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) + defer cancel() + usage, err := s.scrapingService.client.FetchUsage(ctx) + if err != nil { + log.Printf("ScrapingBee usage check failed: %v", err) + return + } + pct := 0.0 + if usage.MaxCredits > 0 { + pct = float64(usage.UsedCredits) / float64(usage.MaxCredits) * 100 + } + log.Printf("ScrapingBee usage: %d/%d credits (%.1f%%), renews %s", + usage.UsedCredits, usage.MaxCredits, pct, usage.RenewalDate) + if pct >= 80 { + log.Printf("WARNING: ScrapingBee credits above 80%% (%d/%d) — consider upgrading plan", + usage.UsedCredits, usage.MaxCredits) + } + }() } func (s *CronService) Stop() { @@ -97,11 +120,12 @@ func (s *CronService) RunCrawlNow() error { for _, sortCfg := range buildCrawlSorts() { for _, cat := range crawlCategories { - // sortCfg.pageDepth is env-configurable (can raise or lower). - // cat.PageDepth is only the default used when no env var is set. depth := sortCfg.pageDepth + // Assign a sticky session_id so all pages for this (sort, category) + // pass through the same proxy IP — less likely to trigger rate limits. 
+ sessionID := rand.Intn(10_000_000) + 1 for page := 1; page <= depth; page++ { - campaigns, err := s.scrapingService.DiscoverCampaigns(cat.ID, sortCfg.sort, page) + campaigns, err := s.scrapingService.DiscoverCampaigns(cat.ID, sortCfg.sort, page, sessionID) if err != nil { log.Printf("Cron: ScrapingBee error sort=%s cat=%s page=%d: %v", sortCfg.sort, cat.ID, page, err) break @@ -172,8 +196,9 @@ func (s *CronService) RunBackfill() error { for _, sortCfg := range sorts { for _, cat := range crawlCategories { depth := sortCfg.depth + sessionID := rand.Intn(10_000_000) + 1 for page := 1; page <= depth; page++ { - campaigns, err := s.scrapingService.DiscoverCampaigns(cat.ID, sortCfg.sort, page) + campaigns, err := s.scrapingService.DiscoverCampaigns(cat.ID, sortCfg.sort, page, sessionID) if err != nil { log.Printf("Backfill: error sort=%s cat=%s page=%d: %v", sortCfg.sort, cat.ID, page, err) break diff --git a/backend/internal/service/kickstarter_scraping.go b/backend/internal/service/kickstarter_scraping.go index ca70b32..8dedb3d 100644 --- a/backend/internal/service/kickstarter_scraping.go +++ b/backend/internal/service/kickstarter_scraping.go @@ -25,7 +25,7 @@ func NewKickstarterScrapingService(apiKey string, maxConcurrent int) *Kickstarte } } -// Search searches for campaigns using AI extraction (10 credits per request) +// Search searches for campaigns using AI extraction (6 credits per request). func (s *KickstarterScrapingService) Search(term, categoryID, sort, cursor string, first int) (*SearchResult, error) { ctx := context.Background() @@ -40,10 +40,11 @@ func (s *KickstarterScrapingService) Search(term, categoryID, sort, cursor strin // Build Kickstarter discover URL with page discoverURL := s.buildDiscoverURL(term, categoryID, sort, page) - // Try AI extraction first + // Try AI extraction first; ai_selector focuses only on project cards, reducing processing time. aiQuery := "Extract all projects from this page. 
For each project return a JSON object with these fields: name, slug, creator_slug (the creator's URL slug, e.g. 'john-doe' from kickstarter.com/projects/john-doe/...), project_url (full canonical Kickstarter URL), goal, pledged, currency, deadline, creator, category, photo_url, blurb." + aiSelector := "[data-project]" - aiResult, err := s.client.ExtractWithAI(ctx, discoverURL, aiQuery) + aiResult, err := s.client.ExtractWithAI(ctx, discoverURL, aiQuery, aiSelector) if err == nil { campaigns, parseErr := s.parseAIResponse(aiResult) if parseErr == nil && len(campaigns) > 0 { @@ -69,7 +70,7 @@ func (s *KickstarterScrapingService) Search(term, categoryID, sort, cursor strin } // Fallback to HTML parsing - html, err := s.client.FetchHTML(ctx, discoverURL) + html, err := s.client.FetchHTMLInSession(ctx, discoverURL, 0) if err != nil { return nil, fmt.Errorf("fetch HTML: %w", err) } @@ -96,15 +97,14 @@ func (s *KickstarterScrapingService) Search(term, categoryID, sort, cursor strin }, nil } -// DiscoverCampaigns fetches campaigns for a specific category using HTML parsing (5 credits) -func (s *KickstarterScrapingService) DiscoverCampaigns(categoryID string, sort string, page int) ([]model.Campaign, error) { +// DiscoverCampaigns fetches campaigns for a specific category using HTML parsing (1 credit). +// sessionID routes all pages for the same category through the same proxy IP (sticky session). 
+func (s *KickstarterScrapingService) DiscoverCampaigns(categoryID string, sort string, page int, sessionID int) ([]model.Campaign, error) { ctx := context.Background() - // Build URL discoverURL := s.buildDiscoverURL("", categoryID, sort, page) - // Fetch HTML only (cheaper than AI extraction) - html, err := s.client.FetchHTML(ctx, discoverURL) + html, err := s.client.FetchHTMLInSession(ctx, discoverURL, sessionID) if err != nil { return nil, fmt.Errorf("fetch HTML: %w", err) } @@ -124,6 +124,28 @@ func (s *KickstarterScrapingService) FetchCategories() ([]model.Category, error) return kickstarterCategories, nil } +// LogUsage fetches and logs the current monthly credit consumption. +// Logs a WARNING if usage exceeds 80% of the monthly allowance. +func (s *KickstarterScrapingService) LogUsage() { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + usage, err := s.client.FetchUsage(ctx) + if err != nil { + log.Printf("ScrapingBee usage check failed: %v", err) + return + } + pct := 0.0 + if usage.MaxCredits > 0 { + pct = float64(usage.UsedCredits) / float64(usage.MaxCredits) * 100 + } + log.Printf("ScrapingBee usage: %d/%d credits (%.1f%%), renews %s", + usage.UsedCredits, usage.MaxCredits, pct, usage.RenewalDate) + if pct >= 80 { + log.Printf("WARNING: ScrapingBee credits above 80%% (%d/%d) — consider upgrading plan", + usage.UsedCredits, usage.MaxCredits) + } +} + func (s *KickstarterScrapingService) buildDiscoverURL(term, categoryID, sort string, page int) string { baseURL := "https://www.kickstarter.com/discover/advanced" diff --git a/backend/internal/service/scrapingbee_client.go b/backend/internal/service/scrapingbee_client.go index 00f833d..611c284 100644 --- a/backend/internal/service/scrapingbee_client.go +++ b/backend/internal/service/scrapingbee_client.go @@ -2,16 +2,27 @@ package service import ( "context" + "encoding/json" "fmt" "io" "log" "net/http" "net/url" + "strconv" "time" ) const scrapingBeeBaseURL = 
"https://app.scrapingbee.com/api/v1" +// UsageResult holds the response from /api/v1/usage. +type UsageResult struct { + MaxCredits int `json:"max_api_credit"` + UsedCredits int `json:"used_api_credit"` + MaxConcurrency int `json:"max_concurrency"` + CurrentConcurrency int `json:"current_concurrency"` + RenewalDate string `json:"renewal_subscription_date"` +} + type ScrapingBeeClient struct { apiKey string baseURL string @@ -56,36 +67,71 @@ func NewScrapingBeeClient(apiKey string, maxConcurrent int) *ScrapingBeeClient { } } -// FetchHTML fetches raw HTML from a URL using ScrapingBee (5 credits) +// FetchHTML fetches raw HTML without JS rendering (1 credit). +// Kickstarter's discover pages are server-side rendered so JS is not needed. func (c *ScrapingBeeClient) FetchHTML(ctx context.Context, targetURL string) (string, error) { - return c.doRequest(ctx, targetURL, false, "") + return c.doRequest(ctx, targetURL, false, "", "", 0) +} + +// FetchHTMLInSession fetches raw HTML using a sticky session_id so all requests +// for the same crawl pass share the same proxy IP (1 credit). +func (c *ScrapingBeeClient) FetchHTMLInSession(ctx context.Context, targetURL string, sessionID int) (string, error) { + return c.doRequest(ctx, targetURL, false, "", "", sessionID) } -// ExtractWithAI fetches and extracts data using AI (10 credits) -func (c *ScrapingBeeClient) ExtractWithAI(ctx context.Context, targetURL string, query string) (string, error) { - return c.doRequest(ctx, targetURL, true, query) +// ExtractWithAI fetches and extracts data using AI (6 credits: 1 base + 5 AI). +// aiSelector narrows the AI's focus to a CSS selector, speeding up extraction. 
+func (c *ScrapingBeeClient) ExtractWithAI(ctx context.Context, targetURL, query, aiSelector string) (string, error) { + return c.doRequest(ctx, targetURL, true, query, aiSelector, 0) } -func (c *ScrapingBeeClient) doRequest(ctx context.Context, targetURL string, useAI bool, aiQuery string) (string, error) { - // Rate limiting +// FetchUsage returns the current monthly credit consumption (not rate-limited). +func (c *ScrapingBeeClient) FetchUsage(ctx context.Context) (*UsageResult, error) { + params := url.Values{} + params.Set("api_key", c.apiKey) + reqURL := fmt.Sprintf("%s/usage?%s", c.baseURL, params.Encode()) + + req, err := http.NewRequestWithContext(ctx, "GET", reqURL, nil) + if err != nil { + return nil, err + } + resp, err := c.httpClient.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + var usage UsageResult + if err := json.NewDecoder(resp.Body).Decode(&usage); err != nil { + return nil, err + } + return &usage, nil +} + +func (c *ScrapingBeeClient) doRequest(ctx context.Context, targetURL string, useAI bool, aiQuery, aiSelector string, sessionID int) (string, error) { if err := c.rateLimiter.Acquire(ctx); err != nil { return "", fmt.Errorf("rate limiter: %w", err) } defer c.rateLimiter.Release() - // Build ScrapingBee API URL params := url.Values{} params.Set("api_key", c.apiKey) params.Set("url", targetURL) - params.Set("render_js", "true") + // Kickstarter discover pages are SSR — no headless browser needed (1 credit vs 5) + params.Set("render_js", "false") if useAI && aiQuery != "" { params.Set("ai_query", aiQuery) } + if useAI && aiSelector != "" { + params.Set("ai_selector", aiSelector) + } + if sessionID > 0 { + params.Set("session_id", strconv.Itoa(sessionID)) + } reqURL := fmt.Sprintf("%s?%s", c.baseURL, params.Encode()) - // Retry logic var lastErr error for attempt := 0; attempt < 3; attempt++ { if attempt > 0 { @@ -110,7 +156,6 @@ func (c *ScrapingBeeClient) doRequest(ctx context.Context, targetURL string, use } 
defer resp.Body.Close() - // Check for rate limiting or server errors if resp.StatusCode == 429 { lastErr = fmt.Errorf("rate limited (429)") continue @@ -124,13 +169,11 @@ func (c *ScrapingBeeClient) doRequest(ctx context.Context, targetURL string, use return "", fmt.Errorf("unexpected status %d: %s", resp.StatusCode, string(body)) } - // Read response body, err := io.ReadAll(resp.Body) if err != nil { return "", fmt.Errorf("read response: %w", err) } - // Log success credits := resp.Header.Get("Spb-Cost") log.Printf("ScrapingBee success: url=%s, credits=%s, useAI=%v", targetURL, credits, useAI) From fb967737a9688cc56a0e5fd386f387b2be2aa4de Mon Sep 17 00:00:00 2001 From: Yilin Jing Date: Fri, 27 Feb 2026 20:56:10 +0800 Subject: [PATCH 2/3] feat(#26): add timeout=30000, forward_headers, premium_proxy auto-escalation --- .../internal/service/scrapingbee_client.go | 67 ++++++++++++------- 1 file changed, 44 insertions(+), 23 deletions(-) diff --git a/backend/internal/service/scrapingbee_client.go b/backend/internal/service/scrapingbee_client.go index 611c284..1ffba73 100644 --- a/backend/internal/service/scrapingbee_client.go +++ b/backend/internal/service/scrapingbee_client.go @@ -60,9 +60,10 @@ func (rl *RateLimiter) Release() { func NewScrapingBeeClient(apiKey string, maxConcurrent int) *ScrapingBeeClient { return &ScrapingBeeClient{ - apiKey: apiKey, - baseURL: scrapingBeeBaseURL, - httpClient: &http.Client{Timeout: 60 * time.Second}, + apiKey: apiKey, + baseURL: scrapingBeeBaseURL, + // ScrapingBee timeout param is 30s; add 5s margin for network round-trip. 
+ httpClient: &http.Client{Timeout: 35 * time.Second}, rateLimiter: NewRateLimiter(maxConcurrent, 500*time.Millisecond), } } @@ -114,29 +115,46 @@ func (c *ScrapingBeeClient) doRequest(ctx context.Context, targetURL string, use } defer c.rateLimiter.Release() - params := url.Values{} - params.Set("api_key", c.apiKey) - params.Set("url", targetURL) - // Kickstarter discover pages are SSR — no headless browser needed (1 credit vs 5) - params.Set("render_js", "false") - - if useAI && aiQuery != "" { - params.Set("ai_query", aiQuery) - } - if useAI && aiSelector != "" { - params.Set("ai_selector", aiSelector) - } - if sessionID > 0 { - params.Set("session_id", strconv.Itoa(sessionID)) + // buildParams constructs the query string, optionally upgrading to premium proxy. + buildParams := func(premiumProxy bool) string { + params := url.Values{} + params.Set("api_key", c.apiKey) + params.Set("url", targetURL) + // Kickstarter discover pages are SSR — render_js=false costs 1 credit (vs 5). + params.Set("render_js", "false") + // Fail fast: 30s is more than enough for an SSR page; default is 140s. + params.Set("timeout", "30000") + // Forward Accept-Language so the request looks like real browser traffic. + params.Set("forward_headers", "true") + if premiumProxy { + // Residential premium proxy (10 credits) as fallback when standard blocked. 
+ params.Set("premium_proxy", "true") + } + if useAI && aiQuery != "" { + params.Set("ai_query", aiQuery) + } + if useAI && aiSelector != "" { + params.Set("ai_selector", aiSelector) + } + if sessionID > 0 { + params.Set("session_id", strconv.Itoa(sessionID)) + } + return fmt.Sprintf("%s?%s", c.baseURL, params.Encode()) } - reqURL := fmt.Sprintf("%s?%s", c.baseURL, params.Encode()) - var lastErr error - for attempt := 0; attempt < 3; attempt++ { + premiumProxy := false + + for attempt := 0; attempt < 4; attempt++ { if attempt > 0 { backoff := time.Duration(attempt) * 2 * time.Second - log.Printf("ScrapingBee retry attempt %d after %v", attempt+1, backoff) + // On the 3rd retry, escalate to premium_proxy (residential IP). + if attempt == 3 && !premiumProxy { + premiumProxy = true + log.Printf("ScrapingBee escalating to premium_proxy for %s", targetURL) + } else { + log.Printf("ScrapingBee retry attempt %d after %v", attempt, backoff) + } select { case <-time.After(backoff): case <-ctx.Done(): @@ -144,10 +162,13 @@ func (c *ScrapingBeeClient) doRequest(ctx context.Context, targetURL string, use } } + reqURL := buildParams(premiumProxy) req, err := http.NewRequestWithContext(ctx, "GET", reqURL, nil) if err != nil { return "", fmt.Errorf("create request: %w", err) } + // Forward a realistic Accept-Language header to Kickstarter. 
+ req.Header.Set("Spb-Accept-Language", "en-US,en;q=0.9") resp, err := c.httpClient.Do(req) if err != nil { @@ -175,10 +196,10 @@ func (c *ScrapingBeeClient) doRequest(ctx context.Context, targetURL string, use } credits := resp.Header.Get("Spb-Cost") - log.Printf("ScrapingBee success: url=%s, credits=%s, useAI=%v", targetURL, credits, useAI) + log.Printf("ScrapingBee success: url=%s, credits=%s, useAI=%v, premium=%v", targetURL, credits, useAI, premiumProxy) return string(body), nil } - return "", fmt.Errorf("failed after 3 attempts: %w", lastErr) + return "", fmt.Errorf("failed after 4 attempts: %w", lastErr) } From 4435228503bef0a6435ae8ab52844409c8321eff Mon Sep 17 00:00:00 2001 From: Yilin Jing Date: Fri, 27 Feb 2026 20:59:32 +0800 Subject: [PATCH 3/3] test(#26): add integration tests for ScrapingBee client; fix Search() to use HTML parsing (remove broken AI path) --- .../internal/service/kickstarter_scraping.go | 38 +----- .../scrapingbee_client_integration_test.go | 123 ++++++++++++++++++ 2 files changed, 127 insertions(+), 34 deletions(-) create mode 100644 backend/internal/service/scrapingbee_client_integration_test.go diff --git a/backend/internal/service/kickstarter_scraping.go b/backend/internal/service/kickstarter_scraping.go index 8dedb3d..4f3fae7 100644 --- a/backend/internal/service/kickstarter_scraping.go +++ b/backend/internal/service/kickstarter_scraping.go @@ -25,7 +25,9 @@ func NewKickstarterScrapingService(apiKey string, maxConcurrent int) *Kickstarte } } -// Search searches for campaigns using AI extraction (6 credits per request). +// Search searches for campaigns using HTML parsing (1 credit per request). +// Note: AI extraction was removed — Kickstarter embeds project data in [data-project] +// HTML attributes, not text nodes, so ScrapingBee AI returns EMPTY_RESPONSE for that selector. 
func (s *KickstarterScrapingService) Search(term, categoryID, sort, cursor string, first int) (*SearchResult, error) { ctx := context.Background() @@ -37,39 +39,8 @@ func (s *KickstarterScrapingService) Search(term, categoryID, sort, cursor strin } } - // Build Kickstarter discover URL with page discoverURL := s.buildDiscoverURL(term, categoryID, sort, page) - // Try AI extraction first; ai_selector focuses only on project cards, reducing processing time. - aiQuery := "Extract all projects from this page. For each project return a JSON object with these fields: name, slug, creator_slug (the creator's URL slug, e.g. 'john-doe' from kickstarter.com/projects/john-doe/...), project_url (full canonical Kickstarter URL), goal, pledged, currency, deadline, creator, category, photo_url, blurb." - aiSelector := "[data-project]" - - aiResult, err := s.client.ExtractWithAI(ctx, discoverURL, aiQuery, aiSelector) - if err == nil { - campaigns, parseErr := s.parseAIResponse(aiResult) - if parseErr == nil && len(campaigns) > 0 { - log.Printf("AI extraction successful: found %d campaigns (page %d)", len(campaigns), page) - - // Generate next cursor if we got a full page - nextCursor := "" - hasNextPage := len(campaigns) >= first - if hasNextPage { - nextCursor = fmt.Sprintf("page:%d", page+1) - } - - return &SearchResult{ - Campaigns: campaigns, - TotalCount: len(campaigns), - NextCursor: nextCursor, - HasNextPage: hasNextPage, - }, nil - } - log.Printf("AI extraction parse failed: %v, falling back to HTML", parseErr) - } else { - log.Printf("AI extraction failed: %v, falling back to HTML", err) - } - - // Fallback to HTML parsing html, err := s.client.FetchHTMLInSession(ctx, discoverURL, 0) if err != nil { return nil, fmt.Errorf("fetch HTML: %w", err) @@ -80,9 +51,8 @@ func (s *KickstarterScrapingService) Search(term, categoryID, sort, cursor strin return nil, fmt.Errorf("parse HTML: %w", err) } - log.Printf("HTML parsing successful: found %d campaigns (page %d)", len(campaigns), 
page) + log.Printf("Search: found %d campaigns for term=%q page=%d", len(campaigns), term, page) - // Generate next cursor if we got a full page nextCursor := "" hasNextPage := len(campaigns) >= first if hasNextPage { diff --git a/backend/internal/service/scrapingbee_client_integration_test.go b/backend/internal/service/scrapingbee_client_integration_test.go new file mode 100644 index 0000000..29d9d63 --- /dev/null +++ b/backend/internal/service/scrapingbee_client_integration_test.go @@ -0,0 +1,123 @@ +//go:build integration + +package service + +import ( + "context" + "os" + "strings" + "testing" +) + +// Run with (replace your-api-key with a real ScrapingBee key; an empty value fails testClient): +// SCRAPINGBEE_API_KEY=your-api-key go test -v -tags integration -run TestScrapingBee ./internal/service/ +// SCRAPINGBEE_API_KEY=your-api-key go test -v -tags integration -timeout 120s ./internal/service/ + +func testClient(t *testing.T) *ScrapingBeeClient { + t.Helper() + key := os.Getenv("SCRAPINGBEE_API_KEY") + if key == "" { + t.Fatal("SCRAPINGBEE_API_KEY env var not set") + } + return NewScrapingBeeClient(key, 2) +} + +func TestScrapingBee_FetchUsage(t *testing.T) { + client := testClient(t) + usage, err := client.FetchUsage(context.Background()) + if err != nil { + t.Fatalf("FetchUsage error: %v", err) + } + t.Logf("Usage: %d/%d credits (%.1f%%), renews %s", + usage.UsedCredits, usage.MaxCredits, + float64(usage.UsedCredits)/float64(usage.MaxCredits)*100, + usage.RenewalDate, + ) + if usage.MaxCredits <= 0 { + t.Errorf("expected positive MaxCredits, got %d", usage.MaxCredits) + } +} + +// TestScrapingBee_FetchHTMLInSession costs 1 credit. 
+func TestScrapingBee_FetchHTMLInSession(t *testing.T) { + client := testClient(t) + targetURL := "https://www.kickstarter.com/discover/advanced?category_id=16&sort=magic" + html, err := client.FetchHTMLInSession(context.Background(), targetURL, 42) + if err != nil { + t.Fatalf("FetchHTMLInSession error: %v", err) + } + if len(html) < 1000 { + t.Errorf("HTML response suspiciously short: %d bytes", len(html)) + } + if !strings.Contains(html, "kickstarter") { + t.Errorf("response does not look like a Kickstarter page") + } + t.Logf("FetchHTMLInSession: got %d bytes", len(html)) +} + +// TestScrapingBee_DiscoverCampaigns costs 1 credit and validates HTML parsing. +func TestScrapingBee_DiscoverCampaigns(t *testing.T) { + key := os.Getenv("SCRAPINGBEE_API_KEY") + if key == "" { + t.Fatal("SCRAPINGBEE_API_KEY env var not set") + } + svc := NewKickstarterScrapingService(key, 2) + // category_id=16 = Technology; sort=magic; page=1 + campaigns, err := svc.DiscoverCampaigns("16", "magic", 1, 42) + if err != nil { + t.Fatalf("DiscoverCampaigns error: %v", err) + } + if len(campaigns) == 0 { + t.Fatal("expected at least one campaign, got 0") + } + t.Logf("DiscoverCampaigns: got %d campaigns", len(campaigns)) + for i, c := range campaigns { + if c.PID == "" { + t.Errorf("campaign[%d] has empty PID", i) + } + if c.ProjectURL == "" { + t.Errorf("campaign[%d] has empty ProjectURL", i) + } + t.Logf(" [%d] %s — %s", i, c.PID, c.Name) + } +} + +// TestScrapingBee_Search costs 1 credit and validates the Search() method. 
+func TestScrapingBee_Search(t *testing.T) { + key := os.Getenv("SCRAPINGBEE_API_KEY") + if key == "" { + t.Fatal("SCRAPINGBEE_API_KEY env var not set") + } + svc := NewKickstarterScrapingService(key, 2) + result, err := svc.Search("keyboard", "16", "magic", "", 12) + if err != nil { + t.Fatalf("Search error: %v", err) + } + if len(result.Campaigns) == 0 { + t.Fatal("expected at least one campaign, got 0") + } + t.Logf("Search: got %d campaigns, hasNextPage=%v", len(result.Campaigns), result.HasNextPage) + for i, c := range result.Campaigns { + t.Logf(" [%d] %s — %s", i, c.PID, c.Name) + } +} + +// TestScrapingBee_ExtractWithAI_RawAPI calls the raw AI-extraction API with the +// "h2 a" selector and logs the response. [data-project] is deliberately not used: +// project data is stored in HTML attributes (not text nodes), so AI extraction +// returns EMPTY_RESPONSE for it. Informational only; it checks the call succeeds. +func TestScrapingBee_ExtractWithAI_RawAPI(t *testing.T) { + client := testClient(t) + targetURL := "https://www.kickstarter.com/discover/advanced?category_id=16&sort=magic" + // Note: [data-project] holds JSON in an attribute, not text — AI sees no text to extract. + result, err := client.ExtractWithAI(context.Background(), targetURL, + "Extract all project names visible on the page.", "h2 a") + if err != nil { + t.Fatalf("ExtractWithAI error: %v", err) + } + preview := result + if len(preview) > 300 { + preview = preview[:300] + } + t.Logf("ExtractWithAI (h2 a selector) result (%d bytes): %s", len(result), preview) +}