diff --git a/backend/internal/service/kickstarter_scraping.go b/backend/internal/service/kickstarter_scraping.go index 4f3fae7..6ff47c1 100644 --- a/backend/internal/service/kickstarter_scraping.go +++ b/backend/internal/service/kickstarter_scraping.go @@ -84,6 +84,9 @@ func (s *KickstarterScrapingService) DiscoverCampaigns(categoryID string, sort s return nil, fmt.Errorf("parse HTML: %w", err) } + if len(campaigns) == 0 && len(html) > 50_000 { + log.Printf("DiscoverCampaigns: 0 campaigns from %d-byte page (cat=%s page=%d) — possible HTML structure change", len(html), categoryID, page) + } log.Printf("Discovered %d campaigns for category %s (page %d)", len(campaigns), categoryID, page) return campaigns, nil diff --git a/backend/internal/service/scrapingbee_client.go b/backend/internal/service/scrapingbee_client.go index 1ffba73..6200dbc 100644 --- a/backend/internal/service/scrapingbee_client.go +++ b/backend/internal/service/scrapingbee_client.go @@ -9,6 +9,7 @@ import ( "net/http" "net/url" "strconv" + "strings" "time" ) @@ -109,6 +110,15 @@ func (c *ScrapingBeeClient) FetchUsage(ctx context.Context) (*UsageResult, error return &usage, nil } +// isCFChallengePage returns true when the response is a Cloudflare interstitial +// challenge rather than real page content. CF challenge pages are small (<200KB) +// and always contain the challenge-platform main.js script. Real Kickstarter +// discover pages are 500KB+ and do not require client-side JS challenge resolution. 
// isCFChallengePage reports whether html is a Cloudflare interstitial
// challenge rather than real page content.
//
// A page is treated as a challenge only when both conditions hold:
//   - it is smaller than 200,000 bytes, and
//   - it contains the challenge-platform main.js script path.
//
// The size condition matters because a large page may carry the same script
// and still be genuine content; such a page is not reported as a challenge.
func isCFChallengePage(html string) bool {
	const (
		// cfMarker is the script path looked for in the response body.
		cfMarker = "challenge-platform/scripts/jsd/main.js"
		// maxChallengeSize is the exclusive upper bound, in bytes, on the
		// size of a page that can be classified as a challenge.
		maxChallengeSize = 200_000
	)

	// Test the size first: it is a constant-time check, and it lets large
	// pages be rejected without scanning the whole body for the marker.
	if len(html) >= maxChallengeSize {
		return false
	}
	return strings.Contains(html, cfMarker)
}
(small + marker)", + html: strings.Repeat("x", 10_000) + cfMarker, + expected: true, + }, + { + name: "real page with CF bot-management script injected (large)", + html: strings.Repeat("x", 250_000) + cfMarker, + expected: false, + }, + { + name: "normal page without CF marker", + html: strings.Repeat("x", 800_000), + expected: false, + }, + { + name: "empty page", + html: "", + expected: false, + }, + { + name: "small page without CF marker", + html: "Not found", + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := isCFChallengePage(tt.html) + if got != tt.expected { + t.Errorf("isCFChallengePage() = %v, want %v (html size=%d)", got, tt.expected, len(tt.html)) + } + }) + } +}