Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions backend/internal/service/kickstarter_scraping.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,9 @@ func (s *KickstarterScrapingService) DiscoverCampaigns(categoryID string, sort s
return nil, fmt.Errorf("parse HTML: %w", err)
}

if len(campaigns) == 0 && len(html) > 50_000 {
log.Printf("DiscoverCampaigns: 0 campaigns from %d-byte page (cat=%s page=%d) — possible HTML structure change", len(html), categoryID, page)
}
log.Printf("Discovered %d campaigns for category %s (page %d)", len(campaigns), categoryID, page)

return campaigns, nil
Expand Down
24 changes: 22 additions & 2 deletions backend/internal/service/scrapingbee_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"net/http"
"net/url"
"strconv"
"strings"
"time"
)

Expand Down Expand Up @@ -109,6 +110,15 @@ func (c *ScrapingBeeClient) FetchUsage(ctx context.Context) (*UsageResult, error
return &usage, nil
}

// isCFChallengePage reports whether html looks like a Cloudflare interstitial
// challenge instead of genuine page content.
//
// Both of the following must hold for a positive result:
//   - the body is shorter than 200,000 bytes, and
//   - the body references Cloudflare's challenge-platform main.js script.
//
// The size cap keeps large responses that merely embed the script from being
// classified as challenges: real Kickstarter discover pages are 500KB+ and do
// not need a client-side JS challenge to be solved.
func isCFChallengePage(html string) bool {
	const (
		challengeScript = "challenge-platform/scripts/jsd/main.js"
		maxChallengeLen = 200_000
	)

	// Anything at or above the cap is treated as real content, marker or not.
	if len(html) >= maxChallengeLen {
		return false
	}
	return strings.Contains(html, challengeScript)
}

func (c *ScrapingBeeClient) doRequest(ctx context.Context, targetURL string, useAI bool, aiQuery, aiSelector string, sessionID int) (string, error) {
if err := c.rateLimiter.Acquire(ctx); err != nil {
return "", fmt.Errorf("rate limiter: %w", err)
Expand Down Expand Up @@ -195,10 +205,20 @@ func (c *ScrapingBeeClient) doRequest(ctx context.Context, targetURL string, use
return "", fmt.Errorf("read response: %w", err)
}

// Detect Cloudflare JS challenge pages (HTTP 200 but no real content).
// CF challenge pages are small and contain the challenge-platform script.
// Real Kickstarter pages are 500KB+ and contain project data.
bodyStr := string(body)
if isCFChallengePage(bodyStr) {
lastErr = fmt.Errorf("cloudflare challenge page detected (size=%d)", len(body))
log.Printf("ScrapingBee: CF challenge at %s (size=%d, premium=%v), retrying", targetURL, len(body), premiumProxy)
continue
}

credits := resp.Header.Get("Spb-Cost")
log.Printf("ScrapingBee success: url=%s, credits=%s, useAI=%v, premium=%v", targetURL, credits, useAI, premiumProxy)
log.Printf("ScrapingBee success: url=%s, credits=%s, size=%d, useAI=%v, premium=%v", targetURL, credits, len(body), useAI, premiumProxy)

return string(body), nil
return bodyStr, nil
}

return "", fmt.Errorf("failed after 4 attempts: %w", lastErr)
Expand Down
51 changes: 51 additions & 0 deletions backend/internal/service/scrapingbee_client_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package service

import (
"strings"
"testing"
)

// TestIsCFChallengePage checks the Cloudflare challenge heuristic against a
// table of synthetic pages. A page counts as a challenge only when it contains
// the challenge-platform script marker AND is shorter than 200,000 bytes, so the
// table covers each combination of marker presence and size, plus both sides of
// the size threshold.
func TestIsCFChallengePage(t *testing.T) {
	const (
		cfMarker = "challenge-platform/scripts/jsd/main.js"
		// sizeLimit mirrors the exclusive size cap used by isCFChallengePage.
		sizeLimit = 200_000
	)

	// withMarker builds a page of exactly total bytes that ends with the marker,
	// which lets the boundary cases hit the threshold precisely.
	withMarker := func(total int) string {
		return strings.Repeat("x", total-len(cfMarker)) + cfMarker
	}

	tests := []struct {
		name     string
		html     string
		expected bool
	}{
		{
			name:     "pure CF challenge page (small + marker)",
			html:     strings.Repeat("x", 10_000) + cfMarker,
			expected: true,
		},
		{
			name:     "real page with CF bot-management script injected (large)",
			html:     strings.Repeat("x", 250_000) + cfMarker,
			expected: false,
		},
		{
			name:     "normal page without CF marker",
			html:     strings.Repeat("x", 800_000),
			expected: false,
		},
		{
			name:     "empty page",
			html:     "",
			expected: false,
		},
		{
			name:     "small page without CF marker",
			html:     "<html><body>Not found</body></html>",
			expected: false,
		},
		{
			// Largest size that still counts as a challenge page.
			name:     "marker page one byte under the size threshold",
			html:     withMarker(sizeLimit - 1),
			expected: true,
		},
		{
			// The cap is exclusive: exactly sizeLimit bytes is real content.
			name:     "marker page exactly at the size threshold",
			html:     withMarker(sizeLimit),
			expected: false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got := isCFChallengePage(tt.html)
			if got != tt.expected {
				t.Errorf("isCFChallengePage() = %v, want %v (html size=%d)", got, tt.expected, len(tt.html))
			}
		})
	}
}