Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 29 additions & 4 deletions backend/internal/service/cron.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
package service

import (
"context"
"fmt"
"log"
"math/rand"
"os"
"strconv"
"time"
Expand Down Expand Up @@ -48,6 +50,27 @@ func (s *CronService) Start() {
})
s.scheduler.Start()
log.Println("Cron scheduler started (02:00 UTC daily)")

// Log credit balance at startup so we know headroom before the first crawl.
go func() {
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
defer cancel()
usage, err := s.scrapingService.client.FetchUsage(ctx)
if err != nil {
log.Printf("ScrapingBee usage check failed: %v", err)
return
}
pct := 0.0
if usage.MaxCredits > 0 {
pct = float64(usage.UsedCredits) / float64(usage.MaxCredits) * 100
}
log.Printf("ScrapingBee usage: %d/%d credits (%.1f%%), renews %s",
usage.UsedCredits, usage.MaxCredits, pct, usage.RenewalDate)
if pct >= 80 {
log.Printf("WARNING: ScrapingBee credits above 80%% (%d/%d) — consider upgrading plan",
usage.UsedCredits, usage.MaxCredits)
}
}()
}

func (s *CronService) Stop() {
Expand Down Expand Up @@ -97,11 +120,12 @@ func (s *CronService) RunCrawlNow() error {

for _, sortCfg := range buildCrawlSorts() {
for _, cat := range crawlCategories {
// sortCfg.pageDepth is env-configurable (can raise or lower).
// cat.PageDepth is only the default used when no env var is set.
depth := sortCfg.pageDepth
// Assign a sticky session_id so all pages for this (sort, category)
// pass through the same proxy IP — less likely to trigger rate limits.
sessionID := rand.Intn(10_000_000) + 1
for page := 1; page <= depth; page++ {
campaigns, err := s.scrapingService.DiscoverCampaigns(cat.ID, sortCfg.sort, page)
campaigns, err := s.scrapingService.DiscoverCampaigns(cat.ID, sortCfg.sort, page, sessionID)
if err != nil {
log.Printf("Cron: ScrapingBee error sort=%s cat=%s page=%d: %v", sortCfg.sort, cat.ID, page, err)
break
Expand Down Expand Up @@ -172,8 +196,9 @@ func (s *CronService) RunBackfill() error {
for _, sortCfg := range sorts {
for _, cat := range crawlCategories {
depth := sortCfg.depth
sessionID := rand.Intn(10_000_000) + 1
for page := 1; page <= depth; page++ {
campaigns, err := s.scrapingService.DiscoverCampaigns(cat.ID, sortCfg.sort, page)
campaigns, err := s.scrapingService.DiscoverCampaigns(cat.ID, sortCfg.sort, page, sessionID)
if err != nil {
log.Printf("Backfill: error sort=%s cat=%s page=%d: %v", sortCfg.sort, cat.ID, page, err)
break
Expand Down
70 changes: 31 additions & 39 deletions backend/internal/service/kickstarter_scraping.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ func NewKickstarterScrapingService(apiKey string, maxConcurrent int) *Kickstarte
}
}

// Search searches for campaigns using AI extraction (10 credits per request)
// Search searches for campaigns using HTML parsing (1 credit per request).
// Note: AI extraction was removed — Kickstarter embeds project data in [data-project]
// HTML attributes, not text nodes, so ScrapingBee AI returns EMPTY_RESPONSE for that selector.
func (s *KickstarterScrapingService) Search(term, categoryID, sort, cursor string, first int) (*SearchResult, error) {
ctx := context.Background()

Expand All @@ -37,39 +39,9 @@ func (s *KickstarterScrapingService) Search(term, categoryID, sort, cursor strin
}
}

// Build Kickstarter discover URL with page
discoverURL := s.buildDiscoverURL(term, categoryID, sort, page)

// Try AI extraction first
aiQuery := "Extract all projects from this page. For each project return a JSON object with these fields: name, slug, creator_slug (the creator's URL slug, e.g. 'john-doe' from kickstarter.com/projects/john-doe/...), project_url (full canonical Kickstarter URL), goal, pledged, currency, deadline, creator, category, photo_url, blurb."

aiResult, err := s.client.ExtractWithAI(ctx, discoverURL, aiQuery)
if err == nil {
campaigns, parseErr := s.parseAIResponse(aiResult)
if parseErr == nil && len(campaigns) > 0 {
log.Printf("AI extraction successful: found %d campaigns (page %d)", len(campaigns), page)

// Generate next cursor if we got a full page
nextCursor := ""
hasNextPage := len(campaigns) >= first
if hasNextPage {
nextCursor = fmt.Sprintf("page:%d", page+1)
}

return &SearchResult{
Campaigns: campaigns,
TotalCount: len(campaigns),
NextCursor: nextCursor,
HasNextPage: hasNextPage,
}, nil
}
log.Printf("AI extraction parse failed: %v, falling back to HTML", parseErr)
} else {
log.Printf("AI extraction failed: %v, falling back to HTML", err)
}

// Fallback to HTML parsing
html, err := s.client.FetchHTML(ctx, discoverURL)
html, err := s.client.FetchHTMLInSession(ctx, discoverURL, 0)
if err != nil {
return nil, fmt.Errorf("fetch HTML: %w", err)
}
Expand All @@ -79,9 +51,8 @@ func (s *KickstarterScrapingService) Search(term, categoryID, sort, cursor strin
return nil, fmt.Errorf("parse HTML: %w", err)
}

log.Printf("HTML parsing successful: found %d campaigns (page %d)", len(campaigns), page)
log.Printf("Search: found %d campaigns for term=%q page=%d", len(campaigns), term, page)

// Generate next cursor if we got a full page
nextCursor := ""
hasNextPage := len(campaigns) >= first
if hasNextPage {
Expand All @@ -96,15 +67,14 @@ func (s *KickstarterScrapingService) Search(term, categoryID, sort, cursor strin
}, nil
}

// DiscoverCampaigns fetches campaigns for a specific category using HTML parsing (5 credits)
func (s *KickstarterScrapingService) DiscoverCampaigns(categoryID string, sort string, page int) ([]model.Campaign, error) {
// DiscoverCampaigns fetches campaigns for a specific category using HTML parsing (1 credit).
// sessionID routes all pages for the same category through the same proxy IP (sticky session).
func (s *KickstarterScrapingService) DiscoverCampaigns(categoryID string, sort string, page int, sessionID int) ([]model.Campaign, error) {
ctx := context.Background()

// Build URL
discoverURL := s.buildDiscoverURL("", categoryID, sort, page)

// Fetch HTML only (cheaper than AI extraction)
html, err := s.client.FetchHTML(ctx, discoverURL)
html, err := s.client.FetchHTMLInSession(ctx, discoverURL, sessionID)
if err != nil {
return nil, fmt.Errorf("fetch HTML: %w", err)
}
Expand All @@ -124,6 +94,28 @@ func (s *KickstarterScrapingService) FetchCategories() ([]model.Category, error)
return kickstarterCategories, nil
}

// LogUsage fetches and logs the current monthly credit consumption.
// Logs a WARNING if usage exceeds 80% of the monthly allowance.
func (s *KickstarterScrapingService) LogUsage() {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
usage, err := s.client.FetchUsage(ctx)
if err != nil {
log.Printf("ScrapingBee usage check failed: %v", err)
return
}
pct := 0.0
if usage.MaxCredits > 0 {
pct = float64(usage.UsedCredits) / float64(usage.MaxCredits) * 100
}
log.Printf("ScrapingBee usage: %d/%d credits (%.1f%%), renews %s",
usage.UsedCredits, usage.MaxCredits, pct, usage.RenewalDate)
if pct >= 80 {
log.Printf("WARNING: ScrapingBee credits above 80%% (%d/%d) — consider upgrading plan",
usage.UsedCredits, usage.MaxCredits)
}
}

func (s *KickstarterScrapingService) buildDiscoverURL(term, categoryID, sort string, page int) string {
baseURL := "https://www.kickstarter.com/discover/advanced"

Expand Down
120 changes: 92 additions & 28 deletions backend/internal/service/scrapingbee_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,27 @@ package service

import (
"context"
"encoding/json"
"fmt"
"io"
"log"
"net/http"
"net/url"
"strconv"
"time"
)

const scrapingBeeBaseURL = "https://app.scrapingbee.com/api/v1"

// UsageResult holds the response from /api/v1/usage.
type UsageResult struct {
MaxCredits int `json:"max_api_credit"`
UsedCredits int `json:"used_api_credit"`
MaxConcurrency int `json:"max_concurrency"`
CurrentConcurrency int `json:"current_concurrency"`
RenewalDate string `json:"renewal_subscription_date"`
}

type ScrapingBeeClient struct {
apiKey string
baseURL string
Expand Down Expand Up @@ -49,59 +60,115 @@ func (rl *RateLimiter) Release() {

func NewScrapingBeeClient(apiKey string, maxConcurrent int) *ScrapingBeeClient {
return &ScrapingBeeClient{
apiKey: apiKey,
baseURL: scrapingBeeBaseURL,
httpClient: &http.Client{Timeout: 60 * time.Second},
apiKey: apiKey,
baseURL: scrapingBeeBaseURL,
// ScrapingBee timeout param is 30s; add 5s margin for network round-trip.
httpClient: &http.Client{Timeout: 35 * time.Second},
rateLimiter: NewRateLimiter(maxConcurrent, 500*time.Millisecond),
}
}

// FetchHTML fetches raw HTML from a URL using ScrapingBee (5 credits)
// FetchHTML fetches raw HTML without JS rendering (1 credit).
// Kickstarter's discover pages are server-side rendered so JS is not needed.
func (c *ScrapingBeeClient) FetchHTML(ctx context.Context, targetURL string) (string, error) {
return c.doRequest(ctx, targetURL, false, "")
return c.doRequest(ctx, targetURL, false, "", "", 0)
}

// ExtractWithAI fetches and extracts data using AI (10 credits)
func (c *ScrapingBeeClient) ExtractWithAI(ctx context.Context, targetURL string, query string) (string, error) {
return c.doRequest(ctx, targetURL, true, query)
// FetchHTMLInSession fetches raw HTML using a sticky session_id so all requests
// for the same crawl pass share the same proxy IP (1 credit).
func (c *ScrapingBeeClient) FetchHTMLInSession(ctx context.Context, targetURL string, sessionID int) (string, error) {
return c.doRequest(ctx, targetURL, false, "", "", sessionID)
}

func (c *ScrapingBeeClient) doRequest(ctx context.Context, targetURL string, useAI bool, aiQuery string) (string, error) {
// Rate limiting
if err := c.rateLimiter.Acquire(ctx); err != nil {
return "", fmt.Errorf("rate limiter: %w", err)
}
defer c.rateLimiter.Release()
// ExtractWithAI fetches and extracts data using AI (6 credits: 1 base + 5 AI).
// aiSelector narrows the AI's focus to a CSS selector, speeding up extraction.
func (c *ScrapingBeeClient) ExtractWithAI(ctx context.Context, targetURL, query, aiSelector string) (string, error) {
return c.doRequest(ctx, targetURL, true, query, aiSelector, 0)
}

// Build ScrapingBee API URL
// FetchUsage returns the current monthly credit consumption (not rate-limited).
func (c *ScrapingBeeClient) FetchUsage(ctx context.Context) (*UsageResult, error) {
params := url.Values{}
params.Set("api_key", c.apiKey)
params.Set("url", targetURL)
params.Set("render_js", "true")
reqURL := fmt.Sprintf("%s/usage?%s", c.baseURL, params.Encode())

if useAI && aiQuery != "" {
params.Set("ai_query", aiQuery)
req, err := http.NewRequestWithContext(ctx, "GET", reqURL, nil)
if err != nil {
return nil, err
}
resp, err := c.httpClient.Do(req)
if err != nil {
return nil, err
}
defer resp.Body.Close()

reqURL := fmt.Sprintf("%s?%s", c.baseURL, params.Encode())
var usage UsageResult
if err := json.NewDecoder(resp.Body).Decode(&usage); err != nil {
return nil, err
}
return &usage, nil
}

func (c *ScrapingBeeClient) doRequest(ctx context.Context, targetURL string, useAI bool, aiQuery, aiSelector string, sessionID int) (string, error) {
if err := c.rateLimiter.Acquire(ctx); err != nil {
return "", fmt.Errorf("rate limiter: %w", err)
}
defer c.rateLimiter.Release()

// buildParams constructs the query string, optionally upgrading to premium proxy.
buildParams := func(premiumProxy bool) string {
params := url.Values{}
params.Set("api_key", c.apiKey)
params.Set("url", targetURL)
// Kickstarter discover pages are SSR — render_js=false costs 1 credit (vs 5).
params.Set("render_js", "false")
// Fail fast: 30s is more than enough for an SSR page; default is 140s.
params.Set("timeout", "30000")
// Forward Accept-Language so the request looks like real browser traffic.
params.Set("forward_headers", "true")
if premiumProxy {
// Residential premium proxy (10 credits) as fallback when standard blocked.
params.Set("premium_proxy", "true")
}
if useAI && aiQuery != "" {
params.Set("ai_query", aiQuery)
}
if useAI && aiSelector != "" {
params.Set("ai_selector", aiSelector)
}
if sessionID > 0 {
params.Set("session_id", strconv.Itoa(sessionID))
}
return fmt.Sprintf("%s?%s", c.baseURL, params.Encode())
}

// Retry logic
var lastErr error
for attempt := 0; attempt < 3; attempt++ {
premiumProxy := false

for attempt := 0; attempt < 4; attempt++ {
if attempt > 0 {
backoff := time.Duration(attempt) * 2 * time.Second
log.Printf("ScrapingBee retry attempt %d after %v", attempt+1, backoff)
// On the 3rd retry, escalate to premium_proxy (residential IP).
if attempt == 3 && !premiumProxy {
premiumProxy = true
log.Printf("ScrapingBee escalating to premium_proxy for %s", targetURL)
} else {
log.Printf("ScrapingBee retry attempt %d after %v", attempt, backoff)
}
select {
case <-time.After(backoff):
case <-ctx.Done():
return "", ctx.Err()
}
}

reqURL := buildParams(premiumProxy)
req, err := http.NewRequestWithContext(ctx, "GET", reqURL, nil)
if err != nil {
return "", fmt.Errorf("create request: %w", err)
}
// Forward a realistic Accept-Language header to Kickstarter.
req.Header.Set("Spb-Accept-Language", "en-US,en;q=0.9")

resp, err := c.httpClient.Do(req)
if err != nil {
Expand All @@ -110,7 +177,6 @@ func (c *ScrapingBeeClient) doRequest(ctx context.Context, targetURL string, use
}
defer resp.Body.Close()

// Check for rate limiting or server errors
if resp.StatusCode == 429 {
lastErr = fmt.Errorf("rate limited (429)")
continue
Expand All @@ -124,18 +190,16 @@ func (c *ScrapingBeeClient) doRequest(ctx context.Context, targetURL string, use
return "", fmt.Errorf("unexpected status %d: %s", resp.StatusCode, string(body))
}

// Read response
body, err := io.ReadAll(resp.Body)
if err != nil {
return "", fmt.Errorf("read response: %w", err)
}

// Log success
credits := resp.Header.Get("Spb-Cost")
log.Printf("ScrapingBee success: url=%s, credits=%s, useAI=%v", targetURL, credits, useAI)
log.Printf("ScrapingBee success: url=%s, credits=%s, useAI=%v, premium=%v", targetURL, credits, useAI, premiumProxy)

return string(body), nil
}

return "", fmt.Errorf("failed after 3 attempts: %w", lastErr)
return "", fmt.Errorf("failed after 4 attempts: %w", lastErr)
}
Loading