diff --git a/.github/workflows/deploy-backend.yml b/.github/workflows/deploy-backend.yml index 41bfc32..cab2a4b 100644 --- a/.github/workflows/deploy-backend.yml +++ b/.github/workflows/deploy-backend.yml @@ -112,7 +112,7 @@ jobs: echo "apns_team_id_arn=$(get_arn ${SECRET_PREFIX}/apns-team-id)" >> $GITHUB_OUTPUT echo "apns_bundle_id_arn=$(get_arn ${SECRET_PREFIX}/apns-bundle-id)" >> $GITHUB_OUTPUT echo "apns_key_arn=$(get_arn ${SECRET_PREFIX}/apns-key)" >> $GITHUB_OUTPUT - echo "webshare_proxy_url_arn=$(get_arn ${SECRET_PREFIX}/webshare-proxy-url)" >> $GITHUB_OUTPUT + echo "scrapingbee_api_key_arn=$(get_arn ${SECRET_PREFIX}/scrapingbee-api-key)" >> $GITHUB_OUTPUT - name: Generate ECS task definition env: @@ -142,12 +142,12 @@ jobs: { "name": "APNS_ENV", "value": "${{ env.IS_PROD == 'true' && 'production' || 'sandbox' }}" } ], "secrets": [ - { "name": "DATABASE_URL", "valueFrom": "${{ steps.secrets.outputs.db_arn }}" }, - { "name": "APNS_KEY_ID", "valueFrom": "${{ steps.secrets.outputs.apns_key_id_arn }}" }, - { "name": "APNS_TEAM_ID", "valueFrom": "${{ steps.secrets.outputs.apns_team_id_arn }}" }, - { "name": "APNS_BUNDLE_ID", "valueFrom": "${{ steps.secrets.outputs.apns_bundle_id_arn }}" }, - { "name": "APNS_KEY", "valueFrom": "${{ steps.secrets.outputs.apns_key_arn }}" }, - { "name": "WEBSHARE_PROXY_URL", "valueFrom": "${{ steps.secrets.outputs.webshare_proxy_url_arn }}" } + { "name": "DATABASE_URL", "valueFrom": "${{ steps.secrets.outputs.db_arn }}" }, + { "name": "APNS_KEY_ID", "valueFrom": "${{ steps.secrets.outputs.apns_key_id_arn }}" }, + { "name": "APNS_TEAM_ID", "valueFrom": "${{ steps.secrets.outputs.apns_team_id_arn }}" }, + { "name": "APNS_BUNDLE_ID", "valueFrom": "${{ steps.secrets.outputs.apns_bundle_id_arn }}" }, + { "name": "APNS_KEY", "valueFrom": "${{ steps.secrets.outputs.apns_key_arn }}" }, + { "name": "SCRAPINGBEE_API_KEY", "valueFrom": "${{ steps.secrets.outputs.scrapingbee_api_key_arn }}" } ], "readonlyRootFilesystem": true, 
"linuxParameters": { "initProcessEnabled": true }, diff --git a/backend/.env.example b/backend/.env.example index 1671577..2ac015f 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -11,3 +11,7 @@ APNS_TEAM_ID=YOUR_TEAM_ID APNS_BUNDLE_ID=com.yourname.kickwatch APNS_KEY_PATH=/secrets/apns.p8 APNS_ENV=sandbox + +# ScrapingBee Configuration +SCRAPINGBEE_API_KEY=your_api_key_here +SCRAPINGBEE_MAX_CONCURRENT=10 diff --git a/backend/cmd/api/main.go b/backend/cmd/api/main.go index 8690774..07a64db 100644 --- a/backend/cmd/api/main.go +++ b/backend/cmd/api/main.go @@ -18,6 +18,11 @@ func main() { cfg := config.Load() + // Validate required ScrapingBee API key + if cfg.ScrapingBeeAPIKey == "" { + log.Fatalf("SCRAPINGBEE_API_KEY is required but not set in environment") + } + if cfg.DatabaseURL != "" { if err := db.Init(cfg); err != nil { log.Fatalf("DB init: %v", err) @@ -26,8 +31,12 @@ func main() { log.Println("DATABASE_URL not set, running without database") } - graphClient := service.NewKickstarterGraphClient(cfg.ProxyURL) - restClient := service.NewKickstarterRESTClient(cfg.ProxyURL) + // Initialize ScrapingBee service + scrapingService := service.NewKickstarterScrapingService( + cfg.ScrapingBeeAPIKey, + cfg.ScrapingBeeMaxConcurrent, + ) + log.Printf("ScrapingBee service initialized (max concurrent: %d)", cfg.ScrapingBeeMaxConcurrent) var cronSvc *service.CronService if db.IsEnabled() { @@ -39,7 +48,7 @@ func main() { log.Printf("APNs init failed (push disabled): %v", err) } } - cronSvc = service.NewCronService(db.DB, restClient, apnsClient) + cronSvc = service.NewCronService(db.DB, scrapingService, apnsClient) cronSvc.Start() defer cronSvc.Stop() @@ -60,10 +69,10 @@ func main() { { api.GET("/health", handler.Health) - api.GET("/campaigns", handler.ListCampaigns(graphClient)) - api.GET("/campaigns/search", handler.SearchCampaigns(graphClient)) + api.GET("/campaigns", handler.ListCampaigns(scrapingService)) + api.GET("/campaigns/search", 
handler.SearchCampaigns(scrapingService)) api.GET("/campaigns/:pid", handler.GetCampaign) - api.GET("/categories", handler.ListCategories(graphClient)) + api.GET("/categories", handler.ListCategories(scrapingService)) api.POST("/devices/register", handler.RegisterDevice) diff --git a/backend/go.mod b/backend/go.mod index 513f5da..26ef8dd 100644 --- a/backend/go.mod +++ b/backend/go.mod @@ -3,6 +3,8 @@ module github.com/kickwatch/backend go 1.25.5 require ( + github.com/PuerkitoBio/goquery v1.11.0 // indirect + github.com/andybalholm/cascadia v1.3.3 // indirect github.com/bytedance/sonic v1.14.0 // indirect github.com/bytedance/sonic/loader v0.3.0 // indirect github.com/cloudwego/base64x v0.1.6 // indirect @@ -37,13 +39,13 @@ require ( github.com/ugorji/go/codec v1.3.0 // indirect go.uber.org/mock v0.5.0 // indirect golang.org/x/arch v0.20.0 // indirect - golang.org/x/crypto v0.40.0 // indirect - golang.org/x/mod v0.25.0 // indirect - golang.org/x/net v0.42.0 // indirect - golang.org/x/sync v0.16.0 // indirect - golang.org/x/sys v0.35.0 // indirect - golang.org/x/text v0.27.0 // indirect - golang.org/x/tools v0.34.0 // indirect + golang.org/x/crypto v0.44.0 // indirect + golang.org/x/mod v0.29.0 // indirect + golang.org/x/net v0.47.0 // indirect + golang.org/x/sync v0.18.0 // indirect + golang.org/x/sys v0.38.0 // indirect + golang.org/x/text v0.31.0 // indirect + golang.org/x/tools v0.38.0 // indirect google.golang.org/protobuf v1.36.9 // indirect gorm.io/driver/postgres v1.6.0 // indirect gorm.io/gorm v1.31.1 // indirect diff --git a/backend/go.sum b/backend/go.sum index 717a086..e41502c 100644 --- a/backend/go.sum +++ b/backend/go.sum @@ -1,3 +1,7 @@ +github.com/PuerkitoBio/goquery v1.11.0 h1:jZ7pwMQXIITcUXNH83LLk+txlaEy6NVOfTuP43xxfqw= +github.com/PuerkitoBio/goquery v1.11.0/go.mod h1:wQHgxUOU3JGuj3oD/QFfxUdlzW6xPHfqyHre6VMY4DQ= +github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= +github.com/andybalholm/cascadia v1.3.3/go.mod 
h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= github.com/bytedance/sonic v1.14.0 h1:/OfKt8HFw0kh2rj8N0F6C/qPGRESq0BbaNZgcNXXzQQ= github.com/bytedance/sonic v1.14.0/go.mod h1:WoEbx8WTcFJfzCe0hbmyTGrfjt8PzNEBdxlNUO24NhA= github.com/bytedance/sonic/loader v0.3.0 h1:dskwH8edlzNMctoruo8FPTJDF3vLtDT0sXZwvZJyqeA= @@ -24,6 +28,7 @@ github.com/goccy/go-yaml v1.18.0 h1:8W7wMFS12Pcas7KU+VVkaiCng+kG8QiFeFwzFb+rwuw= github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA= github.com/golang-jwt/jwt/v5 v5.3.1 h1:kYf81DTWFe7t+1VvL7eS+jKFVWaUnK9cB1qbwn63YCY= github.com/golang-jwt/jwt/v5 v5.3.1/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= @@ -74,25 +79,103 @@ github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08= github.com/ugorji/go/codec v1.3.0 h1:Qd2W2sQawAfG8XSvzwhBeoGq71zXOC/Q1E9y/wUcsUA= github.com/ugorji/go/codec v1.3.0/go.mod h1:pRBVtBSKl77K30Bv8R2P+cLSGaTtex6fsA2Wjqmfxj4= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.uber.org/mock v0.5.0 h1:KAMbZvZPyBPWgD14IrIQ38QCyjwpvVVV6K/bHl1IwQU= go.uber.org/mock v0.5.0/go.mod h1:ge71pBPLYDk7QIi1LupWxdAykm7KIEFchiOqd6z7qMM= golang.org/x/arch v0.20.0 h1:dx1zTU0MAE98U+TQ8BLl7XsJbgze2WnNKF/8tGp/Q6c= golang.org/x/arch v0.20.0/go.mod h1:bdwinDaKcfZUGpH09BB7ZmOfhalA8lQdzl62l8gGWsk= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod 
h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= +golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= golang.org/x/crypto v0.40.0 h1:r4x+VvoG5Fm+eJcxMaY8CQM7Lb0l1lsmjGBQ6s8BfKM= golang.org/x/crypto v0.40.0/go.mod h1:Qr1vMER5WyS2dfPHAlsOj01wgLbsyWtFn/aY+5+ZdxY= +golang.org/x/crypto v0.44.0 h1:A97SsFvM3AIwEEmTBiaxPPTYpDC47w720rdiiUvgoAU= +golang.org/x/crypto v0.44.0/go.mod h1:013i+Nw79BMiQiMsOPcVCB5ZIJbYkerPrGnOa00tvmc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/mod v0.25.0 h1:n7a+ZbQKQA/Ysbyb0/6IbB1H/X41mKgbhfv7AfG/44w= golang.org/x/mod v0.25.0/go.mod h1:IXM97Txy2VM4PJ3gI61r1YEk/gAj6zAHN3AdZt6S9Ww= +golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA= +golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.15.0/go.mod 
h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= golang.org/x/net v0.42.0 h1:jzkYrhi3YQWD6MLBJcsklgQsoAcw89EcZbJw8Z614hs= golang.org/x/net v0.42.0/go.mod h1:FF1RA5d3u7nAYA4z2TkclSCKh68eSXtiFwcWQpPXdt8= +golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I= +golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys 
v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.28.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= +golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= +golang.org/x/term v0.27.0/go.mod h1:iMsnZpn0cago0GOrHO2+Y7u7JPn5AylBrcoWkElMTSM= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod 
h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= golang.org/x/text v0.27.0 h1:4fGWRpyh641NLlecmyl4LOe6yDdfaYNrGb2zdfo4JV4= golang.org/x/text v0.27.0/go.mod h1:1D28KMCvyooCX9hBiosv5Tz/+YLxj0j7XhWjpSUF7CU= +golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= +golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= golang.org/x/tools v0.34.0 h1:qIpSLOxeCYGg9TrcJokLBG4KFA6d795g0xkBkiESGlo= golang.org/x/tools v0.34.0/go.mod h1:pAP9OwEaY1CAW3HOmg3hLZC5Z0CCmzjAF2UQMSqNARg= +golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= +golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/protobuf v1.36.9 h1:w2gp2mA27hUeUzj9Ex9FBjsBm40zfaDtEWow293U7Iw= google.golang.org/protobuf v1.36.9/go.mod 
h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/backend/internal/config/config.go b/backend/internal/config/config.go index aa39f14..21879fd 100644 --- a/backend/internal/config/config.go +++ b/backend/internal/config/config.go @@ -1,6 +1,9 @@ package config -import "os" +import ( + "os" + "strconv" +) type Config struct { DatabaseURL string @@ -11,7 +14,9 @@ type Config struct { APNSKeyPath string APNSKey string APNSEnv string - ProxyURL string + // ScrapingBee configuration + ScrapingBeeAPIKey string + ScrapingBeeMaxConcurrent int } func Load() *Config { @@ -23,15 +28,24 @@ func Load() *Config { if apnsEnv == "" { apnsEnv = "sandbox" } + + maxConcurrent := 10 // default + if val := os.Getenv("SCRAPINGBEE_MAX_CONCURRENT"); val != "" { + if parsed, err := strconv.Atoi(val); err == nil && parsed > 0 { + maxConcurrent = parsed + } + } + return &Config{ - DatabaseURL: os.Getenv("DATABASE_URL"), - Port: port, - APNSKeyID: os.Getenv("APNS_KEY_ID"), - APNSTeamID: os.Getenv("APNS_TEAM_ID"), - APNSBundleID: os.Getenv("APNS_BUNDLE_ID"), - APNSKeyPath: os.Getenv("APNS_KEY_PATH"), - APNSKey: os.Getenv("APNS_KEY"), - APNSEnv: apnsEnv, - ProxyURL: os.Getenv("WEBSHARE_PROXY_URL"), + DatabaseURL: os.Getenv("DATABASE_URL"), + Port: port, + APNSKeyID: os.Getenv("APNS_KEY_ID"), + APNSTeamID: os.Getenv("APNS_TEAM_ID"), + APNSBundleID: os.Getenv("APNS_BUNDLE_ID"), + APNSKeyPath: os.Getenv("APNS_KEY_PATH"), + APNSKey: os.Getenv("APNS_KEY"), + APNSEnv: apnsEnv, + ScrapingBeeAPIKey: os.Getenv("SCRAPINGBEE_API_KEY"), + ScrapingBeeMaxConcurrent: maxConcurrent, } } diff --git a/backend/internal/handler/campaigns.go b/backend/internal/handler/campaigns.go index e84c6ef..381671a 100644 --- a/backend/internal/handler/campaigns.go +++ b/backend/internal/handler/campaigns.go @@ -16,7 +16,7 @@ var sortMap = map[string]string{ "ending": "END_DATE", } -func 
ListCampaigns(graphClient *service.KickstarterGraphClient) gin.HandlerFunc { +func ListCampaigns(client *service.KickstarterScrapingService) gin.HandlerFunc { return func(c *gin.Context) { sort := c.DefaultQuery("sort", "trending") categoryID := c.Query("category_id") @@ -44,7 +44,7 @@ func ListCampaigns(graphClient *service.KickstarterGraphClient) gin.HandlerFunc gqlSort = "MAGIC" } - result, err := graphClient.Search("", categoryID, gqlSort, cursor, limit) + result, err := client.Search("", categoryID, gqlSort, cursor, limit) if err != nil { // fallback to DB if GraphQL fails if db.IsEnabled() { @@ -62,7 +62,7 @@ func ListCampaigns(graphClient *service.KickstarterGraphClient) gin.HandlerFunc return } - nextCursor := "" + var nextCursor interface{} if result.HasNextPage { nextCursor = result.NextCursor } @@ -74,7 +74,7 @@ func ListCampaigns(graphClient *service.KickstarterGraphClient) gin.HandlerFunc } } -func SearchCampaigns(graphClient *service.KickstarterGraphClient) gin.HandlerFunc { +func SearchCampaigns(client *service.KickstarterScrapingService) gin.HandlerFunc { return func(c *gin.Context) { q := c.Query("q") if q == "" { @@ -84,13 +84,13 @@ func SearchCampaigns(graphClient *service.KickstarterGraphClient) gin.HandlerFun categoryID := c.Query("category_id") cursor := c.Query("cursor") - result, err := graphClient.Search(q, categoryID, "MAGIC", cursor, 20) + result, err := client.Search(q, categoryID, "MAGIC", cursor, 20) if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return } - nextCursor := "" + var nextCursor interface{} if result.HasNextPage { nextCursor = result.NextCursor } @@ -115,7 +115,7 @@ func GetCampaign(c *gin.Context) { c.JSON(http.StatusOK, campaign) } -func ListCategories(graphClient *service.KickstarterGraphClient) gin.HandlerFunc { +func ListCategories(client *service.KickstarterScrapingService) gin.HandlerFunc { return func(c *gin.Context) { if db.IsEnabled() { var cats []model.Category @@ -125,7 
+125,7 @@ func ListCategories(graphClient *service.KickstarterGraphClient) gin.HandlerFunc } } - cats, err := graphClient.FetchCategories() + cats, err := client.FetchCategories() if err != nil { c.JSON(http.StatusInternalServerError, gin.H{"error": err.Error()}) return diff --git a/backend/internal/service/categories.go b/backend/internal/service/categories.go new file mode 100644 index 0000000..44f6979 --- /dev/null +++ b/backend/internal/service/categories.go @@ -0,0 +1,23 @@ +package service + +import "github.com/kickwatch/backend/internal/model" + +// kickstarterCategories is a hardcoded list of Kickstarter root categories +// These rarely change, so we avoid API calls by maintaining this static list +var kickstarterCategories = []model.Category{ + {ID: "1", Name: "Art"}, + {ID: "3", Name: "Comics"}, + {ID: "4", Name: "Crafts"}, + {ID: "5", Name: "Dance"}, + {ID: "6", Name: "Design"}, + {ID: "7", Name: "Fashion"}, + {ID: "9", Name: "Film & Video"}, + {ID: "10", Name: "Food"}, + {ID: "11", Name: "Games"}, + {ID: "12", Name: "Journalism"}, + {ID: "13", Name: "Music"}, + {ID: "14", Name: "Photography"}, + {ID: "15", Name: "Publishing"}, + {ID: "16", Name: "Technology"}, + {ID: "17", Name: "Theater"}, +} diff --git a/backend/internal/service/cron.go b/backend/internal/service/cron.go index b3a2848..e18d15a 100644 --- a/backend/internal/service/cron.go +++ b/backend/internal/service/cron.go @@ -16,18 +16,18 @@ var rootCategories = []string{ } type CronService struct { - db *gorm.DB - restClient *KickstarterRESTClient - apnsClient *APNsClient - scheduler *cron.Cron + db *gorm.DB + scrapingService *KickstarterScrapingService + apnsClient *APNsClient + scheduler *cron.Cron } -func NewCronService(db *gorm.DB, restClient *KickstarterRESTClient, apns *APNsClient) *CronService { +func NewCronService(db *gorm.DB, scrapingService *KickstarterScrapingService, apns *APNsClient) *CronService { return &CronService{ - db: db, - restClient: restClient, - apnsClient: apns, - 
scheduler: cron.New(cron.WithLocation(time.UTC)), + db: db, + scrapingService: scrapingService, + apnsClient: apns, + scheduler: cron.New(cron.WithLocation(time.UTC)), } } @@ -52,9 +52,9 @@ func (s *CronService) RunCrawlNow() error { for _, catID := range rootCategories { for page := 1; page <= 10; page++ { - campaigns, err := s.restClient.DiscoverCampaigns(catID, "newest", page) + campaigns, err := s.scrapingService.DiscoverCampaigns(catID, "newest", page) if err != nil { - log.Printf("Cron: REST error cat=%s page=%d: %v", catID, page, err) + log.Printf("Cron: ScrapingBee error cat=%s page=%d: %v", catID, page, err) break } if len(campaigns) == 0 { diff --git a/backend/internal/service/kickstarter_parser.go b/backend/internal/service/kickstarter_parser.go new file mode 100644 index 0000000..5847ea0 --- /dev/null +++ b/backend/internal/service/kickstarter_parser.go @@ -0,0 +1,236 @@ +package service + +import ( + "encoding/json" + "fmt" + "regexp" + "strconv" + "strings" + "time" + + "github.com/PuerkitoBio/goquery" + "github.com/kickwatch/backend/internal/model" +) + +// parseDiscoverPageHTML parses Kickstarter discover page HTML and extracts campaign data +func parseDiscoverPageHTML(html string) ([]model.Campaign, error) { + doc, err := goquery.NewDocumentFromReader(strings.NewReader(html)) + if err != nil { + return nil, fmt.Errorf("parse HTML: %w", err) + } + + var campaigns []model.Campaign + + // Find project cards - Kickstarter uses React components with data attributes + doc.Find("[data-project]").Each(func(i int, s *goquery.Selection) { + dataProject, exists := s.Attr("data-project") + if !exists { + return + } + + // Parse JSON data attribute + var projectData map[string]interface{} + if err := json.Unmarshal([]byte(dataProject), &projectData); err != nil { + return + } + + campaign := extractCampaignFromData(projectData) + if campaign.PID != "" { + campaigns = append(campaigns, campaign) + } + }) + + // Fallback: parse from HTML structure if no data 
attributes found + if len(campaigns) == 0 { + campaigns = parseFromHTMLStructure(doc) + } + + return campaigns, nil +} + +func extractCampaignFromData(data map[string]interface{}) model.Campaign { + campaign := model.Campaign{} + + // Extract PID + if pid, ok := data["id"].(float64); ok { + campaign.PID = fmt.Sprintf("%.0f", pid) + } else if pid, ok := data["id"].(string); ok { + campaign.PID = pid + } + + // Extract name + if name, ok := data["name"].(string); ok { + campaign.Name = name + } + + // Extract blurb + if blurb, ok := data["blurb"].(string); ok { + campaign.Blurb = blurb + } + + // Extract photo URL + if photo, ok := data["photo"].(map[string]interface{}); ok { + if url, ok := photo["full"].(string); ok { + campaign.PhotoURL = url + } else if url, ok := photo["1024x576"].(string); ok { + campaign.PhotoURL = url + } + } + + // Extract goal and pledged + if goal, ok := data["goal"].(float64); ok { + campaign.GoalAmount = goal + } + if pledged, ok := data["pledged"].(float64); ok { + campaign.PledgedAmount = pledged + } + + // Extract currency + if currency, ok := data["currency"].(string); ok { + campaign.GoalCurrency = currency + } + + // Extract deadline + if deadline, ok := data["deadline"].(float64); ok { + campaign.Deadline = time.Unix(int64(deadline), 0) + } + + // Extract state + if state, ok := data["state"].(string); ok { + campaign.State = state + } + + // Extract percent funded + if percentFunded, ok := data["percent_funded"].(float64); ok { + campaign.PercentFunded = percentFunded + } + + // Extract creator (name + slug for URL construction) + var creatorSlug string + if creator, ok := data["creator"].(map[string]interface{}); ok { + if name, ok := creator["name"].(string); ok { + campaign.CreatorName = name + } + if slug, ok := creator["slug"].(string); ok { + creatorSlug = slug + } + } + + // Extract category + if category, ok := data["category"].(map[string]interface{}); ok { + if id, ok := category["id"].(float64); ok { + 
campaign.CategoryID = fmt.Sprintf("%.0f", id) + } else if id, ok := category["id"].(string); ok { + campaign.CategoryID = id + } + if name, ok := category["name"].(string); ok { + campaign.CategoryName = name + } + } + + // Extract slug + if slug, ok := data["slug"].(string); ok { + campaign.Slug = slug + } + + // Build project URL - prefer canonical URL from urls.web.project + if urls, ok := data["urls"].(map[string]interface{}); ok { + if web, ok := urls["web"].(map[string]interface{}); ok { + if project, ok := web["project"].(string); ok { + campaign.ProjectURL = project + } + } + } + // Fallback: use creator slug + project slug (full path) when canonical URL is absent + if campaign.ProjectURL == "" && creatorSlug != "" && campaign.Slug != "" { + campaign.ProjectURL = fmt.Sprintf("https://www.kickstarter.com/projects/%s/%s", creatorSlug, campaign.Slug) + } + // Do not synthesize a URL from project slug alone — the result would be invalid + + return campaign +} + +func parseFromHTMLStructure(doc *goquery.Document) []model.Campaign { + var campaigns []model.Campaign + + // Look for project cards in various possible selectors + doc.Find(".js-react-proj-card, .project-card, [class*='ProjectCard']").Each(func(i int, s *goquery.Selection) { + campaign := model.Campaign{} + + // Try to extract from various possible structures + campaign.Name = s.Find("h3, .project-title, [class*='title']").First().Text() + campaign.Name = strings.TrimSpace(campaign.Name) + + campaign.Blurb = s.Find("p, .project-blurb, [class*='blurb']").First().Text() + campaign.Blurb = strings.TrimSpace(campaign.Blurb) + + // Extract image + if img := s.Find("img").First(); img.Length() > 0 { + if src, exists := img.Attr("src"); exists { + campaign.PhotoURL = src + } else if src, exists := img.Attr("data-src"); exists { + campaign.PhotoURL = src + } + } + + // Extract creator + campaign.CreatorName = s.Find(".creator, [class*='creator']").First().Text() + campaign.CreatorName = 
strings.TrimSpace(campaign.CreatorName) + + // Extract URL + if link := s.Find("a[href*='/projects/']").First(); link.Length() > 0 { + if href, exists := link.Attr("href"); exists { + campaign.ProjectURL = href + if !strings.HasPrefix(href, "http") { + campaign.ProjectURL = "https://www.kickstarter.com" + href + } + // Extract PID from URL + campaign.PID = extractPIDFromURL(href) + } + } + + if campaign.Name != "" && campaign.ProjectURL != "" { + campaigns = append(campaigns, campaign) + } + }) + + return campaigns +} + +func extractPIDFromURL(urlStr string) string { + // Extract project ID from URL like /projects/creator/project-name or /projects/123456789/project-name + re := regexp.MustCompile(`/projects/([^/]+)/([^/?]+)`) + matches := re.FindStringSubmatch(urlStr) + if len(matches) >= 3 { + // If first part is numeric, that's the PID + if _, err := strconv.ParseInt(matches[1], 10, 64); err == nil { + return matches[1] + } + // Otherwise use creator/slug combination + return matches[1] + "/" + matches[2] + } + return "" +} + +// parseGoalPledgedText parses text like "$50,000 pledged of $100,000 goal" +func parseGoalPledgedText(text string) (goal, pledged float64, currency string) { + // Match patterns like "$50,000" or "£1,234.56" + re := regexp.MustCompile(`([\$£€¥])?([\d,]+(?:\.\d{2})?)`) + matches := re.FindAllStringSubmatch(text, -1) + + if len(matches) >= 2 { + // First match is typically pledged, second is goal + currency = matches[0][1] + if currency == "" { + currency = "USD" + } + + pledgedStr := strings.ReplaceAll(matches[0][2], ",", "") + pledged, _ = strconv.ParseFloat(pledgedStr, 64) + + goalStr := strings.ReplaceAll(matches[1][2], ",", "") + goal, _ = strconv.ParseFloat(goalStr, 64) + } + + return +} diff --git a/backend/internal/service/kickstarter_scraping.go b/backend/internal/service/kickstarter_scraping.go new file mode 100644 index 0000000..29b3e90 --- /dev/null +++ b/backend/internal/service/kickstarter_scraping.go @@ -0,0 +1,243 @@ +package 
service
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"log"
+	"net/url"
+	"strconv"
+	"time"
+
+	"github.com/kickwatch/backend/internal/model"
+)
+
+// KickstarterScrapingService scrapes Kickstarter discover pages through
+// ScrapingBee and converts the results into model.Campaign values.
+type KickstarterScrapingService struct {
+	client *ScrapingBeeClient
+}
+
+// NewKickstarterScrapingService builds a service backed by a ScrapingBee
+// client. A non-positive maxConcurrent falls back to 10 concurrent requests;
+// a negative value would otherwise panic when the limiter's channel is made.
+func NewKickstarterScrapingService(apiKey string, maxConcurrent int) *KickstarterScrapingService {
+	if maxConcurrent <= 0 {
+		maxConcurrent = 10 // default
+	}
+	return &KickstarterScrapingService{
+		client: NewScrapingBeeClient(apiKey, maxConcurrent),
+	}
+}
+
+// Search searches for campaigns on the Kickstarter discover page.
+//
+// It first tries AI extraction (10 credits per request) and falls back to
+// fetching and parsing the raw HTML when that call fails, cannot be parsed or
+// yields no campaigns. cursor has the form "page:N"; an empty, malformed or
+// non-positive cursor means page 1. first is the page size the caller asked
+// for and is only used to decide whether another page is likely to exist.
+func (s *KickstarterScrapingService) Search(term, categoryID, sort, cursor string, first int) (*SearchResult, error) {
+	ctx := context.Background()
+
+	// Parse page from cursor (cursor format: "page:N"). Pages below 1 are
+	// rejected so that the next cursor always moves forward.
+	page := 1
+	if cursor != "" {
+		if _, err := fmt.Sscanf(cursor, "page:%d", &page); err != nil || page < 1 {
+			page = 1
+		}
+	}
+
+	discoverURL := s.buildDiscoverURL(term, categoryID, sort, page)
+
+	const aiQuery = "Extract all projects from this page. For each project return a JSON object with these fields: name, slug, creator_slug (the creator's URL slug, e.g. 'john-doe' from kickstarter.com/projects/john-doe/...), project_url (full canonical Kickstarter URL), goal, pledged, currency, deadline, creator, category, photo_url, blurb."
+
+	// Try AI extraction first.
+	aiResult, err := s.client.ExtractWithAI(ctx, discoverURL, aiQuery)
+	if err != nil {
+		log.Printf("AI extraction failed: %v, falling back to HTML", err)
+	} else {
+		campaigns, parseErr := s.parseAIResponse(aiResult)
+		switch {
+		case parseErr != nil:
+			log.Printf("AI extraction parse failed: %v, falling back to HTML", parseErr)
+		case len(campaigns) == 0:
+			log.Print("AI extraction returned no campaigns, falling back to HTML")
+		default:
+			log.Printf("AI extraction successful: found %d campaigns (page %d)", len(campaigns), page)
+			return s.pageResult(campaigns, page, first), nil
+		}
+	}
+
+	// Fallback to HTML parsing.
+	html, err := s.client.FetchHTML(ctx, discoverURL)
+	if err != nil {
+		return nil, fmt.Errorf("fetch HTML: %w", err)
+	}
+
+	campaigns, err := parseDiscoverPageHTML(html)
+	if err != nil {
+		return nil, fmt.Errorf("parse HTML: %w", err)
+	}
+
+	log.Printf("HTML parsing successful: found %d campaigns (page %d)", len(campaigns), page)
+
+	return s.pageResult(campaigns, page, first), nil
+}
+
+// pageResult wraps one page of campaigns in a SearchResult. A next page is
+// advertised only when the page is non-empty and holds at least first
+// campaigns, so an empty page never sends the caller into an endless loop,
+// even when first is zero or negative.
+func (s *KickstarterScrapingService) pageResult(campaigns []model.Campaign, page, first int) *SearchResult {
+	hasNextPage := len(campaigns) > 0 && len(campaigns) >= first
+	nextCursor := ""
+	if hasNextPage {
+		nextCursor = fmt.Sprintf("page:%d", page+1)
+	}
+
+	return &SearchResult{
+		Campaigns:   campaigns,
+		TotalCount:  len(campaigns),
+		NextCursor:  nextCursor,
+		HasNextPage: hasNextPage,
+	}
+}
+
+// DiscoverCampaigns fetches campaigns for a specific category using HTML
+// parsing (5 credits), which is cheaper than AI extraction.
+func (s *KickstarterScrapingService) DiscoverCampaigns(categoryID string, sort string, page int) ([]model.Campaign, error) {
+	ctx := context.Background()
+
+	discoverURL := s.buildDiscoverURL("", categoryID, sort, page)
+
+	html, err := s.client.FetchHTML(ctx, discoverURL)
+	if err != nil {
+		return nil, fmt.Errorf("fetch HTML: %w", err)
+	}
+
+	campaigns, err := parseDiscoverPageHTML(html)
+	if err != nil {
+		return nil, fmt.Errorf("parse HTML: %w", err)
+	}
+
+	log.Printf("Discovered %d campaigns for category %s (page %d)", len(campaigns), categoryID, page)
+
+	return campaigns, nil
+}
+
+// FetchCategories returns hardcoded category list (0 credits).
+func (s *KickstarterScrapingService) FetchCategories() ([]model.Category, error) {
+	return kickstarterCategories, nil
+}
+
+// buildDiscoverURL assembles the Kickstarter advanced-discover URL. Empty
+// term and categoryID are omitted, unknown sort values fall back to "magic",
+// and the page parameter is only added from page 2 onwards.
+func (s *KickstarterScrapingService) buildDiscoverURL(term, categoryID, sort string, page int) string {
+	baseURL := "https://www.kickstarter.com/discover/advanced"
+
+	params := url.Values{}
+
+	if term != "" {
+		params.Set("term", term)
+	}
+
+	if categoryID != "" {
+		params.Set("category_id", categoryID)
+	}
+
+	// Map sort values; "MAGIC", "trending" and anything unknown use the default.
+	sortParam := "magic"
+	switch sort {
+	case "NEWEST", "newest":
+		sortParam = "newest"
+	case "END_DATE", "ending":
+		sortParam = "end_date"
+	}
+	params.Set("sort", sortParam)
+
+	if page > 1 {
+		params.Set("page", strconv.Itoa(page))
+	}
+
+	return baseURL + "?" + params.Encode()
+}
+
+// parseAIResponse converts the JSON returned by the AI extraction into
+// campaigns. Both a bare array of projects and an object with a "projects"
+// array are accepted, and both go through the same field mapping, so array
+// responses also get their deadline, project URL, PID and percent funded
+// filled in.
+func (s *KickstarterScrapingService) parseAIResponse(jsonData string) ([]model.Campaign, error) {
+	// aiProject mirrors the fields requested in the AI query.
+	type aiProject struct {
+		Name        string  `json:"name"`
+		Slug        string  `json:"slug"`
+		CreatorSlug string  `json:"creator_slug"`
+		ProjectURL  string  `json:"project_url"`
+		Goal        float64 `json:"goal"`
+		Pledged     float64 `json:"pledged"`
+		Currency    string  `json:"currency"`
+		Deadline    string  `json:"deadline"`
+		Creator     string  `json:"creator"`
+		Category    string  `json:"category"`
+		PhotoURL    string  `json:"photo_url"`
+		Blurb       string  `json:"blurb"`
+		// BackersCount is decoded but not mapped onto the campaign yet.
+		BackersCount int `json:"backers_count"`
+	}
+
+	raw := []byte(jsonData)
+
+	// Try to parse as array first, then as object with projects field.
+	var projects []aiProject
+	if err := json.Unmarshal(raw, &projects); err != nil {
+		var wrapped struct {
+			Projects []aiProject `json:"projects"`
+		}
+		if err := json.Unmarshal(raw, &wrapped); err != nil {
+			return nil, fmt.Errorf("parse AI response: %w", err)
+		}
+		projects = wrapped.Projects
+	}
+
+	// Date layouts the AI has been seen to use, tried in order.
+	deadlineFormats := []string{
+		time.RFC3339,
+		"2006-01-02",
+		"Jan 2 2006",
+		"January 2, 2006",
+	}
+
+	campaigns := make([]model.Campaign, 0, len(projects))
+	for _, p := range projects {
+		campaign := model.Campaign{
+			Name:          p.Name,
+			Slug:          p.Slug,
+			GoalAmount:    p.Goal,
+			PledgedAmount: p.Pledged,
+			GoalCurrency:  p.Currency,
+			CreatorName:   p.Creator,
+			CategoryName:  p.Category,
+			PhotoURL:      p.PhotoURL,
+			Blurb:         p.Blurb,
+		}
+
+		// Parse deadline; it stays at the zero time when no layout matches.
+		if p.Deadline != "" {
+			for _, format := range deadlineFormats {
+				if t, err := time.Parse(format, p.Deadline); err == nil {
+					campaign.Deadline = t
+					break
+				}
+			}
+		}
+
+		// Use project URL from AI if provided, otherwise build from creator_slug + slug.
+		switch {
+		case p.ProjectURL != "":
+			campaign.ProjectURL = p.ProjectURL
+			campaign.PID = extractPIDFromURL(p.ProjectURL)
+			if campaign.PID == "" {
+				campaign.PID = campaign.Slug
+			}
+		case p.CreatorSlug != "" && campaign.Slug != "":
+			campaign.ProjectURL = fmt.Sprintf("https://www.kickstarter.com/projects/%s/%s", p.CreatorSlug, campaign.Slug)
+			campaign.PID = campaign.Slug
+		case campaign.Slug != "":
+			// Cannot construct a valid URL without the creator slug; leave ProjectURL empty.
+			campaign.PID = campaign.Slug
+		}
+
+		// Calculate percent funded.
+		if campaign.GoalAmount > 0 {
+			campaign.PercentFunded = (campaign.PledgedAmount / campaign.GoalAmount) * 100
+		}
+
+		campaigns = append(campaigns, campaign)
+	}
+
+	return campaigns, nil
+}
diff --git a/backend/internal/service/scrapingbee_client.go b/backend/internal/service/scrapingbee_client.go
new file mode 100644
index 0000000..00f833d
--- /dev/null
+++ b/backend/internal/service/scrapingbee_client.go
@@ -0,0 +1,178 @@
+package service
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"log"
+	"net/http"
+	"net/url"
+	"time"
+)
+
+const scrapingBeeBaseURL = "https://app.scrapingbee.com/api/v1"
+
+// ScrapingBeeClient is a rate-limited HTTP client for the ScrapingBee API.
+type ScrapingBeeClient struct {
+	apiKey      string
+	baseURL     string
+	httpClient  *http.Client
+	rateLimiter *RateLimiter
+}
+
+// RateLimiter bounds the number of in-flight requests and enforces a
+// cool-down after each one.
+type RateLimiter struct {
+	semaphore    chan struct{}
+	requestDelay time.Duration
+}
+
+// NewRateLimiter returns a limiter that admits at most maxConcurrent holders
+// at a time and keeps each slot occupied for requestDelay after its holder
+// finishes. A maxConcurrent below 1 is raised to 1: a zero-capacity channel
+// would block every Acquire and a negative capacity would panic in make.
+func NewRateLimiter(maxConcurrent int, requestDelay time.Duration) *RateLimiter {
+	if maxConcurrent < 1 {
+		maxConcurrent = 1
+	}
+	return &RateLimiter{
+		semaphore:    make(chan struct{}, maxConcurrent),
+		requestDelay: requestDelay,
+	}
+}
+
+// Acquire blocks until a slot is free or ctx is done, in which case it
+// returns ctx.Err().
+func (rl *RateLimiter) Acquire(ctx context.Context) error {
+	select {
+	case rl.semaphore <- struct{}{}:
+		return nil
+	case <-ctx.Done():
+		return ctx.Err()
+	}
+}
+
+// Release frees a slot obtained with Acquire. The cool-down is served before
+// the slot is handed back so that it spaces out requests on that slot;
+// sleeping after the release would only delay the caller.
+func (rl *RateLimiter) Release() {
+	if rl.requestDelay > 0 {
+		time.Sleep(rl.requestDelay)
+	}
+	<-rl.semaphore
+}
+
+// NewScrapingBeeClient returns a client with a 60-second HTTP timeout, at
+// most maxConcurrent requests in flight and a 500ms cool-down per request.
+func NewScrapingBeeClient(apiKey string, maxConcurrent int) *ScrapingBeeClient {
+	return &ScrapingBeeClient{
+		apiKey:      apiKey,
+		baseURL:     scrapingBeeBaseURL,
+		httpClient:  &http.Client{Timeout: 60 * time.Second},
+		rateLimiter: NewRateLimiter(maxConcurrent, 500*time.Millisecond),
+	}
+}
+
+// FetchHTML fetches raw HTML from a URL using ScrapingBee (5 credits).
+func (c *ScrapingBeeClient) FetchHTML(ctx context.Context, targetURL string) (string, error) {
+	return c.doRequest(ctx, targetURL, false, "")
+}
+
+// ExtractWithAI fetches and extracts data using AI (10 credits).
+func (c *ScrapingBeeClient) ExtractWithAI(ctx context.Context, targetURL string, query string) (string, error) {
+	return c.doRequest(ctx, targetURL, true, query)
+}
+
+// doRequest performs one rate-limited ScrapingBee call for targetURL and
+// returns the response body. The AI query is sent only when useAI is true and
+// aiQuery is non-empty. Transport errors, 429 and 5xx responses are retried,
+// for at most three attempts with a linearly growing backoff; any other
+// non-200 status fails immediately.
+func (c *ScrapingBeeClient) doRequest(ctx context.Context, targetURL string, useAI bool, aiQuery string) (string, error) {
+	if err := c.rateLimiter.Acquire(ctx); err != nil {
+		return "", fmt.Errorf("rate limiter: %w", err)
+	}
+	defer c.rateLimiter.Release()
+
+	params := url.Values{}
+	params.Set("api_key", c.apiKey)
+	params.Set("url", targetURL)
+	params.Set("render_js", "true")
+	if useAI && aiQuery != "" {
+		params.Set("ai_query", aiQuery)
+	}
+	reqURL := c.baseURL + "?" + params.Encode()
+
+	const maxAttempts = 3
+	var lastErr error
+	for attempt := 0; attempt < maxAttempts; attempt++ {
+		if attempt > 0 {
+			backoff := time.Duration(attempt) * 2 * time.Second
+			log.Printf("ScrapingBee retry attempt %d after %v", attempt+1, backoff)
+			select {
+			case <-time.After(backoff):
+			case <-ctx.Done():
+				return "", ctx.Err()
+			}
+		}
+
+		body, credits, retry, err := c.attempt(ctx, reqURL)
+		if err == nil {
+			log.Printf("ScrapingBee success: url=%s, credits=%s, useAI=%v", targetURL, credits, useAI)
+			return body, nil
+		}
+		if !retry {
+			return "", err
+		}
+		lastErr = err
+	}
+
+	return "", fmt.Errorf("failed after %d attempts: %w", maxAttempts, lastErr)
+}
+
+// attempt issues a single GET to reqURL. It returns the body and the value of
+// the Spb-Cost response header on success; on failure, retry reports whether
+// the caller may try again. It is a separate method so the response body is
+// closed at the end of every attempt instead of piling up in deferred calls
+// until the retry loop finishes.
+func (c *ScrapingBeeClient) attempt(ctx context.Context, reqURL string) (body, credits string, retry bool, err error) {
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, reqURL, nil)
+	if err != nil {
+		return "", "", false, fmt.Errorf("create request: %w", withoutRequestURL(err))
+	}
+
+	resp, err := c.httpClient.Do(req)
+	if err != nil {
+		return "", "", true, fmt.Errorf("http request: %w", withoutRequestURL(err))
+	}
+	defer resp.Body.Close()
+
+	switch {
+	case resp.StatusCode == http.StatusTooManyRequests:
+		return "", "", true, errors.New("rate limited (429)")
+	case resp.StatusCode >= 500:
+		return "", "", true, fmt.Errorf("server error (%d)", resp.StatusCode)
+	case resp.StatusCode != http.StatusOK:
+		// Best effort: the body only enriches the error message.
+		msg, _ := io.ReadAll(resp.Body)
+		return "", "", false, fmt.Errorf("unexpected status %d: %s", resp.StatusCode, string(msg))
+	}
+
+	data, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return "", "", false, fmt.Errorf("read response: %w", err)
+	}
+	return string(data), resp.Header.Get("Spb-Cost"), false, nil
+}
+
+// withoutRequestURL strips the request URL from a *url.Error, keeping the
+// operation and the underlying cause. The ScrapingBee URL carries the API key
+// as a query parameter, so the error must not reach logs or callers verbatim.
+func withoutRequestURL(err error) error {
+	var urlErr *url.Error
+	if errors.As(err, &urlErr) {
+		return fmt.Errorf("%s: %w", urlErr.Op, urlErr.Err)
+	}
+	return err
+}