-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathrun.py
More file actions
119 lines (94 loc) · 3.75 KB
/
Copy pathrun.py
File metadata and controls
119 lines (94 loc) · 3.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""
ScoutBot main runner.
Usage:
python run.py # Full pipeline: scrape → cleanup closed → update sheet → email
python run.py --scrape # Only scrape (update sheet, no email)
python run.py --cleanup # Only remove closed opportunities from the sheet
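python run.py --notify # Only send email (no scraping)
python run.py --dry-run # Only build email_preview.html (no scraping, no sending)
python run.py --schedule # Run on schedule: full pipeline once at startup, then at 7AM and 7PM WAT (06:00 and 18:00 UTC) daily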
The full pipeline order is:
1. Scrape every source for new opportunities → adds new rows
2. Clean closed opportunities → removes expired rows
3. Send email digest → sends the live list
"""
import argparse
import logging
import os
import subprocess
import sys
# Absolute directory containing this script. Defined before logging is
# configured so the log file path below is anchored to it instead of a
# possibly-relative ``os.path.dirname(__file__)``.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# The runner's own log file, written next to this script.
LOG_FILE = os.path.join(SCRIPT_DIR, "scoutbot.log")

# Send every INFO+ record both to stdout and to LOG_FILE.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(LOG_FILE),
    ],
)
logger = logging.getLogger(__name__)

# Names of the Scrapy spiders that run_all_spiders() crawls, in order.
SPIDERS = ["opportunities"]
def run_spider(spider_name):
    """Run one Scrapy spider in a child process and report whether it succeeded.

    The crawl is started as ``scrapy crawl <spider_name> --logfile scrapy.log``
    with SCRIPT_DIR as the working directory, so ``scrapy.log`` is written
    there.

    Args:
        spider_name: Name of the spider handed to ``scrapy crawl``.

    Returns:
        True if the child process exited with status 0; False if it exited
        with a non-zero status or could not be launched at all.
    """
    logger.info(f"run.py: Starting spider '{spider_name}'...")
    try:
        result = subprocess.run(
            ["scrapy", "crawl", spider_name, "--logfile", "scrapy.log"],
            cwd=SCRIPT_DIR,
        )
    except OSError as exc:
        # Raised when the ``scrapy`` executable is missing or not runnable.
        # Treat it like a failed crawl (log and carry on) instead of letting
        # the exception abort the remaining pipeline steps.
        logger.error(f"run.py: Spider '{spider_name}' could not be started: {exc}")
        return False

    if result.returncode != 0:
        logger.error(f"run.py: Spider '{spider_name}' exited with code {result.returncode}")
        return False

    logger.info(f"run.py: Spider '{spider_name}' done.")
    return True
def run_all_spiders():
    """Crawl every spider named in SPIDERS, one after another, in list order."""
    for name in SPIDERS:
        run_spider(name)
def run_cleanup():
    """Remove closed/expired opportunities from the Google Sheet.

    Delegates to ``cleanup.cleanup()``; the ``cleanup`` module is imported
    lazily from SCRIPT_DIR.
    """
    # Make the sibling module importable. Guarded so that repeated calls
    # (e.g. every scheduled pipeline run) do not keep growing sys.path with
    # duplicate entries.
    if SCRIPT_DIR not in sys.path:
        sys.path.insert(0, SCRIPT_DIR)
    from cleanup import cleanup

    cleanup()
def run_notify(dry_run=False):
    """Read the sheet and email the digest to all subscribers.

    Delegates to ``notify.run_notify``; the ``notify`` module is imported
    lazily from SCRIPT_DIR.

    Args:
        dry_run: Forwarded unchanged to ``notify.run_notify``. The CLI help
            describes it as building ``email_preview.html`` without sending.
    """
    # Make the sibling module importable. Guarded so that repeated calls
    # (e.g. every scheduled pipeline run) do not keep growing sys.path with
    # duplicate entries.
    if SCRIPT_DIR not in sys.path:
        sys.path.insert(0, SCRIPT_DIR)
    from notify import run_notify as _run_notify

    _run_notify(dry_run=dry_run)
def full_pipeline():
    """Run the whole workflow once: scrape, then clean up, then send the email.

    The stages run strictly in that order, bracketed by START/COMPLETE log
    lines. An exception raised by any stage propagates to the caller and the
    later stages are not run.
    """
    logger.info("run.py: === Full pipeline START ===")
    stages = (
        run_all_spiders,
        run_cleanup,
        lambda: run_notify(dry_run=False),
    )
    for stage in stages:
        stage()
    logger.info("run.py: === Full pipeline COMPLETE ===")
def run_schedule():
    """Run the full pipeline now, then at 06:00 and 18:00 UTC every day.

    06:00 / 18:00 UTC correspond to 07:00 / 19:00 WAT (West Africa Time,
    UTC+1). This function never returns: after the initial run it polls the
    scheduler once a minute.

    A failure in a *scheduled* run is logged and the loop keeps going, so one
    bad run does not stop the bot. A failure in the initial startup run still
    propagates to the caller.
    """
    import schedule
    import time
    from datetime import datetime, timezone

    def _utc_to_local_hhmm(utc_hhmm):
        """Convert an "HH:MM" UTC wall-clock time to the server's local "HH:MM"."""
        hour, minute = (int(part) for part in utc_hhmm.split(":"))
        utc_moment = datetime.now(timezone.utc).replace(
            hour=hour, minute=minute, second=0, microsecond=0
        )
        # astimezone() with no argument converts to the system local timezone.
        return utc_moment.astimezone().strftime("%H:%M")

    def _scheduled_run():
        """Run the pipeline, logging (not raising) any failure."""
        try:
            full_pipeline()
        except Exception:
            logger.exception("run.py: Scheduled pipeline run failed; waiting for the next slot.")

    # WAT (West Africa Time) = UTC+1, so:
    #   07:00 WAT = 06:00 UTC
    #   19:00 WAT = 18:00 UTC
    # schedule's .at() without a timezone argument is interpreted in the
    # server's LOCAL time, so the UTC slots are converted to local wall-clock
    # time first; otherwise the bot would fire at the wrong moment on any
    # server whose timezone is not UTC.
    # NOTE(review): the local offset is computed once at startup; on a server
    # that observes DST the slots shift by the DST delta until restart.
    morning_local = _utc_to_local_hhmm("06:00")  # 07:00 Nigeria time
    evening_local = _utc_to_local_hhmm("18:00")  # 19:00 Nigeria time

    logger.info("run.py: Scheduler started. Will run at 06:00 UTC (07:00 WAT) and 18:00 UTC (19:00 WAT) daily.")
    logger.info(f"run.py: Server-local trigger times: {morning_local} and {evening_local}.")
    schedule.every().day.at(morning_local).do(_scheduled_run)
    schedule.every().day.at(evening_local).do(_scheduled_run)

    # Run immediately on startup so first results appear right away
    full_pipeline()

    while True:
        schedule.run_pending()
        time.sleep(60)
def main():
    """Parse the command-line flags and run the matching ScoutBot action.

    Flags are checked in a fixed order (scrape, cleanup, notify/dry-run,
    schedule) and only the first match is executed. With no flag at all the
    full pipeline runs once.
    """
    cli = argparse.ArgumentParser(description="ScoutBot")
    flag_help = (
        ("--scrape", "Only scrape (update sheet, no email)"),
        ("--cleanup", "Only remove closed opportunities from the sheet"),
        ("--notify", "Only send email"),
        ("--dry-run", "Build email_preview.html without sending"),
        ("--schedule", "Run on schedule (7AM + 7PM daily)"),
    )
    for flag, text in flag_help:
        cli.add_argument(flag, action="store_true", help=text)
    opts = cli.parse_args()

    if opts.scrape:
        run_all_spiders()
        return
    if opts.cleanup:
        run_cleanup()
        return
    if opts.notify or opts.dry_run:
        # --dry-run on its own also selects the notify path.
        run_notify(dry_run=opts.dry_run)
        return
    if opts.schedule:
        run_schedule()
        return
    full_pipeline()
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()