import io
import logging
import re
import ssl
import time
import urllib.request
from urllib.error import URLError

import pandas as pd

from timeSpace import PROJECT_ROOT

logger = logging.getLogger(__name__)


def get_latest_filename(spreadsheet_id, data_name):
    """Find the most recent cached CSV for a given Google Sheet.

    Cache files are named ``{data_name}___{unix_timestamp}___{spreadsheet_id}.csv``.

    Parameters
    ----------
    spreadsheet_id : str
        Google Sheets spreadsheet ID.
    data_name : str
        Human-readable dataset name (e.g. "processes").

    Returns
    -------
    str
        Filename of the most recent cache file.
    """
    data_dir = PROJECT_ROOT / "data" / "datasets"
    all_filenames = [f.stem for f in data_dir.glob("*.csv")]
    # Each cache stem splits into [data_name, unix_timestamp, spreadsheet_id].
    data_files = [x.split("___") for x in all_filenames if len(x.split("___")) > 2]
    times = [x[1] for x in data_files if x[2] == spreadsheet_id]
    # max() raises ValueError when no matching cache exists; the caller
    # treats that as a cache miss.
    max_time = max(times, key=int)
    return f"{data_name}___{max_time}___{spreadsheet_id}.csv"


def extract_google_sheet(
    spreadsheet_id, data_name, from_cache=True, cache=False, sheet_id=0, **kwargs
):
    """Fetch a Google Sheets spreadsheet as a pandas DataFrame.

    Tries the local CSV cache (``data/datasets/``) first, then falls back
    to the Google Sheets CSV export URL. Optionally caches the downloaded
    data for future use.

    Parameters
    ----------
    spreadsheet_id : str
        Google Sheets spreadsheet ID (the part between /d/ and /edit in the URL).
    data_name : str
        Human-readable name for the dataset (used in cache filenames).
    from_cache : bool
        If True (default), try to load from the local cache before downloading.
    cache : bool
        If True, save downloaded data to the cache directory.
    sheet_id : int
        Sheet GID for multi-sheet workbooks. Default 0 (first sheet).
    **kwargs
        Passed through to ``pd.read_csv()``.

    Returns
    -------
    DataFrame
    """
    # Guard against URL injection: a valid ID is alphanumeric plus - and _.
    if not re.match(r"^[a-zA-Z0-9_-]+$", str(spreadsheet_id)):
        raise ValueError(f"Invalid spreadsheet_id: {spreadsheet_id!r}")
    data_dir = PROJECT_ROOT / "data" / "datasets"
    if from_cache:
        try:
            filename = get_latest_filename(spreadsheet_id, data_name)
            file_path = data_dir / filename
            df = pd.read_csv(file_path, **kwargs)
            logger.info("Pulling data from cache: %s", file_path)
        except (FileNotFoundError, ValueError, KeyError) as e:
            # A missing file or an empty timestamp list means no usable cache.
            logger.info(
                "No cache available for %s (%s): %s; pulling from source",
                data_name, spreadsheet_id, e,
            )
            df = extract_google_sheet(
                spreadsheet_id, data_name, from_cache=False,
                sheet_id=sheet_id, cache=cache, **kwargs
            )
    else:
        if sheet_id == 0:
            url = f"https://docs.google.com/spreadsheets/d/{spreadsheet_id}/export?format=csv"
        else:
            url = f"https://docs.google.com/spreadsheets/d/{spreadsheet_id}/export?gid={sheet_id}&format=csv"
        try:
            df = pd.read_csv(url, **kwargs)
        except URLError:
            # SSL verification can fail when the Python install lacks an
            # up-to-date CA bundle; retry with certifi's certificates.
            import certifi

            ctx = ssl.create_default_context(cafile=certifi.where())
            with urllib.request.urlopen(url, context=ctx) as response:
                df = pd.read_csv(io.BytesIO(response.read()), **kwargs)
        logger.info("Data for %s pulled from: %s", data_name, url)
    if cache:
        data_dir.mkdir(parents=True, exist_ok=True)
        filename = f"{data_name}___{int(time.time())}___{spreadsheet_id}.csv"
        file_path = data_dir / filename
        logger.info("Caching raw data to: %s", file_path)
        # index=False keeps a cache round-trip identical to a fresh
        # download (otherwise re-reading adds an "Unnamed: 0" column).
        df.to_csv(file_path, index=False)
    return df
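

if __name__ == "__main__":
    # Minimal usage sketch. The spreadsheet ID below is a hypothetical
    # placeholder; substitute the value between /d/ and /edit in a real
    # sheet's URL.
    logging.basicConfig(level=logging.INFO)
    frame = extract_google_sheet(
        "YOUR_SPREADSHEET_ID",  # placeholder, not a real sheet
        "demo_dataset",
        from_cache=True,
        cache=False,
    )
    print(frame.head())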