-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_loader.py
More file actions
81 lines (69 loc) · 3.34 KB
/
Copy pathdata_loader.py
File metadata and controls
81 lines (69 loc) · 3.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import pandas as pd
import os
class DataLoader:
    """
    Handles loading of data from Excel or CSV files and basic preprocessing.

    A CSV file may either already follow the required schema or be a
    Walmart-style sales file (recognised by a ``Weekly_Sales`` column). In the
    latter case the columns are renamed to the required schema and the columns
    the file lacks are synthesised. Any non-CSV path is read with
    ``pd.read_excel``.

    Args:
        file_path: Path to the ``.csv`` or Excel file (``str`` or path-like).
        store_id: Value of the ``Store`` column to keep when a CSV file
            contains that column. Defaults to 1.
    """

    def __init__(self, file_path, store_id=1):
        self.file_path = file_path
        self.store_id = store_id
        # Columns every loaded frame must contain before it is returned.
        self.required_columns = [
            'Date',
            'Weekly_Sales_Units',
            'Unit_Price_USD',
            'Unit_Cost_USD',
            'Current_Inventory_Units',
            'Transport_Distance_km',
            'Fuel_Price_Index',
            'Holiday_Flag',
            'Promotion_Flag',
        ]

    def load_data(self) -> pd.DataFrame:
        """
        Loads data from the Excel or CSV file.

        Returns:
            pd.DataFrame: Loaded data containing every required column, with
            ``Date`` converted to datetime, sorted by ``Date`` and re-indexed
            from 0.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            Exception: For any failure while reading or validating the file
                (including missing required columns). The message is prefixed
                with ``"Error loading data: "`` and the original exception is
                attached as ``__cause__``.
        """
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(f"File not found: {self.file_path}")

        try:
            if self._is_csv():
                df = self._load_csv()
            else:
                df = pd.read_excel(self.file_path)

            # Check for missing columns (after synthesis)
            missing_cols = [col for col in self.required_columns if col not in df.columns]
            if missing_cols:
                raise ValueError(f"Missing required columns: {missing_cols}")

            # Ensure Date is datetime (if not already handled)
            if not pd.api.types.is_datetime64_any_dtype(df['Date']):
                df['Date'] = pd.to_datetime(df['Date'])

            # Sort by date just in case
            return df.sort_values('Date').reset_index(drop=True)
        except Exception as e:
            # Keep the generic type and message callers already see, but chain
            # the original error so its type and traceback stay inspectable.
            raise Exception(f"Error loading data: {str(e)}") from e

    def _is_csv(self):
        """Return True when the path has a ``.csv`` suffix, ignoring case."""
        # os.fspath lets pathlib.Path objects through; lower() accepts ".CSV".
        return os.fspath(self.file_path).lower().endswith('.csv')

    def _load_csv(self):
        """Read the CSV file, keep one store, and map Walmart columns if present."""
        df = pd.read_csv(self.file_path)

        # Filter for a single store only to ensure single time-series continuity
        if 'Store' in df.columns:
            print(f"Filtering for Store {self.store_id} to maintain time-series integrity...")
            df = df[df['Store'] == self.store_id].copy()

        # Map Walmart columns to Voltacore schema if it is the Walmart dataset
        if 'Weekly_Sales' in df.columns:
            print("Detected Walmart dataset. Synthesising missing Voltacore columns...")
            df = self._apply_walmart_schema(df)

        return df

    @staticmethod
    def _apply_walmart_schema(df):
        """Rename Walmart columns and synthesise the ones the schema needs."""
        df = df.rename(columns={
            'Weekly_Sales': 'Weekly_Sales_Units',
            'Fuel_Price': 'Fuel_Price_Index',
        })

        # Synthesize missing columns (insertion order is kept stable on purpose,
        # since it determines the column order of the returned frame).
        if 'Unit_Price_USD' not in df.columns:
            df['Unit_Price_USD'] = 50.0  # Synthetic constant
        if 'Unit_Cost_USD' not in df.columns:
            df['Unit_Cost_USD'] = 30.0  # Synthetic constant
        if 'Current_Inventory_Units' not in df.columns:
            # Assume inventory is slightly higher than sales
            df['Current_Inventory_Units'] = df['Weekly_Sales_Units'] * 1.1
        if 'Transport_Distance_km' not in df.columns:
            df['Transport_Distance_km'] = 500.0
        if 'Promotion_Flag' not in df.columns:
            df['Promotion_Flag'] = 0  # Default to 0

        # Fix Date format (dd-mm-yyyy usually in this dataset). A missing Date
        # column is left for the required-column check to report clearly.
        if 'Date' in df.columns:
            df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)

        return df