Foremind/data_loader.py at main · AagmanS/Foremind · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import pandas as pd
import os

class DataLoader:
    """
    Loads a sales time series from a CSV or Excel file and normalises it.

    CSV files that look like the Walmart dataset (they carry a
    ``Weekly_Sales`` column) are mapped onto the Voltacore schema listed in
    ``required_columns``; columns that dataset lacks are filled with
    synthetic placeholder values.
    """
    def __init__(self, file_path):
        # Path to the input file; a str or any os.PathLike is accepted.
        self.file_path = file_path
        # Columns that must be present in the frame returned by load_data().
        self.required_columns = [
            'Date',
            'Weekly_Sales_Units',
            'Unit_Price_USD',
            'Unit_Cost_USD',
            'Current_Inventory_Units',
            'Transport_Distance_km',
            'Fuel_Price_Index',
            'Holiday_Flag',
            'Promotion_Flag'
        ]

    @staticmethod
    def _adapt_walmart_schema(df):
        """Rename Walmart columns to the Voltacore schema and synthesise the rest."""
        df = df.rename(columns={
            'Weekly_Sales': 'Weekly_Sales_Units',
            'Fuel_Price': 'Fuel_Price_Index'
        })

        # Synthesize missing columns (placeholders, not real measurements)
        if 'Unit_Price_USD' not in df.columns:
            df['Unit_Price_USD'] = 50.0 # Synthetic constant
        if 'Unit_Cost_USD' not in df.columns:
            df['Unit_Cost_USD'] = 30.0 # Synthetic constant
        if 'Current_Inventory_Units' not in df.columns:
            # Assume inventory is slightly higher than sales
            df['Current_Inventory_Units'] = df['Weekly_Sales_Units'] * 1.1
        if 'Transport_Distance_km' not in df.columns:
            df['Transport_Distance_km'] = 500.0
        if 'Promotion_Flag' not in df.columns:
            df['Promotion_Flag'] = 0 # Default to 0

        # Fix Date format (dd-mm-yyyy usually in this dataset)
        df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
        return df

    def load_data(self):
        """
        Loads data from the Excel or CSV file.

        A ``.csv`` extension (matched case-insensitively) selects the CSV
        reader; every other extension is handed to ``pd.read_excel``. For CSV
        input, rows are restricted to ``Store == 1`` when a ``Store`` column
        exists, and the Walmart layout is adapted to the Voltacore schema.

        Returns:
            pd.DataFrame: Loaded data, sorted by ``Date`` with a fresh index.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            Exception: For any other failure (unreadable file, missing
                required columns, unparseable dates). The original error is
                attached as ``__cause__``.
        """
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(f"File not found: {self.file_path}")

        try:
            # Normalise to str so pathlib.Path inputs work, and compare the
            # extension case-insensitively so "DATA.CSV" is read as CSV.
            path_text = os.fspath(self.file_path)
            is_csv = os.path.splitext(path_text)[1].lower() == '.csv'

            if is_csv:
                df = pd.read_csv(self.file_path)

                # Filter for Store 1 only to ensure single time-series continuity
                if 'Store' in df.columns:
                    print("Filtering for Store 1 to maintain time-series integrity...")
                    df = df[df['Store'] == 1].copy()

                # Map Walmart columns to Voltacore schema if it is the Walmart dataset
                if 'Weekly_Sales' in df.columns:
                    print("Detected Walmart dataset. Synthesising missing Voltacore columns...")
                    df = self._adapt_walmart_schema(df)
            else:
                df = pd.read_excel(self.file_path)

            # Check for missing columns (after synthesis)
            missing_cols = [col for col in self.required_columns if col not in df.columns]
            if missing_cols:
                raise ValueError(f"Missing required columns: {missing_cols}")

            # Ensure Date is datetime (if not already handled)
            if not pd.api.types.is_datetime64_any_dtype(df['Date']):
                df['Date'] = pd.to_datetime(df['Date'])

            # Sort by date just in case
            df = df.sort_values('Date').reset_index(drop=True)

            return df
        except Exception as e:
            # Keep the generic wrapper callers may already catch, but chain
            # the root cause explicitly so the real traceback is preserved.
            raise Exception(f"Error loading data: {str(e)}") from e