-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_processing.py
More file actions
88 lines (71 loc) · 3.63 KB
/
data_processing.py
File metadata and controls
88 lines (71 loc) · 3.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import pandas as pd
from utils import parse_date, normalize_source
from transaction import Transaction
import logging
# Configure root logging once at import time: timestamped INFO-level records.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger named after this module (standard logging convention).
logger = logging.getLogger(__name__)
def safe_convert_to_string(value):
    """Best-effort conversion of *value* to ``str``.

    NaN/None-like scalars (per ``pd.isna``) become ``''``. If the check or
    the conversion itself raises, a warning is logged and ``''`` is
    returned instead of propagating the error.
    """
    try:
        return '' if pd.isna(value) else str(value)
    except Exception as e:
        logger.warning(f"Error converting value to string: {e}")
        return ''
def load_and_preprocess_data(
    file_path_train='/Users/rushant/SaaSDen/Categorization/First Attempt/PPX/new_general_ledger_refined.xlsx',
    file_path_test='/Users/rushant/SaaSDen/Categorization/First Attempt/PPX/new_rezults_ppx 2.xlsx',
):
    """Load the train/test ledgers from Excel and normalize them.

    Parameters
    ----------
    file_path_train, file_path_test : str
        Paths to the Excel workbooks. They default to the original
        hard-coded locations, so existing zero-argument callers keep
        working while other environments can pass their own paths.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(train_data_cleaned, test_data_cleaned)`` with harmonized column
        names, parsed dates, string-coerced text columns, and a
        ``normalized_source`` column on both frames. Only the training
        frame gains a ``cleaned_memo`` column.
    """
    logger.info("Loading data from Excel files")
    # read_excel opens and closes the workbook itself; the previous
    # pd.ExcelFile(...).parse(0) left the file handle open.
    train_data = pd.read_excel(file_path_train, sheet_name=0)
    test_data = pd.read_excel(file_path_test, sheet_name=0)

    for df in (train_data, test_data):
        # NOTE(review): assumes a 'date' column exists in both sheets — confirm.
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        # Harmonize column names across the two workbook schemas.
        if 'description' in df.columns:
            df.rename(columns={'description': 'memo'}, inplace=True)
        if 'Memo' in df.columns and 'memo' not in df.columns:
            df.rename(columns={'Memo': 'memo'}, inplace=True)
        if 'Name' in df.columns and 'name' not in df.columns:
            df.rename(columns={'Name': 'name'}, inplace=True)
        # Guarantee the optional label columns exist.
        for optional_col in ('purpose', 'generated_name'):
            if optional_col not in df.columns:
                df[optional_col] = ''
        # Convert relevant columns to strings (NaN -> '').
        for col in ('name', 'memo', 'purpose', 'generated_name', 'source'):
            if col in df.columns:
                df[col] = df[col].apply(safe_convert_to_string)

    logger.info("Cleaning and preprocessing data")
    fill_defaults = {
        'memo': '', 'name': '', 'amount': 0, 'purpose': '', 'generated_name': ''
    }
    train_data_cleaned = train_data.fillna(fill_defaults)
    test_data_cleaned = test_data.fillna(fill_defaults)

    train_data_cleaned['date'] = train_data_cleaned['date'].apply(parse_date)
    test_data_cleaned['date'] = test_data_cleaned['date'].apply(parse_date)

    # Only the training frame gets a cleaned memo (Transaction performs the
    # cleaning); downstream group_transactions() tolerates its absence.
    train_data_cleaned['cleaned_memo'] = train_data_cleaned.apply(
        lambda row: Transaction(**row.to_dict()).cleaned_memo, axis=1
    )
    train_data_cleaned['normalized_source'] = train_data_cleaned['source'].apply(normalize_source)
    test_data_cleaned['normalized_source'] = test_data_cleaned['source'].apply(normalize_source)

    logger.info(f"Preprocessed data: {len(train_data_cleaned)} train samples, {len(test_data_cleaned)} test samples")
    return train_data_cleaned, test_data_cleaned
def group_transactions(data):
    """Select the grouping-relevant transaction columns from *data*.

    Parameters
    ----------
    data : pd.DataFrame
        Frame assumed to contain at least the core columns
        ``transaction_type, source, name, amount, date, purpose,
        generated_name`` (KeyError otherwise); ``memo``/``description``,
        ``split`` and ``cleaned_memo`` are picked up when present.

    Returns
    -------
    pd.DataFrame
        A copy restricted to those columns, with ``description`` renamed to
        ``memo`` and ``cleaned_memo`` used to backfill a missing or
        all-NaN ``memo`` column.
    """
    columns_to_use = [
        'transaction_type', 'source', 'name', 'amount', 'date',
        'purpose', 'generated_name'
    ]
    if 'memo' in data.columns:
        columns_to_use.append('memo')
    elif 'description' in data.columns:
        columns_to_use.append('description')
    if 'split' in data.columns:
        columns_to_use.append('split')
    if 'cleaned_memo' in data.columns:
        columns_to_use.append('cleaned_memo')

    filtered_data = data[columns_to_use].copy()
    if 'description' in filtered_data.columns and 'memo' not in filtered_data.columns:
        filtered_data.rename(columns={'description': 'memo'}, inplace=True)

    # Bug fix: the original accessed filtered_data['memo'] unconditionally,
    # raising KeyError when the input had cleaned_memo but neither memo nor
    # description. Now cleaned_memo also backfills a missing memo column.
    if 'cleaned_memo' in filtered_data.columns and (
        'memo' not in filtered_data.columns or filtered_data['memo'].isna().all()
    ):
        filtered_data['memo'] = filtered_data['cleaned_memo']

    logger.info(f"Grouped transactions: {len(filtered_data)} rows, {len(columns_to_use)} columns")
    return filtered_data