forked from ayushedu/creditcard_analyzer
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparser.py
More file actions
148 lines (119 loc) · 4.8 KB
/
parser.py
File metadata and controls
148 lines (119 loc) · 4.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 3 18:59:05 2017
@author: Ayush Vatsyayan
"""
import os
import sys
import pyparsing as pp
import re
import pandas as pd
import locale
import ConfigParser
#==============================================================================
# Convert PDF to text using Apache PDFBox
#==============================================================================
def pdf_to_text():
global config
# Prepare command
cmd = config.get("PDF","PDFBOX_COMMAND") + " " + config.get("PDF","PASSWORD")
cmd += " " + config.get("PDF","PDF_FILE_PATH") + " tmp.txt"
# Execute command
resp = os.system(cmd)
if resp != 0:
print "Error converting PDF to txt"
sys.exit(resp)
# Read data
textfile = open("tmp.txt")
lines = textfile.readlines()
textfile.close() #close file
# Remove tmp.txt
os.remove("tmp.txt")
return lines
#==============================================================================
# Initialize
#==============================================================================
def init():
#Read configuration
global config
config = ConfigParser.ConfigParser()
config.read("config.ini")
# Pattern to parse the statement
transaction_id = pp.Word(pp.nums) + pp.Suppress(pp.White())
balance = pp.Combine(pp.Word(pp.nums + ",") + "." + pp.Word(pp.nums, exact=2) + pp.Optional('CR'))
date_pattern = pp.Word(pp.nums, exact=6)
merchant = pp.restOfLine()
global transactional_pattern
global non_transactional_pattern
transactional_pattern = transaction_id + balance + date_pattern + merchant
non_transactional_pattern = balance + date_pattern + merchant
#==============================================================================
# Parse records that have transaction id
#==============================================================================
def parse_transactions(lines):
global transactional_pattern
global non_transactional_pattern
# Initialize dictionary
stmt_dict = {'transaction_id':[], 'balance':[], 'date':[],'merchant':[] }
for line in lines:
try:
result = pp.OneOrMore(transactional_pattern).parseString(line)
# Append result to dictionary
stmt_dict['transaction_id'].append(result[0])
stmt_dict['balance'].append(result[1])
stmt_dict['date'].append(result[2])
stmt_dict['merchant'].append(result[3])
except pp.ParseException:
# In case of exception parse Non-transactional
try:
result = non_transactional_pattern.parseString(line)
stmt_dict['transaction_id'].append('')
stmt_dict['balance'].append(result[0])
stmt_dict['date'].append(result[1])
stmt_dict['merchant'].append(result[2])
except pp.ParseException:
# Skip, as lines aren't valid
pass
# Create DataFrame
stmt_df = pd.DataFrame(stmt_dict)
# Convert CR to negative balance
stmt_df.balance = [ '-' + v.replace('CR','').strip() if v.endswith('CR') else v.strip() for v in stmt_df.balance]
# Remove extra spaces from merchant
stmt_df.merchant = [re.sub("\s\s+" , " ",v) for v in stmt_df.merchant]
# Change balance to float
locale.setlocale(locale.LC_ALL,'en_US.UTF-8')
#locale.setlocale(locale.LC_NUMERIC, '')
stmt_df.balance = stmt_df.balance.apply(locale.atof)
# Arrange column order
stmt_df = stmt_df[['date','transaction_id','merchant','balance']]
return stmt_df
#==============================================================================
# Write the results to Excel
#==============================================================================
def write_excel(stmt_df):
# Group the data
by_merchant = stmt_df.groupby(["merchant","date"])
summary_df = by_merchant.sum()
# Write result
global config
output_file_name = config.get("OUTPUT","FILE_NAME_PREFIX") + "_" +str(stmt_df.date[0][2:]) + ".xlsx"
writer = pd.ExcelWriter(output_file_name,engine='xlsxwriter')
stmt_df.to_excel(writer, sheet_name='detailed')
summary_df.to_excel(writer, sheet_name='summary')
writer.save()
print "Results written successfully as", output_file_name
#==============================================================================
# Main function
#==============================================================================
if __name__ == '__main__':
# initialize
print ("Initializing1..")
init()
# read data
print("Reading PDF...")
lines = pdf_to_text()
# Parse data
print("Parsing data...")
stmt_df = parse_transactions(lines)
# write results
write_excel(stmt_df)