Skip to content

Instantly share code, notes, and snippets.

@mzhukovs
Created September 8, 2018 13:05
Show Gist options
  • Save mzhukovs/c432f9749053debea1b7ae95c601f4fa to your computer and use it in GitHub Desktop.
Save mzhukovs/c432f9749053debea1b7ae95c601f4fa to your computer and use it in GitHub Desktop.
Python Script to Scrape Transaction Records from United Overseas Bank CASA Statement PDFs
# Parses UOB CASA eStatements transaction data
#! first download tika-server from http://www.apache.org/dyn/closer.cgi/tika/tika-server-1.18.jar and then go to cmd:
#* cd C:\Users\YourUserName\Documents\Python Scripts\_util # or wherever you installed it
#* java -jar tika-server-1.18.jar --port 1111
#region Imports
import time
import datetime
import pandas as pd
import regex as re
import glob
import tika
from tika import parser
#endregion
#region Settings
tika.TikaClientOnly = True
tikaServerPath = "http://localhost:2929/"
rootDir = "eStatements"
subDir = "Uniplus"
fileNameStartsWith = "eStatement"
amountsPlaceholder = "numstart"
#endregion
#region UDFs
def getDescription(stringy):
lines = stringy[0:stringy.find(amountsPlaceholder)].split('\n')
note = lines[0]
if (note[0:4] == 'Cash'):
note = 'Cash'
elif (note.find('Cheque Deposit') > -1):
note = 'Cheque Deposit'
elif (note.find('Interest Credit') > -1):
note = 'Interest Credit'
if (len(lines) == 1):
return (note, '')
# remove the note now that we have it captured
del lines[0]
# only applies if you have USD transactions; append original rate to note description
if (lines[0][0:3] == 'USD'):
note += ' ' + lines[0]
del lines[0]
if (bool(re.match(transactionRowPattern, lines[0]))):
del lines[0]
# get rid of [sometimes trailing] transaction number that sometimes appears on last line (optional)
#if (lines[-1].isdigit()):
# del lines[-1]
recipient = ''.join(lines)
return (note, recipient)
def convertToFloat(stringy):
try:
floatie = float(stringy[stringy.find(amountsPlaceholder)+8:].split(' ')[-1].replace(',', ''))
except ValueError:
return None
else:
return floatie
#endregion
# pull in PDF files
pdf_files = glob.glob(f"{rootDir}/{subDir}/{fileNameStartsWith}*.pdf")
# declare patterns used for regex splitting/matching
transactionRowPDelimiters = " \n\n|SGD \n\n" # all transaction rows will be split by 2 spaces followed by 2 newlines EXCEPT the FIRST ones on a page will only have 1 space and be preceded by SGD
transactionRowPattern = re.compile("\\d{2}\\s\\w{3}\\s") #e.g. 13 Mar
statementYearPattern = re.compile("\nPeriod: .*\n{2}") #Period: 01 Mar 2015 to 31 Mar 2015
# initialize empty list to house our final results
transactionList = []
for pdf_file in pdf_files:
raw = parser.from_file(pdf_file, tikaServerPath)
text = raw['content'] #.encode("utf-8")
year = re.findall(statementYearPattern, text)[0].replace('\n\n', '')[-4:] #! note that match only checks as of the BEGINNING of the string, so need to use search or findall
transactions = pd.DataFrame(re.split(transactionRowPDelimiters, text))
transactions.columns = ['rawRow']
transactions['Year'] = year
# filter on actual transaction rows
transactions = transactions[transactions.rawRow.apply(lambda x: bool(re.match(transactionRowPattern, x)))]
#region Parse Transaction Row Data into Fields
transactions['MonthDay'] = transactions.rawRow.apply(lambda x: x[0:6])
transactions['Date'] = transactions.MonthDay + ' ' + transactions.Year
transactions['rawRow'] = transactions.rawRow.apply(lambda x: x[7:].replace('\n\n', amountsPlaceholder))
transactions['Description'] = transactions.rawRow.apply(lambda x: getDescription(x))
transactions['Note'] = transactions.Description.apply(lambda x: x[0])
transactions['Payee'] = transactions.Description.apply(lambda x: x[1])
transactions['Balance'] = transactions.rawRow.apply(lambda x: convertToFloat(x))
transactions.reset_index(drop=True, inplace=True)
statementBegBalance = transactions.iloc[0].Balance # first row is starting balance
transactions.drop(transactions.index[0], inplace=True) # drop it after capturing, note NOT re-setting the index so now it starts at 1
# calculate transaction amount
transactions['Amount'] = 0.00
transactions['Amount'][1] = transactions['Balance'][1] - statementBegBalance
for i in range(2, len(transactions) + 1):
transactions.Amount[i] = transactions.Balance[i] - transactions.Balance[i-1]
# drop stuff we don't need in the output
del transactions['rawRow']
del transactions['Year']
del transactions['MonthDay']
del transactions['Description']
# append to master list
transactionList.append(transactions)
#endregion
#region Output to CSV
transactions = pd.concat(transactionList)
transactions['Date'] = pd.to_datetime(transactions.Date)
transactions = transactions.reset_index(drop=True)
transactions['Indexx'] = transactions.index
transactions.sort_values(by=['Date', 'Indexx'], inplace=True)
del transactions['Indexx']
ts = time.time()
st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d-%H.%M.%S')
transactions.to_csv('output' + st + '.csv', index=False)
print('Success!')
#endregion
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment