mzhukovs/UOB_BankStatement_CASA_PDF_Parser.py

## UOB_BankStatement_CASA_PDF_Parser.py
# Parses UOB CASA eStatements transaction data

#! first download tika-server from http://www.apache.org/dyn/closer.cgi/tika/tika-server-1.18.jar and then go to cmd:
#* cd C:\Users\YourUserName\Documents\Python Scripts\_util # or wherever you installed it
#* java -jar tika-server-1.18.jar --port 1111

#region Imports
import time
import datetime
import pandas as pd
import regex as re
import glob

import tika
from tika import parser
#endregion

#region Settings
tika.TikaClientOnly = True
tikaServerPath = "http://localhost:2929/"
rootDir = "eStatements"
subDir = "Uniplus"
fileNameStartsWith = "eStatement"
amountsPlaceholder = "numstart"
#endregion

#region UDFs
def getDescription(stringy):

    lines = stringy[0:stringy.find(amountsPlaceholder)].split('\n')
    note = lines[0]

    if (note[0:4] == 'Cash'):
        note = 'Cash'
    elif (note.find('Cheque Deposit') > -1):
        note = 'Cheque Deposit'
    elif (note.find('Interest Credit') > -1):
        note = 'Interest Credit'

    if (len(lines) == 1):
        return (note, '')

    # remove the note now that we have it captured
    del lines[0]

    # only applies if you have USD transactions; append original rate to note description
    if (lines[0][0:3] == 'USD'):
        note += ' ' + lines[0]
        del lines[0]

    if (bool(re.match(transactionRowPattern, lines[0]))):
        del lines[0]

    # get rid of [sometimes trailing] transaction number that sometimes appears on last line (optional)
    #if (lines[-1].isdigit()):
    #    del lines[-1]

    recipient = ''.join(lines)

    return (note, recipient)

def convertToFloat(stringy):
    try:
        floatie = float(stringy[stringy.find(amountsPlaceholder)+8:].split(' ')[-1].replace(',', ''))
    except ValueError:
        return None
    else:
        return floatie
#endregion

# pull in PDF files
pdf_files = glob.glob(f"{rootDir}/{subDir}/{fileNameStartsWith}*.pdf")

# declare patterns used for regex splitting/matching
transactionRowPDelimiters = "  \n\n|SGD \n\n"  # all transaction rows will be split by 2 spaces followed by 2 newlines EXCEPT the FIRST ones on a page will only have 1 space and be preceded by SGD
transactionRowPattern = re.compile("\\d{2}\\s\\w{3}\\s") #e.g. 13 Mar
statementYearPattern = re.compile("\nPeriod: .*\n{2}") #Period: 01 Mar 2015 to 31 Mar 2015

# initialize empty list to house our final results
transactionList = []

for pdf_file in pdf_files:

    raw = parser.from_file(pdf_file, tikaServerPath)
    text = raw['content'] #.encode("utf-8")

    year = re.findall(statementYearPattern, text)[0].replace('\n\n', '')[-4:] #! note that match only checks as of the BEGINNING of the string, so need to use search or findall
    transactions = pd.DataFrame(re.split(transactionRowPDelimiters, text))
    transactions.columns = ['rawRow']
    transactions['Year'] = year

    # filter on actual transaction rows
    transactions = transactions[transactions.rawRow.apply(lambda x: bool(re.match(transactionRowPattern, x)))]

    #region Parse Transaction Row Data into Fields

    transactions['MonthDay'] = transactions.rawRow.apply(lambda x: x[0:6])
    transactions['Date'] = transactions.MonthDay + ' ' + transactions.Year
    transactions['rawRow'] = transactions.rawRow.apply(lambda x: x[7:].replace('\n\n', amountsPlaceholder))

    transactions['Description'] = transactions.rawRow.apply(lambda x: getDescription(x))
    transactions['Note'] = transactions.Description.apply(lambda x: x[0])
    transactions['Payee'] = transactions.Description.apply(lambda x: x[1])

    transactions['Balance'] = transactions.rawRow.apply(lambda x: convertToFloat(x))

    transactions.reset_index(drop=True, inplace=True)

    statementBegBalance = transactions.iloc[0].Balance # first row is starting balance
    transactions.drop(transactions.index[0], inplace=True) # drop it after capturing, note NOT re-setting the index so now it starts at 1

    # calculate transaction amount
    transactions['Amount'] = 0.00
    transactions['Amount'][1] = transactions['Balance'][1] - statementBegBalance

    for i in range(2, len(transactions) + 1):
        transactions.Amount[i] = transactions.Balance[i] - transactions.Balance[i-1]

    # drop stuff we don't need in the output
    del transactions['rawRow']
    del transactions['Year']
    del transactions['MonthDay']
    del transactions['Description']

    # append to master list
    transactionList.append(transactions)

    #endregion

#region Output to CSV

transactions = pd.concat(transactionList)

transactions['Date'] = pd.to_datetime(transactions.Date)
transactions = transactions.reset_index(drop=True)
transactions['Indexx'] = transactions.index
transactions.sort_values(by=['Date', 'Indexx'], inplace=True)
del transactions['Indexx']

ts = time.time()
st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d-%H.%M.%S')
transactions.to_csv('output' + st + '.csv', index=False)
print('Success!')
#endregion
	# Parses UOB CASA eStatements transaction data

	#! first download tika-server from http://www.apache.org/dyn/closer.cgi/tika/tika-server-1.18.jar and then go to cmd:
	#* cd C:\Users\YourUserName\Documents\Python Scripts\_util # or wherever you installed it
	#* java -jar tika-server-1.18.jar --port 1111

	#region Imports
	import time
	import datetime
	import pandas as pd
	import regex as re
	import glob

	import tika
	from tika import parser
	#endregion

	#region Settings
	tika.TikaClientOnly = True
	tikaServerPath = "http://localhost:2929/"
	rootDir = "eStatements"
	subDir = "Uniplus"
	fileNameStartsWith = "eStatement"
	amountsPlaceholder = "numstart"
	#endregion

	#region UDFs
	def getDescription(stringy):

	lines = stringy[0:stringy.find(amountsPlaceholder)].split('\n')
	note = lines[0]

	if (note[0:4] == 'Cash'):
	note = 'Cash'
	elif (note.find('Cheque Deposit') > -1):
	note = 'Cheque Deposit'
	elif (note.find('Interest Credit') > -1):
	note = 'Interest Credit'

	if (len(lines) == 1):
	return (note, '')

	# remove the note now that we have it captured
	del lines[0]

	# only applies if you have USD transactions; append original rate to note description
	if (lines[0][0:3] == 'USD'):
	note += ' ' + lines[0]
	del lines[0]

	if (bool(re.match(transactionRowPattern, lines[0]))):
	del lines[0]

	# get rid of [sometimes trailing] transaction number that sometimes appears on last line (optional)
	#if (lines[-1].isdigit()):
	# del lines[-1]

	recipient = ''.join(lines)

	return (note, recipient)

	def convertToFloat(stringy):
	try:
	floatie = float(stringy[stringy.find(amountsPlaceholder)+8:].split(' ')[-1].replace(',', ''))
	except ValueError:
	return None
	else:
	return floatie
	#endregion

	# pull in PDF files
	pdf_files = glob.glob(f"{rootDir}/{subDir}/{fileNameStartsWith}*.pdf")

	# declare patterns used for regex splitting/matching
	transactionRowPDelimiters = " \n\n\|SGD \n\n" # all transaction rows will be split by 2 spaces followed by 2 newlines EXCEPT the FIRST ones on a page will only have 1 space and be preceded by SGD
	transactionRowPattern = re.compile("\\d{2}\\s\\w{3}\\s") #e.g. 13 Mar
	statementYearPattern = re.compile("\nPeriod: .*\n{2}") #Period: 01 Mar 2015 to 31 Mar 2015

	# initialize empty list to house our final results
	transactionList = []

	for pdf_file in pdf_files:

	raw = parser.from_file(pdf_file, tikaServerPath)
	text = raw['content'] #.encode("utf-8")

	year = re.findall(statementYearPattern, text)[0].replace('\n\n', '')[-4:] #! note that match only checks as of the BEGINNING of the string, so need to use search or findall
	transactions = pd.DataFrame(re.split(transactionRowPDelimiters, text))
	transactions.columns = ['rawRow']
	transactions['Year'] = year

	# filter on actual transaction rows
	transactions = transactions[transactions.rawRow.apply(lambda x: bool(re.match(transactionRowPattern, x)))]

	#region Parse Transaction Row Data into Fields

	transactions['MonthDay'] = transactions.rawRow.apply(lambda x: x[0:6])
	transactions['Date'] = transactions.MonthDay + ' ' + transactions.Year
	transactions['rawRow'] = transactions.rawRow.apply(lambda x: x[7:].replace('\n\n', amountsPlaceholder))

	transactions['Description'] = transactions.rawRow.apply(lambda x: getDescription(x))
	transactions['Note'] = transactions.Description.apply(lambda x: x[0])
	transactions['Payee'] = transactions.Description.apply(lambda x: x[1])

	transactions['Balance'] = transactions.rawRow.apply(lambda x: convertToFloat(x))

	transactions.reset_index(drop=True, inplace=True)

	statementBegBalance = transactions.iloc[0].Balance # first row is starting balance
	transactions.drop(transactions.index[0], inplace=True) # drop it after capturing, note NOT re-setting the index so now it starts at 1

	# calculate transaction amount
	transactions['Amount'] = 0.00
	transactions['Amount'][1] = transactions['Balance'][1] - statementBegBalance

	for i in range(2, len(transactions) + 1):
	transactions.Amount[i] = transactions.Balance[i] - transactions.Balance[i-1]

	# drop stuff we don't need in the output
	del transactions['rawRow']
	del transactions['Year']
	del transactions['MonthDay']
	del transactions['Description']

	# append to master list
	transactionList.append(transactions)

	#endregion

	#region Output to CSV

	transactions = pd.concat(transactionList)

	transactions['Date'] = pd.to_datetime(transactions.Date)
	transactions = transactions.reset_index(drop=True)
	transactions['Indexx'] = transactions.index
	transactions.sort_values(by=['Date', 'Indexx'], inplace=True)
	del transactions['Indexx']

	ts = time.time()
	st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d-%H.%M.%S')
	transactions.to_csv('output' + st + '.csv', index=False)
	print('Success!')
	#endregion