Skip to content

Instantly share code, notes, and snippets.

@bencharb
Last active December 10, 2023 00:53
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save bencharb/13249225206dd2b2979942d98efe87cd to your computer and use it in GitHub Desktop.
Save bencharb/13249225206dd2b2979942d98efe87cd to your computer and use it in GitHub Desktop.
parse bank of america pdf statements
'''
Transform Bank of America PDF text into tables.
Example:
python parse_boa_statements.py test/joint_dump.txt test/pers_dump.txt --output test/combined.csv
Example of original PDF text (copy/pasted from PDF):
MISTER NAME | Account # 999 99999 9999 | November 24, 2015 to December 24, 2015
Your checking account
Page 3 of 4
Deposits and other additions
Date Description Amount
11/30/15 Online Banking transfer from CHK 9999 Confirmation# 0986430911 200.00
12/07/15 MISTEREMPLOYER, DES:PAYROLL ID:00099999000X INDN:MISTER, NAME
ID:000299229294 PPD
918.57
12/22/15 MISTEREMPLOYER, DES:PAYROLL ID: 00099999000X INDN:MISTER, NAME
ID:000299229294 PPD
918.57
Total deposits and other additions $2,037.14
Withdrawals and other subtractions
Date Description Amount
11/30/15 BKOFAMERICA ATM 11/28 #0000000232 WITHDRWL THE MALL ACITY ST -120.00
12/07/15 Online Banking transfer to CHK 9999 Confirmation# 09810423895 -500.00
Total withdrawals and other subtractions -$3,120.00
To help you BALANCE YOUR CHECKING ACCOUNT, visit bankofamerica.com/statementbalance or the Statements and Documents tab
in Online Banking for a printable version of the How to Balance Your Account Worksheet.
Checks
Date Check # Amount Date Check # Amount
10/14/14 833 -183.00 10/24/14 835 -61.00
10/09/14 833 -9.95
Total checks -$253.95
Total # of checks 3
Note your Ending Balance already reflects the subtraction of Service Fees.
To help you BALANCE YOUR CHECKING ACCOUNT, visit bankofamerica.com/statementbalance or the Statements and Documents tab
in Online Banking for a printable version of the How to Balance Your Account Worksheet.
Example output:
Date,Description,Amount,Account
11/30/15 ,Online Banking transfer from CHK 9999 Confirmation# 0986430911,200.00,testaccount
11/30/15 ,BKOFAMERICA ATM 11/28 #0000000232 WITHDRWL THE MALL ACITY ST,-120.00,testaccount
10/14/14 ,833,-183.0,testaccount
10/24/14 ,835,-61.0,testaccount'
'''
import re
import csv
import shutil
# Meh, cant test now, hard to assert equal.
# def test_transform():
# test_data = '''MISTER NAME | Account # 999 99999 9999 | November 24, 2015 to December 24, 2015
# Your checking account
# Page 3 of 4
# Deposits and other additions
# Date Description Amount
# 11/30/15 Online Banking transfer from CHK 9999 Confirmation# 0986430911 200.00
# 12/07/15 MISTEREMPLOYER, DES:PAYROLL ID:00099999000X INDN:MISTER, NAME
# ID:000299229294 PPD
# 918.57
# 12/22/15 MISTEREMPLOYER, DES:PAYROLL ID: 00099999000X INDN:MISTER, NAME
# ID:000299229294 PPD
# 918.57
# Total deposits and other additions $2,037.14
# Withdrawals and other subtractions
# Date Description Amount
# 11/30/15 BKOFAMERICA ATM 11/28 #0000000232 WITHDRWL THE MALL ACITY ST -120.00
# 12/07/15 Online Banking transfer to CHK 9999 Confirmation# 09810423895 -500.00
# Total withdrawals and other subtractions -$3,120.00
# To help you BALANCE YOUR CHECKING ACCOUNT, visit bankofamerica.com/statementbalance or the Statements and Documents tab
# in Online Banking for a printable version of the How to Balance Your Account Worksheet.
# Checks
# Date Check # Amount Date Check # Amount
# 10/14/14 833 -183.00 10/24/14 835 -61.00
# 10/09/14 833 -9.95
# Total checks -$253.95
# Total # of checks 3
# Note your Ending Balance already reflects the subtraction of Service Fees.
# To help you BALANCE YOUR CHECKING ACCOUNT, visit bankofamerica.com/statementbalance or the Statements and Documents tab
# in Online Banking for a printable version of the How to Balance Your Account Worksheet.'''
# expected = '''Date,Description,Amount,Account
# 11/30/15 ,Online Banking transfer from CHK 9999 Confirmation# 0986430911,200.00,testaccount
# 11/30/15 ,BKOFAMERICA ATM 11/28 #0000000232 WITHDRWL THE MALL ALEXANDRIA VA,-120.00,testaccount
# 10/14/14 ,833,-183.0,testaccount
# 10/24/14 ,835,-61.0,testaccount'''
# # import tempfile
# # tmp_path = os.path.join(tempfile.mkdtemp(),'testaccount')
# # tmp_path_output = tmp_path+'_final_output.csv'
# # tmp_path_output_expected = tmp_path+'_final_output_expected.csv'
# # with open(tmp_path, 'w') as fout:
# # fout.write(test_data)
# # columns = ('Date', 'Description', 'Amount', 'Account',)
# # rowcounts = main([tmp_path], tmp_path_output, columns=columns)
# # with open(tmp_path_output_expected, 'w') as fout:
# # writer = csv.DictWriter(fout, columns)
# # writer.writeheader()
# # writer.writerows(expected.split('\n'))
# # with open(tmp_path_output, 'r') as fin:
# # orig_reader = csv.reader(fin)
# # orig_data = [l for l in orig_reader]
# # with open(tmp_path_output_expected, 'r') as fin:
# # expected_reader = csv.reader(fin)
# # expected_data = [l for l in expected_reader]
# # print 'expected_data', expected_data
# # print 'orig_data', orig_data
# # assert expected_data == orig_data
# # # try:
# # # except AssertionError:
# # # print 'tmp_path_output_expected', tmp_path_output_expected
# # # print 'tmp_path_output', tmp_path_output
# # # raise
def is_true(val):
return unicode(val).lower() in ['true', 'yes']
def validate_line(l):
return unicode(l[0]).isnumeric() and '/' in l.split()[0]
def validate_header_line(l):
ok_len = len(l) < len('Date Check # Amount Date Check # Amount') + 10
valid = ok_len and l.startswith('Date')
return valid
def validate_data(data):
if isinstance(data, basestring):
working_data = data.split('\n')
else:
working_data = data
invalids = []
for line in working_data:
if not validate_line(line):
if not validate_header_line(line):
invalids.append(line)
if invalids:
raise Exception('Invalid lines %s' % '\n'.join(invalids))
return True
def line_to_dict(line):
parts = re.findall(r'(\d+/\d+/\d+)\s(.*)\s(.*\d+)$', line)
if parts:
date, des, amt = parts[0]
return {'Date':date, 'Description':des, 'Amount':amt}
boa_rx = re.compile('\d\d\/\d\d\/\d\d(?:.(?!\nTotal)(?!\d\d\/\d\d\/\d\d)(?!\n\d\d\/\d\d\/\d\d)|\n)+\.\d\d', re.IGNORECASE)
def parse_boa_dump(stringdata):
records = boa_rx.findall(stringdata)
records = [r.replace('\n', ' ') for r in records]
return records
class TransformedStatement(object):
def __init__(self, dump_path, account='unknown'):
super(TransformedStatement, self).__init__()
self.dump_path = dump_path
self.account = account
def write(self, out_path, columns, mode='a'):
fullpath = os.path.abspath(out_path)
with open(out_path, mode) as fout:
writer = csv.DictWriter(fout, columns)
with open(self.dump_path, 'r') as fin:
data = []
def add_account(items, account_name):
for d in items:
d['Account'] = account_name
# remove line endings
file_data = fin.read()
parsed_lines = parse_boa_dump(file_data)
joined_dict = map(line_to_dict, parsed_lines)
add_account(joined_dict, self.account)
writer.writerows(joined_dict)
return len(joined_dict)
def transform_dump(transformed_statements,
columns=None,
out_path='transformed_statements.csv',
override_existing_file=False):
out_path = os.path.abspath(out_path)
exists = os.path.exists(out_path)
if not override_existing_file and exists:
raise Exception('File exists: %s' % out_path)
if isinstance(transformed_statements, TransformedStatement):
transformed_statements = [transformed_statements]
else:
for ts in transformed_statements:
if not isinstance(ts, TransformedStatement):
raise TypeError('%s not of type TransformedStatement' % ts)
out_path_original = out_path
out_path_temp = out_path+'.temp'
with open(out_path_temp, 'w') as fout:
writer = csv.DictWriter(fout, columns)
writer.writeheader()
rowcounts = []
for stmt in transformed_statements:
rowcounts.append(stmt.write(out_path_temp, columns, mode='a'))
shutil.move(out_path_temp, out_path)
return rowcounts
def main(files, output_file, columns=None, override_existing_file=True):
basenames = [os.path.basename(os.path.splitext(f)[0]) for f in files]
source_paths = zip(files,basenames)
stmts = [TransformedStatement(path, account=act) for path, act in source_paths]
rowcounts = transform_dump(stmts, columns=columns, out_path=output_file, override_existing_file=override_existing_file)
return rowcounts
if __name__ == '__main__':
import sys
import argparse
import os
description='Parse copy/pasted text from BofA statements. \nPrerequisites: \n(1) Download BofA pdfs. \n(2) For each PDF, keep only pages with tables on them; trash all other pages without tables. \n(3) For each PDF, copy/paste PDF text content from each statement into one text file. Just Cmd+a/Ctl+a > copy and paste into a file for the account name, such as "checking6443.txt"'
description+='\nExample: \npython parse_boa_statements.py test/joint_dump.txt test/pers_dump.txt --output test/combined.csv\n\n'
import textwrap
description = '\n'.join(['\n'.join(textwrap.wrap(line, 100)) for line in description.split('\n')])
parser = argparse.ArgumentParser(description)
parser.add_argument('file', nargs='*', help='Separate multiple files with a comma. Paths to the files you created for each account. Each file should contain content for only one account. Generate the content for each file by copy/pasting the text from the PDFs into one file. Detailed description elsewhere in this help file.')
parser.add_argument('--output', help='Output path', default='combined_statements.csv')
parser.add_argument('--override_output', help='Override existing output file if it exists', action='store_true', default=True)
if len(sys.argv)==1:
parser.print_help()
sys.exit(1)
args = parser.parse_args()
# files = args.files.split(',')
files = args.file #.split(',')
out_path = args.output
override_existing_file = is_true(args.override_output)
columns = ('Date', 'Description', 'Amount', 'Account',)
rowcounts = main(files, out_path, columns=columns, override_existing_file=override_existing_file)
if not rowcounts:
message = 'No rows transformed to file %s' % out_path
else:
message = '%d rows from %d files transformed to file %s' % (sum(rowcounts),len(rowcounts), out_path,)
print message
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment