the-spectator/requirements.txt

## requirements.txt
camelot-py==0.11.0
cffi==1.16.0
chardet==5.2.0
charset-normalizer==3.3.2
click==8.1.7
cryptography==42.0.5
et-xmlfile==1.1.0
ghostscript==0.7
numpy==1.26.4
opencv-python==4.9.0.80
openpyxl==3.1.2
pandas==2.2.1
pdfminer.six==20231228
pycparser==2.21
pypdf==4.1.0
python-dateutil==2.9.0.post0
pytz==2024.1
six==1.16.0
tabulate==0.9.0
tzdata==2024.1

## statement-to-excel.py
# This script is designed to convert bank statements from pdf to excel.
#
# It has been tweaked on HDFC Bank Credit Card statements,
# but in theory you can use it on any PDF document.
#
# The script depends on camelot-py,
# which can be installed using pip
#
# pip install "camelot-py[cv]"

import os
import argparse
import camelot
import pandas as pd
from collections import defaultdict

def extract_df(path, password=None):
    # The default values from pdfminer are M = 2.0, W = 0.1 and L = 0.5
    laparams = {'char_margin': 2.0, 'word_margin': 0.2, 'line_margin': 1.0}

    # Extract all tables using the lattice algorithm
    lattice_tables = camelot.read_pdf(path, password=password,
        pages='all', flavor='lattice', line_scale=50, layout_kwargs=laparams)

    # Extract bounding boxes
    regions = defaultdict(list)
    for table in lattice_tables:
        bbox = [table._bbox[i] for i in [0, 3, 2, 1]]
        regions[table.page].append(bbox)

    df = pd.DataFrame()

    # Extract tables using the stream algorithm
    lst = []
    for page, boxes in regions.items():
        areas = [','.join([str(int(x)) for x in box]) for box in boxes]
        stream_tables = camelot.read_pdf(path, password=password, pages=str(page),
            flavor='stream', table_areas=areas, row_tol=5, layout_kwargs=laparams)
        dataframes = [table.df for table in stream_tables]
        dataframes = pd.concat(dataframes)
        lst.append(dataframes)

    df = pd.concat(lst)
    return df


def main(args):
    for file_name in os.listdir(args.in_dir):
        root, ext = os.path.splitext(file_name)
        if ext.lower() != '.pdf':
            continue
        pdf_path = os.path.join(args.in_dir, file_name)
        print(f'Processing: {pdf_path}')
        df = extract_df(pdf_path, args.password)
        excel_name = root + '.xlsx'
        excel_path = os.path.join(args.out_dir, excel_name)
        df.to_excel(excel_path)
        print(f'Processed : {excel_path}')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--in-dir', type=str, required=True, help='directory to read statement PDFs from.')
    parser.add_argument('--out-dir', type=str, required=True, help='directory to store statement XLSX to.')
    parser.add_argument('--password', type=str, default=None, help='password for the statement PDF.')
    args = parser.parse_args()

    main(args)
	camelot-py==0.11.0
	cffi==1.16.0
	chardet==5.2.0
	charset-normalizer==3.3.2
	click==8.1.7
	cryptography==42.0.5
	et-xmlfile==1.1.0
	ghostscript==0.7
	numpy==1.26.4
	opencv-python==4.9.0.80
	openpyxl==3.1.2
	pandas==2.2.1
	pdfminer.six==20231228
	pycparser==2.21
	pypdf==4.1.0
	python-dateutil==2.9.0.post0
	pytz==2024.1
	six==1.16.0
	tabulate==0.9.0
	tzdata==2024.1
	# This script is designed to convert bank statements from pdf to excel.
	#
	# It has been tweaked on HDFC Bank Credit Card statements,
	# but in theory you can use it on any PDF document.
	#
	# The script depends on camelot-py,
	# which can be installed using pip
	#
	# pip install "camelot-py[cv]"

	import os
	import argparse
	import camelot
	import pandas as pd
	from collections import defaultdict

	def extract_df(path, password=None):
	# The default values from pdfminer are M = 2.0, W = 0.1 and L = 0.5
	laparams = {'char_margin': 2.0, 'word_margin': 0.2, 'line_margin': 1.0}

	# Extract all tables using the lattice algorithm
	lattice_tables = camelot.read_pdf(path, password=password,
	pages='all', flavor='lattice', line_scale=50, layout_kwargs=laparams)

	# Extract bounding boxes
	regions = defaultdict(list)
	for table in lattice_tables:
	bbox = [table._bbox[i] for i in [0, 3, 2, 1]]
	regions[table.page].append(bbox)

	df = pd.DataFrame()

	# Extract tables using the stream algorithm
	lst = []
	for page, boxes in regions.items():
	areas = [','.join([str(int(x)) for x in box]) for box in boxes]
	stream_tables = camelot.read_pdf(path, password=password, pages=str(page),
	flavor='stream', table_areas=areas, row_tol=5, layout_kwargs=laparams)
	dataframes = [table.df for table in stream_tables]
	dataframes = pd.concat(dataframes)
	lst.append(dataframes)

	df = pd.concat(lst)
	return df


	def main(args):
	for file_name in os.listdir(args.in_dir):
	root, ext = os.path.splitext(file_name)
	if ext.lower() != '.pdf':
	continue
	pdf_path = os.path.join(args.in_dir, file_name)
	print(f'Processing: {pdf_path}')
	df = extract_df(pdf_path, args.password)
	excel_name = root + '.xlsx'
	excel_path = os.path.join(args.out_dir, excel_name)
	df.to_excel(excel_path)
	print(f'Processed : {excel_path}')


	if __name__ == '__main__':
	parser = argparse.ArgumentParser()
	parser.add_argument('--in-dir', type=str, required=True, help='directory to read statement PDFs from.')
	parser.add_argument('--out-dir', type=str, required=True, help='directory to store statement XLSX to.')
	parser.add_argument('--password', type=str, default=None, help='password for the statement PDF.')
	args = parser.parse_args()

	main(args)