Skip to content

Instantly share code, notes, and snippets.

@artkpv
Last active September 16, 2019 09:40
Show Gist options
  • Save artkpv/28982b2ba7af373607e6fcb21b099a91 to your computer and use it in GitHub Desktop.
Save artkpv/28982b2ba7af373607e6fcb21b099a91 to your computer and use it in GitHub Desktop.
Convert PDF to CSV for Rocketbank (Рокетбанк)
#!/bin/python3
"""
Конвертирует транзакции Рокетбанка (https://rocketbank.ru) из PDF в CSV.
Выводит в UTF8.
Зависимости:
- Poppler. В PATH: 'pdftotext'
https://poppler.freedesktop.org/
https://jlk.fjfi.cvut.cz/arch/manpages/man/pdftotext.1
Автор: w1ld at inbox dot ru
Releases:
- Before 2019-09-16: used SumatraPDF.
- 2019-09-16: switched to Poppler, added per-day balance checks, output goes to STDOUT.
"""
import csv
import re
import subprocess
import sys
from decimal import Decimal
from subprocess import getoutput
# Name of the Poppler text-extraction executable; it is looked up via PATH.
PDFTOTEXTCLI = 'pdftotext'

# sys.argv always holds at least the script name, so a missing PDF argument
# shows up as a length below 2 (a length of 0 never occurs).  The check only
# applies when the file is executed as a script, so importing the module
# never terminates the interpreter.
if __name__ == '__main__' and len(sys.argv) < 2:
    # Usage goes to STDERR because STDOUT is reserved for the CSV output.
    print('Usage: import-rocket-pdftocsv.py [pdf]', file=sys.stderr)
    sys.exit(1)
def parse_amount(amount_raw):
    """Convert an amount as printed in the statement into a Decimal.

    Spaces (used to group digits) are removed and the decimal comma is
    replaced by a dot, e.g. '-1 234,56' becomes Decimal('-1234.56').

    Raises:
        ValueError: if amount_raw is empty.
        decimal.InvalidOperation: if the cleaned text is not a valid number.
    """
    # Explicit check instead of `assert`, which is skipped under `python -O`.
    if not amount_raw:
        raise ValueError('Empty amount.')
    return Decimal(amount_raw.replace(' ', '').replace(',', '.'))
# Construct re patterns.
# Currency code printed after an amount: one to four capital Latin letters.
currency_p = r'[A-Z]{1,4}'
# Amount followed by its currency: optional minus sign, digits optionally
# grouped by spaces, optional decimal part after a comma, one space, currency.
# Raw string: '\d' inside a plain literal is an invalid escape sequence.
# The decimal part stays a capturing group so that group numbering in the
# patterns built from amount_p does not change.
amount_p = r'-?\d[\d ]*(,\d+)? ' + currency_p
def parse_amount_currency(amount_currency):
    """Split text such as '1 234,56 RUR' into (Decimal amount, currency code).

    Everything after the last space is taken as the currency; the part before
    that space is handed to parse_amount.
    """
    amount_text, _, currency_code = amount_currency.rpartition(' ')
    return parse_amount(amount_text), currency_code
# One transaction in the `pdftotext -layout` text: an empty line, then an
# indented line holding the start of the description (d1), two or more spaces
# and the amount with its currency (a), then any number of continuation lines
# that start at the same indentation (d2), then an empty line.
# '{{2,}}' is an escaped '{2,}' because the pattern goes through str.format.
transaction_p_base = r'(?m)^$^\n( +)(?P<d1>\S[^\n]+\S) {{2,}}(?P<a>{})(?P<d2>(\n\1\S[^\n]*)*)\n^$'
transaction_re = re.compile(transaction_p_base.format(amount_p))
# Header row of the transactions table.
table_header_re = re.compile(r'\s*Дата\s+Описание\s+Расход\s+Приход\s+Входящий остаток\s*')
# Totals row ('Итог:') followed by two amounts.
table_bottom_re = re.compile(r'^\s*Итог:\s+(?P<income>' + amount_p + r')\s+(?P<outcom>' + amount_p + r')\s*$')
# Day heading: a DD.MM.YYYY date followed by an amount that ends the line.
date_re = re.compile(r'(?m)^(?P<date>\d{2}\.\d{2}\.\d{4})\s+(?P<d_amount>' + amount_p + r')$')
# Opening ('Входящий остаток') and closing ('Исходящий остаток') balance lines;
# group 1 is the amount together with its currency.
total_in_re = re.compile(r'Входящий остаток:\s+(' + amount_p + r')')
total_out_re = re.compile(r'Исходящий остаток:\s+(' + amount_p + r')')
def _transaction_iterator(pdftext):
    """Yield the transactions found in the text of a statement.

    The text is scanned from the table header onwards, one day heading at a
    time.  A running balance, started from the opening balance, is compared
    with the amount printed in every day heading and finally with the closing
    balance.

    Args:
        pdftext: statement text as produced by `pdftotext -layout`.

    Yields:
        [date, description, amount, currency, balance] lists: date is the
        'DD.MM.YYYY' text of the day heading, amount is a signed Decimal and
        balance is the running balance after the transaction.

    Raises:
        ValueError: if the table header, the opening balance or the closing
            balance is not found, or a transaction is in another currency
            than the opening balance.
        Exception: if the running balance differs from the amount printed in
            a day heading or from the closing balance.
    """
    # Explicit checks instead of `assert`, which is skipped under `python -O`.
    th_match = table_header_re.search(pdftext)
    if th_match is None:
        raise ValueError('Transactions table header not found.')

    total_in_m = total_in_re.search(pdftext)
    if total_in_m is None:
        raise ValueError('Opening balance not found.')
    balance_amount, balance_currency = parse_amount_currency(total_in_m.group(1))

    total_out_m = total_out_re.search(pdftext)
    if total_out_m is None:
        raise ValueError('Closing balance not found.')
    balance_out_amount, _ = parse_amount_currency(total_out_m.group(1))

    # Iterate from table header till the end of the text, day by day.
    date_m = date_re.search(pdftext, th_match.end(0))
    while date_m:
        date_balance, _ = parse_amount_currency(date_m.group('d_amount'))
        if balance_amount != date_balance:
            raise Exception('Invalid balance: {} != {}.'.format(balance_amount, date_balance))

        from_ = date_m.start(0)
        next_date_m = date_re.search(pdftext, date_m.end(0))
        # endpos is exclusive, so the last day runs up to len(pdftext);
        # len(pdftext) - 1 would hide the final character from the pattern.
        to_ = next_date_m.start(0) if next_date_m else len(pdftext)
        date_val = date_m.group('date')

        # Iterate over all transactions in this day.
        for transaction_m in transaction_re.finditer(pdftext, pos=from_, endpos=to_):
            mdict = transaction_m.groupdict()
            description = mdict['d1']
            amount, currency = parse_amount_currency(mdict['a'])
            if currency != balance_currency:
                raise ValueError(
                    'Unexpected currency: {} != {}.'.format(currency, balance_currency))
            # For checks. Expense comes with '-'. Income is positive.
            balance_amount += amount

            d_remain = mdict['d2']
            if d_remain:
                # Join the continuation lines to the first line: collapse the
                # indentation to a single space and drop the line breaks.
                d_remain = re.sub(r' {2,}', r' ', d_remain.strip())
                description += ' ' + d_remain
                description = description.replace('\n', '')
            yield [date_val, description, amount, currency, balance_amount]

        date_m = next_date_m

    # Check balance for last day.
    if balance_amount != balance_out_amount:
        raise Exception('Invalid balance: {} != {}'.format(balance_amount, balance_out_amount))
# Script body: convert the PDF named by the first command-line argument to
# text with pdftotext and write its transactions to STDOUT as CSV.
pdffilename = sys.argv[1]
# The command is passed as an argument list and runs without a shell, so
# spaces, quotes or shell metacharacters in the file name cannot break or
# alter it.  Only STDOUT is captured; messages that pdftotext writes to STDERR
# no longer end up in the text being parsed.  A non-zero exit status raises
# subprocess.CalledProcessError.
pdftext = subprocess.check_output(
    [PDFTOTEXTCLI, '-layout', pdffilename, '-'],
    universal_newlines=True,
)
if not pdftext:
    sys.exit('pdftotext produced no text for ' + pdffilename)

w = csv.writer(sys.stdout)
w.writerow(['date','description','amount', 'currency', 'balance'])
for trn in _transaction_iterator(pdftext):
    w.writerow(trn)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment