Skip to content

Instantly share code, notes, and snippets.

@sergray
Created October 29, 2013 12:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sergray/7213857 to your computer and use it in GitHub Desktop.
Save sergray/7213857 to your computer and use it in GitHub Desktop.
Python PDF utilities
# coding=utf8
"""
Генератор CSV отчета из PDF отчетов банка Авангард о поступлении средств на
транзитный валютный счет.
По умолчанию читает из стандартного ввода данные извлеченные из PDF при помощи
textpdf.py утилиты.
Генерируемый CSV отчет печатается в стандартный вывод и включает следующие колонки:
* date - дата получения средств
* doc - номер уведомления
* usd - полученная сумма в долларах
* xchg - курс обмена на дату получения
* rub - полученная сумма в рублях
"""
import csv
import re
from decimal import Decimal
# Replace with your own account number (format: \d{21}\s{1}\d{2}).
# The value is spliced into the pattern below verbatim, as regex source.
ACCOUNT = ''

# Matches the line that opens one notice record:
#   <prefix digits> TAB <dd/dd/dd date> SPACE <doc digits> TAB ACCOUNT TAB <amount>
# The amount must not start with "0", may contain digits and spaces, and ends
# with a dot followed by exactly two digits.
# Raw string literals keep the regex escapes (\d, \.) from being treated as
# invalid string escapes; \t is resolved to a tab by the regex engine itself.
rx = re.compile(
    r"(?P<prefix>\d+)\t(?P<date>\d{2}/\d{2}/\d{2}) (?P<doc>\d+)\t"
    + ACCOUNT
    + r"\t(?P<dollars>[^0][\d ]+\.\d{2})"
)
def main(src):
    """Print a CSV report built from extracted bank-notice text.

    ``src`` is an iterable of text lines (the script passes standard input).
    Every line matching the module-level ``rx`` pattern starts a record: the
    line right after it is skipped, and the third tab-separated column of each
    of the following three lines is read as the dollar amount, the ruble
    amount and the exchange rate, in that order.

    A header row plus one row per record (date, doc, usd, xchg, rub) is
    written to standard output; nothing is written when no record is found.

    Raises:
        AssertionError: if the dollar amount in the matched line differs from
            the one read below it, or if dollars times the exchange rate,
            quantized to two decimal places, differs from the ruble amount.
        StopIteration: if the input ends in the middle of a record.
    """
    # Imported locally: the module-level ``import sys`` only runs when the
    # file is executed as a script, which left ``sys`` undefined for importers.
    import sys

    delim = '\t'

    def decimal_value(text):
        # Spaces are dropped before parsing (the amount pattern allows them
        # between digits).
        return Decimal(text.replace(' ', ''))

    def extract_field(line, col):
        return line.split(delim)[col]

    out = csv.writer(sys.stdout)
    # A single shared iterator, so that next() below consumes the lines that
    # follow the one the loop is currently looking at (also works for lists).
    lines = iter(src)
    rows = []
    for line in lines:
        mo = rx.match(line)
        if not mo:
            continue
        next(lines)  # skip the "Total" ("Итого") line
        dollars = decimal_value(extract_field(next(lines), 2))
        rubles = decimal_value(extract_field(next(lines), 2))
        exch_rate = decimal_value(extract_field(next(lines), 2))

        # Sanity checks. Raised explicitly (rather than via ``assert``) so
        # they are not stripped when Python runs with -O.
        matched_dollars = decimal_value(mo.group('dollars'))
        if matched_dollars != dollars:
            raise AssertionError(
                'notice %s: dollar amounts differ (%s != %s)'
                % (mo.group('doc'), matched_dollars, dollars)
            )
        calc_rub = (exch_rate * dollars).quantize(Decimal('0.01'))
        if calc_rub != rubles:
            raise AssertionError(
                'notice %s: computed rubles %s != reported rubles %s'
                % (mo.group('doc'), calc_rub, rubles)
            )

        rows.append(
            [mo.group('date'), mo.group('doc'), dollars, exch_rate, rubles]
        )

    if rows:
        out.writerow(['date', 'doc', 'usd', 'xchg', 'rub'])
        out.writerows(rows)
if __name__ == '__main__':
    # Script entry point: build the report from the lines on standard input.
    # Note that ``sys`` is bound at module scope only when run as a script.
    import sys
    main(sys.stdin)
"""
Prints inline text boxes from PDF using pdfminer.
Assumes that PDF is autogenerated and inline boxes are on the same height and have the same font size/style.
Requires pdf2txt.py from pdfminer and lxml, so please do::
pip install pdfminer lxml
Usage example::
pdf2txt.py -t xml input.pdf | python text_pdf.py
"""
from __future__ import print_function
from lxml import etree
from itertools import groupby
def line2box(textline):
    """Turn a ``<textline>`` XML element into a sortable box tuple.

    Args:
        textline: element with a ``bbox`` attribute holding four
            comma-separated numbers, and child elements whose ``.text``
            carries the characters of the line.

    Returns:
        A tuple ``(-top, left, word)`` where ``left`` and ``top`` are the
        first and second bbox numbers as floats and ``word`` is the
        concatenated text of the children. The second number is negated so
        that an ascending sort lists larger values first.

    Raises:
        ValueError: if ``bbox`` does not hold exactly four numeric values.
    """
    # NOTE(review): names kept from the original code; pdfminer may emit bbox
    # as x0,y0,x1,y1, in which case "top" is really y0 - confirm.
    left, top, _right, _bottom = map(float, textline.get('bbox').split(','))
    # Iterate the element directly (getchildren() is deprecated) and treat a
    # child without text as empty instead of failing on None.
    word = u''.join(c.text or u'' for c in textline)
    return -top, left, word
def process(page):
    """Print the text boxes of one ``<page>`` element, grouped into rows.

    Every ``<textline>`` below ``page`` is converted with ``line2box`` and the
    resulting tuples are sorted. Boxes sharing the same first tuple item are
    printed on one output line: the page ``id`` attribute followed by each
    box's right-stripped text, separated by tabs.
    """
    # sorted() works whether map() returns a list or an iterator, unlike
    # calling .sort() on its result.
    boxes = sorted(map(line2box, page.xpath('.//textline')))
    page_id = page.get('id')
    for _, group in groupby(boxes, lambda b: b[0]):
        words = [b[2].rstrip() for b in group]
        if str is bytes:
            # Python 2 only: hand print() UTF-8 encoded byte strings. On
            # Python 3 encoding here would print b'...' representations.
            words = [w.encode('utf-8') for w in words]
        print(page_id, *words, sep='\t')
def main(src):
    """Parse the XML document in ``src`` and print every page it contains.

    ``src`` is handed to ``etree.parse`` as is; each ``<page>`` element found
    anywhere in the document is passed to ``process`` in document order.
    """
    document = etree.parse(src)
    for page_node in document.xpath('//page'):
        process(page_node)
if __name__ == '__main__':
    # Script entry point: read the XML document from standard input.
    import sys
    main(sys.stdin)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment