Instantly share code, notes, and snippets.

anonymous /convert.sh
Created Jun 4, 2014

Embed
What would you like to do?
Denkmalliste Erfurt PDF Scrape
http://www.erfurt.de/mam/ef/rathaus/buergerservice/doc/denkmalliste.pdf
pdftohtml -nodrm -xml -i -f 4 denkmalliste.pdf denkmalliste.xml
from lxml import etree
import pandas as pd
# Output column order of the resulting CSV file.
columns = ['strassenname', 'h_nr', 'h_nr_z', 'objektbezeichnung', 'eigenname_erlaeuterung', 'gem_krz', 'flur', 'flst_z', 'flst_n', 'stadtteil_krz', 'eKD_ENS', 'seite']

# <text> elements carrying this ``left`` value are ignored entirely.
# NOTE(review): presumably page decoration such as a page number -- confirm.
_IGNORED_LEFT = 1080

# ``left`` attribute value -> column that receives the whole text.
_SINGLE_VALUE_COLUMNS = {
    1013: 'eKD_ENS',
    972: 'stadtteil_krz',
    917: 'flst_z',
    911: 'flst_z',
    884: 'flur',
    818: 'gem_krz',
    585: 'eigenname_erlaeuterung',
    400: 'objektbezeichnung',
    223: 'strassenname',
}

# ``left`` attribute value -> (column for the first whitespace-separated
# token, column for the optional second token).
_SPLIT_COLUMNS = {
    960: ('flst_n', 'stadtteil_krz'),
    947: ('flst_n', 'stadtteil_krz'),
    362: ('h_nr', 'h_nr_z'),
    355: ('h_nr', 'h_nr_z'),
}


def _store_cell(entry, left, text):
    """Store ``text`` in ``entry`` under the column(s) selected by ``left``.

    Text at a ``left`` value that is in neither lookup table is ignored.
    """
    if left in _SINGLE_VALUE_COLUMNS:
        entry[_SINGLE_VALUE_COLUMNS[left]] = text
    elif left in _SPLIT_COLUMNS:
        tokens = text.split()
        if not tokens:
            # Whitespace-only cell: nothing to store (indexing tokens[0]
            # here used to raise IndexError).
            return
        first_key, second_key = _SPLIT_COLUMNS[left]
        entry[first_key] = tokens[0]
        if len(tokens) > 1:
            entry[second_key] = tokens[1]


def parse_entries(root):
    """Collect one dict per table row from a parsed ``pdftohtml -xml`` tree.

    ``root`` must provide ``findall('page')``; every page must provide
    ``get('number')`` and ``findall('text')``; every text element must
    provide ``get('left')`` and ``.text``.

    Within a page the text elements are read in document order. A ``left``
    value smaller than the previous one is taken as the start of a new row.
    Each row dict holds the page number under ``'seite'`` plus whichever
    columns were found.

    Returns the list of row dicts in document order.
    """
    entries = []
    for page in root.findall('page'):
        page_no = int(page.get('number'))
        entry = {'seite': page_no}
        old_left = 0
        for te in page.findall('text'):
            left = int(te.get('left'))
            text = te.text
            # ``.text`` is None when the element has no leading text.
            if text is None or left == _IGNORED_LEFT:
                continue
            if left < old_left:
                # Position jumped back to the left: the previous row is done.
                entries.append(entry)
                entry = {'seite': page_no}
            _store_cell(entry, left, text)
            old_left = left
        # Flush the last row of the page; without this it was silently
        # dropped because no following element triggers the append above.
        # Rows that hold nothing but the page number are not worth keeping.
        if len(entry) > 1:
            entries.append(entry)
    return entries


if __name__ == '__main__':
    tree = etree.parse('denkmalliste.xml')
    root = tree.getroot()
    entries = parse_entries(root)
    df = pd.DataFrame(data=entries, columns=columns)
    df.to_csv('denkmalliste.csv', encoding='utf8', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment