Skip to content

Instantly share code, notes, and snippets.

@WillemJan
Created August 23, 2021 07:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save WillemJan/89745ba46ddce20fa13075675704c364 to your computer and use it in GitHub Desktop.
Save WillemJan/89745ba46ddce20fa13075675704c364 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import json
from lxml import etree
from pprint import pprint
DEBUG = False  # Set to True to print the attributes of every parsed element.

# Output switches: each one enables printing of one kind of field.
OUTPUT_OCR = False  # OCR text; there are 3 versions, see OCR_OUTPUT_VERSION.
OCR_OUTPUT_VERSION = 'ocr'  # One of: post_correction, modernisation, ocr.
OUTPUT_ANCESTORS = False  # Ancestors info (JSON-encoded in the export).
OUTPUT_NER = False  # Raw NER info (JSON-encoded in the export).
OUTPUT_OCR_DATE = False  # There are multiple date / time entries.
OUTPUT_PL = False  # Persons and locations.
OUTPUT_TITLE = False  # Title of article.
OUTPUT_PUBLISHER = False  # Publisher info.

filename = '/home/aloha/Downloads/tt/Export_SBIR_ruwe_data_XML_v2.xml'


def _child_texts(element):
    """Return the text of each direct child of *element* that has any text.

    Children without text are skipped so that callers can concatenate the
    values or feed them to ``json.loads`` without hitting ``None``.
    """
    return [child.text for child in element if child.text is not None]


def process_element(element, folder=''):
    """Print the enabled fields of one named element.

    The element's ``name`` attribute is matched by substring against the
    known field names, and the text of its direct children is printed for
    every field whose ``OUTPUT_*`` switch is on.  An element carrying a
    ``filesize`` attribute is always listed together with the current folder.

    Args:
        element: Parsed element that has a ``name`` attribute.
        folder: Folder value taken from the most recent folder element.

    Returns:
        The folder to pass along with the next element: the last child text
        of *element* if it is a folder element, otherwise *folder* unchanged.

    Raises:
        json.JSONDecodeError: If an enabled JSON field holds invalid JSON.
    """
    name = element.attrib.get('name')
    texts = _child_texts(element)

    if 'folder' in name and texts:
        # When there are several children the last one wins.
        folder = texts[-1]

    if 'filesize' in element.attrib:
        print('name: %-30s/%-30s\tsize: %s' % (
            folder,
            name.strip(),
            element.attrib.get('filesize')))

    # Only 'ocr_date' elements are dates; titles are handled separately
    # below and must not be printed with the 'date: ' label.
    if OUTPUT_OCR_DATE and 'ocr_date' in name:
        for text in texts:
            print('date: ' + text)

    if OUTPUT_TITLE and 'title' in name:
        for text in texts:
            print('title: ' + text)

    if OUTPUT_PUBLISHER and 'publisher' in name:
        for text in texts:
            print('publisher: ' + text)

    # JSON payloads are decoded only when they are actually going to be shown.
    if OUTPUT_ANCESTORS and 'ancestors' in name:
        for text in texts:
            pprint(json.loads(text))

    if OUTPUT_PL:
        found = {}
        if 'persons' in name and texts:
            found['persons'] = list(texts)
        if 'locations' in name and texts:
            found['locations'] = list(texts)
        if found:
            pprint(found)

    if OUTPUT_NER and 'ner' in name:
        for text in texts:
            pprint(json.loads(text))

    if OUTPUT_OCR and name == OCR_OUTPUT_VERSION:
        for text in texts:
            print(text)

    return folder


def main(path=filename):
    """Stream the XML export at *path* and print the enabled fields.

    Args:
        path: Location of the XML export; defaults to ``filename``.
    """
    folder = ''  # Defined up front so a file listed before any folder works.
    for _event, element in etree.iterparse(path, huge_tree=True):
        if DEBUG:
            print('DEBUG:', element.attrib)
        if 'name' not in element.attrib:
            # Deliberately not cleared here.
            # NOTE(review): presumably these are the value children whose
            # text is read once their named parent is reached - confirm
            # against the export format.
            continue
        folder = process_element(element, folder)
        # Drop the element's content to keep memory bounded on huge files.
        element.clear()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment