Created
August 23, 2021 07:05
-
-
Save WillemJan/89745ba46ddce20fa13075675704c364 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import json
import sys
from pprint import pprint

from lxml import etree
# --- Debugging ---------------------------------------------------------
# When True, the attributes of every parsed element are printed.
DEBUG = False

# --- Output switches ---------------------------------------------------
# Each switch enables printing of one kind of field found in the export.
OUTPUT_OCR = False  # OCR text; the export carries 3 versions of it.
OCR_OUTPUT_VERSION = 'ocr'  # Which version: post_correction, modernisation, ocr
OUTPUT_ANCESTORS = False  # Ancestors info (JSON).
OUTPUT_NER = False  # Raw NER info (JSON).
OUTPUT_OCR_DATE = False  # There are multiple date / time entries.
OUTPUT_PL = False  # Persons and locations.
OUTPUT_TITLE = False  # Title of article.
OUTPUT_PUBLISHER = False  # Publisher info.

# --- Input -------------------------------------------------------------
# The export to read. The historical hard-coded location stays the default;
# an optional first command-line argument overrides it so the script can be
# run against a file stored elsewhere.
DEFAULT_FILENAME = '/home/aloha/Downloads/tt/Export_SBIR_ruwe_data_XML_v2.xml'
filename = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_FILENAME
# Stream the export element by element and print the fields selected by the
# OUTPUT_* switches. Elements are matched on their ``name`` attribute.
#
# NOTE(review): most checks are substring matches (``'ner' in name``), so any
# name that merely contains the keyword is matched too -- confirm against the
# export's schema that this is intended.

# Most recent folder value seen. Pre-set so that an element carrying a
# ``filesize`` attribute that is parsed before any folder element prints an
# empty folder instead of raising NameError.
folder = ''

for _event, element in etree.iterparse(filename, huge_tree=True):
    if DEBUG:
        print('DEBUG:', element.attrib)

    name = element.get('name')
    if name is None:
        # Skipped without being cleared.
        # NOTE(review): presumably so the enclosing named element can still
        # read the .text of its unnamed children -- confirm.
        continue

    if 'folder' in name:
        # The last child wins; an element without children keeps the
        # previous folder value.
        for child in element.iterchildren():
            folder = child.text

    # File entries are always reported, independent of the OUTPUT_* switches.
    if 'filesize' in element.attrib:
        print('name: %-30s/%-30s\tsize: %s' % (
            folder,
            name.strip(),
            element.get('filesize')))

    # Only date fields are labelled 'date:'. Title fields used to be matched
    # here as well and were printed under the wrong label.
    if OUTPUT_OCR_DATE and 'ocr_date' in name:
        for child in element.iterchildren():
            # print() with two arguments also copes with a child whose text
            # is None, where string concatenation raised TypeError.
            print('date:', child.text)

    if OUTPUT_TITLE and 'title' in name:
        for child in element.iterchildren():
            print('title:', child.text)

    if OUTPUT_PUBLISHER and 'publisher' in name:
        for child in element.iterchildren():
            print('publisher:', child.text)

    # JSON payloads are decoded only when they are going to be printed, and
    # children without text are skipped rather than passed to json.loads().
    if OUTPUT_ANCESTORS and 'ancestors' in name:
        for child in element.iterchildren():
            if child.text:
                pprint(json.loads(child.text))

    if OUTPUT_PL:
        # Collect whichever of the two lists this element provides and print
        # them together; empty lists are left out, nothing is printed when
        # both are empty.
        found = {}
        for key in ('persons', 'locations'):
            if key in name:
                values = [child.text for child in element.iterchildren()]
                if values:
                    found[key] = values
        if found:
            pprint(found)

    if OUTPUT_NER and 'ner' in name:
        for child in element.iterchildren():
            if child.text:
                pprint(json.loads(child.text))

    # Exact match: only the configured OCR version is printed.
    if OUTPUT_OCR and name == OCR_OUTPUT_VERSION:
        for child in element.iterchildren():
            print(child.text)

    # Drop the processed element's text, attributes and children to keep
    # memory use down while streaming a very large file.
    element.clear()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment