WillemJan/parse_xml.py

## parse_xml.py
#!/usr/bin/env python3

import json

from lxml import etree
from pprint import pprint

# Set to True to print the attribute mapping of every parsed element.
DEBUG = False

# Print the child text of elements whose name attribute equals
# OCR_OUTPUT_VERSION.
OUTPUT_OCR = False
# The OCR text comes in 3 versions: 'post_correction', 'modernisation', 'ocr'.
OCR_OUTPUT_VERSION = 'ocr'

# Pretty-print the JSON stored in 'ancestors' elements.
OUTPUT_ANCESTORS = False
# Pretty-print the raw JSON stored in 'ner' elements.
OUTPUT_NER = False
# Print 'date: ' lines for 'ocr_date' elements (there are multiple
# date / time entries).
OUTPUT_OCR_DATE = False
# Pretty-print the persons and locations lists.
OUTPUT_PL = False
# Print the title of the article.
OUTPUT_TITLE = False

# Print publisher info.
OUTPUT_PUBLISHER = False

# Path of the XML export that is streamed with etree.iterparse.
filename = '/home/aloha/Downloads/tt/Export_SBIR_ruwe_data_XML_v2.xml'

def _child_texts(element):
    """Return the text of every direct child of *element* that has text.

    Children whose ``text`` is ``None`` (empty tags) are skipped, so the
    returned strings can be concatenated or JSON-decoded without checks.
    """
    return [child.text for child in element.iterchildren()
            if child.text is not None]


# Text of the most recently seen 'folder' element. Starts empty so that a
# file entry appearing before any folder no longer raises NameError.
folder = ''

# Stream the export element by element; each element is inspected once it is
# complete and cleared afterwards to keep memory usage bounded.
for _event, element in etree.iterparse(filename, huge_tree=True):
    if DEBUG:
        print('DEBUG:', element.attrib)

    name = element.attrib.get('name')
    if name is None:
        # Unnamed elements are skipped *without* being cleared, so their
        # text is still available when an enclosing named element reads
        # its children.
        continue

    texts = _child_texts(element)

    if 'folder' in name and texts:
        # The last child wins, matching the original assignment loop.
        folder = texts[-1]

    if 'filesize' in element.attrib:
        print('name: %-30s/%-30s\tsize: %s' % (
            folder,
            name.strip(),
            element.attrib.get('filesize')))

    # Only 'ocr_date' elements are dates. Titles used to be matched here as
    # well and were printed with a 'date: ' prefix; they are handled by the
    # dedicated title branch below.
    if OUTPUT_OCR_DATE and 'ocr_date' in name:
        for text in texts:
            print('date: ' + text)

    if OUTPUT_TITLE and 'title' in name:
        for text in texts:
            print('title: ' + text)

    if OUTPUT_PUBLISHER and 'publisher' in name:
        for text in texts:
            print('publisher: ' + text)

    # JSON payloads are decoded only when their output is requested, and
    # only when the element actually has a child carrying text.
    if OUTPUT_ANCESTORS and 'ancestors' in name and texts:
        # As before, only the last child's payload is printed.
        pprint(json.loads(texts[-1]))

    if OUTPUT_PL:
        found = {}
        if texts and 'persons' in name:
            found['persons'] = texts
        if texts and 'locations' in name:
            found['locations'] = texts
        if found:
            pprint(found)

    if OUTPUT_NER and 'ner' in name:
        for text in texts:
            pprint(json.loads(text))

    if OUTPUT_OCR and name == OCR_OUTPUT_VERSION:
        for text in texts:
            print(text)

    # Drop the element's children, text and attributes now that it has been
    # handled.
    element.clear()
	#!/usr/bin/env python3

	import json

	from lxml import etree
	from pprint import pprint

	# Set to True to print the attribute mapping of every parsed element.
	DEBUG = False

	# Print the child text of elements whose name attribute equals
	# OCR_OUTPUT_VERSION.
	OUTPUT_OCR = False
	# The OCR text comes in 3 versions: 'post_correction', 'modernisation', 'ocr'.
	OCR_OUTPUT_VERSION = 'ocr'

	# Pretty-print the JSON stored in 'ancestors' elements.
	OUTPUT_ANCESTORS = False
	# Pretty-print the raw JSON stored in 'ner' elements.
	OUTPUT_NER = False
	# Print 'date: ' lines for 'ocr_date' elements (there are multiple
	# date / time entries).
	OUTPUT_OCR_DATE = False
	# Pretty-print the persons and locations lists.
	OUTPUT_PL = False
	# Print the title of the article.
	OUTPUT_TITLE = False

	# Print publisher info.
	OUTPUT_PUBLISHER = False

	# Path of the XML export that is streamed with etree.iterparse.
	filename = '/home/aloha/Downloads/tt/Export_SBIR_ruwe_data_XML_v2.xml'

	def _child_texts(element):
	    """Return the text of every direct child of *element* that has text.

	    Children whose ``text`` is ``None`` (empty tags) are skipped, so the
	    returned strings can be concatenated or JSON-decoded without checks.
	    """
	    return [child.text for child in element.iterchildren()
	            if child.text is not None]


	# Text of the most recently seen 'folder' element. Starts empty so that a
	# file entry appearing before any folder no longer raises NameError.
	folder = ''

	# Stream the export element by element; each element is inspected once it
	# is complete and cleared afterwards to keep memory usage bounded.
	for _event, element in etree.iterparse(filename, huge_tree=True):
	    if DEBUG:
	        print('DEBUG:', element.attrib)

	    name = element.attrib.get('name')
	    if name is None:
	        # Unnamed elements are skipped *without* being cleared, so their
	        # text is still available when an enclosing named element reads
	        # its children.
	        continue

	    texts = _child_texts(element)

	    if 'folder' in name and texts:
	        # The last child wins, matching the original assignment loop.
	        folder = texts[-1]

	    if 'filesize' in element.attrib:
	        print('name: %-30s/%-30s\tsize: %s' % (
	            folder,
	            name.strip(),
	            element.attrib.get('filesize')))

	    # Only 'ocr_date' elements are dates. Titles used to be matched here
	    # as well and were printed with a 'date: ' prefix; they are handled by
	    # the dedicated title branch below.
	    if OUTPUT_OCR_DATE and 'ocr_date' in name:
	        for text in texts:
	            print('date: ' + text)

	    if OUTPUT_TITLE and 'title' in name:
	        for text in texts:
	            print('title: ' + text)

	    if OUTPUT_PUBLISHER and 'publisher' in name:
	        for text in texts:
	            print('publisher: ' + text)

	    # JSON payloads are decoded only when their output is requested, and
	    # only when the element actually has a child carrying text.
	    if OUTPUT_ANCESTORS and 'ancestors' in name and texts:
	        # As before, only the last child's payload is printed.
	        pprint(json.loads(texts[-1]))

	    if OUTPUT_PL:
	        found = {}
	        if texts and 'persons' in name:
	            found['persons'] = texts
	        if texts and 'locations' in name:
	            found['locations'] = texts
	        if found:
	            pprint(found)

	    if OUTPUT_NER and 'ner' in name:
	        for text in texts:
	            pprint(json.loads(text))

	    if OUTPUT_OCR and name == OCR_OUTPUT_VERSION:
	        for text in texts:
	            print(text)

	    # Drop the element's children, text and attributes now that it has
	    # been handled.
	    element.clear()