# mrmiguez/TEItoCSV-test.py

## TEItoCSV-test.py
#!/usr/bin/env python3

from lxml import etree
import csv

# Clark-notation prefix for the TEI namespace, so element paths stay readable.
TEI = '{http://www.tei-c.org/ns/1.0}'


def _cell_text(cell):
    """Return all text found inside a <cell> element as one string.

    Text may sit directly in the cell or in elements nested inside it.
    ``itertext()`` yields the cell's own text, the text of every descendant
    and the tail text between them, skipping missing (``None``) pieces, so
    an empty child element cannot break the concatenation and mixed content
    is kept whole.
    """
    return ''.join(cell.itertext())


# parse XML file
tree = etree.parse('1905-07-13.xml')
root = tree.getroot()

# Collect every <table> located under a <div> that has a <head> child reading
# exactly "STOCKS AND SHARES".
tables = []
for elem in root.findall('.//{0}div/{0}head'.format(TEI)):
    if elem.text != "STOCKS AND SHARES":
        continue

    # getparent() returns the enclosing <div>; iter() walks all of its
    # descendants and keeps the <table> elements of any namespace.
    for table in elem.getparent().iter('{*}table'):
        # A table can be reached from more than one matching <head>
        # (e.g. nested <div>s); export it only once.
        if table not in tables:
            tables.append(table)

# Create the output only when at least one table was found. The file is opened
# once for all tables: re-opening it in 'w' mode for each table would truncate
# it and leave only the last table's rows.
if tables:
    # newline='' lets the csv module manage line endings itself; an explicit
    # encoding keeps non-ASCII characters from failing on a non-UTF-8 locale.
    with open('1905-07-13.csv', 'w', newline='', encoding='utf-8') as csv_file:
        csv_out = csv.writer(csv_file)

        for table in tables:
            # one CSV row per <row> that is a direct child of the <table>
            for row in table.iterfind('./{0}row'.format(TEI)):
                cells = row.iterfind('./{0}cell'.format(TEI))
                csv_out.writerow([_cell_text(cell) for cell in cells])
	#!/usr/bin/env python3

from lxml import etree
import csv

# Clark-notation prefix for the TEI namespace, so element paths stay readable.
TEI = '{http://www.tei-c.org/ns/1.0}'


def _cell_text(cell):
    """Return all text found inside a <cell> element as one string.

    Text may sit directly in the cell or in elements nested inside it.
    ``itertext()`` yields the cell's own text, the text of every descendant
    and the tail text between them, skipping missing (``None``) pieces, so
    an empty child element cannot break the concatenation and mixed content
    is kept whole.
    """
    return ''.join(cell.itertext())


# parse XML file
tree = etree.parse('1905-07-13.xml')
root = tree.getroot()

# Collect every <table> located under a <div> that has a <head> child reading
# exactly "STOCKS AND SHARES".
tables = []
for elem in root.findall('.//{0}div/{0}head'.format(TEI)):
    if elem.text != "STOCKS AND SHARES":
        continue

    # getparent() returns the enclosing <div>; iter() walks all of its
    # descendants and keeps the <table> elements of any namespace.
    for table in elem.getparent().iter('{*}table'):
        # A table can be reached from more than one matching <head>
        # (e.g. nested <div>s); export it only once.
        if table not in tables:
            tables.append(table)

# Create the output only when at least one table was found. The file is opened
# once for all tables: re-opening it in 'w' mode for each table would truncate
# it and leave only the last table's rows.
if tables:
    # newline='' lets the csv module manage line endings itself; an explicit
    # encoding keeps non-ASCII characters from failing on a non-UTF-8 locale.
    with open('1905-07-13.csv', 'w', newline='', encoding='utf-8') as csv_file:
        csv_out = csv.writer(csv_file)

        for table in tables:
            # one CSV row per <row> that is a direct child of the <table>
            for row in table.iterfind('./{0}row'.format(TEI)):
                cells = row.iterfind('./{0}cell'.format(TEI))
                csv_out.writerow([_cell_text(cell) for cell in cells])