Skip to content

Instantly share code, notes, and snippets.

@mrmiguez
Last active November 16, 2016 15:48
Show Gist options
  • Save mrmiguez/30c0e52ff4d620b50d9bea62c516fff8 to your computer and use it in GitHub Desktop.
Save mrmiguez/30c0e52ff4d620b50d9bea62c516fff8 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
from lxml import etree
import csv
# saving the TEI namespace makes life easier
TEI = '{http://www.tei-c.org/ns/1.0}'

# Defaults: the input file, output file and section heading the script uses
# when it is run without any changes.
SOURCE_XML = '1905-07-13.xml'
OUTPUT_CSV = '1905-07-13.csv'
SECTION_HEAD = 'STOCKS AND SHARES'


def element_text(elem):
    """Return all text inside *elem* as one whitespace-normalised string.

    Text may sit directly in the element or at any depth below it, so the
    element's own text, the text of every descendant and the tail text that
    follows each descendant are all collected.  Elements without text
    contribute nothing.  Runs of whitespace (including the indentation of
    pretty-printed XML) collapse to a single space and the ends are stripped.
    """
    return ' '.join(''.join(elem.itertext()).split())


def section_rows(root, heading=SECTION_HEAD):
    """Yield one list of cell strings per table row of the wanted sections.

    A section is a TEI <div> below *root* that has a direct <head> child
    whose text equals *heading*.  Every TEI <table> anywhere under such a
    <div> is visited, and each of its direct <row> children is turned into
    the list of texts of that row's direct <cell> children.
    """
    for div in root.iterfind('.//{0}div'.format(TEI)):
        heads = div.iterfind('./{0}head'.format(TEI))
        if not any(element_text(head) == heading for head in heads):
            continue
        # search the whole subtree of the <div>, not only its direct children
        for table in div.iter('{0}table'.format(TEI)):
            for row in table.iterfind('./{0}row'.format(TEI)):
                yield [element_text(cell)
                       for cell in row.iterfind('./{0}cell'.format(TEI))]


def main(xml_path=SOURCE_XML, csv_path=OUTPUT_CSV, heading=SECTION_HEAD):
    """Parse *xml_path* and write the matching table rows to *csv_path*.

    The CSV file is opened exactly once, so rows from every matching table
    end up in the same file instead of each table truncating the previous
    one.  The file is written even when no row matches (it is then empty).
    """
    root = etree.parse(xml_path).getroot()
    # newline='' is what the csv module asks for; without it some platforms
    # insert a blank line after every row.
    with open(csv_path, 'w', newline='') as csv_file:
        csv.writer(csv_file).writerows(section_rows(root, heading))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment