Last active
November 16, 2016 15:48
-
-
Save mrmiguez/30c0e52ff4d620b50d9bea62c516fff8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import csv

from lxml import etree
# saving the TEI namespace makes life easier | |
TEI = '{http://www.tei-c.org/ns/1.0}' | |
# parse XML file | |
tree = etree.parse('1905-07-13.xml') | |
root = tree.getroot() | |
# iterate over all <head> elements with a <div> parent | |
for elem in root.findall('.//{http://www.tei-c.org/ns/1.0}div/{http://www.tei-c.org/ns/1.0}head'): | |
# select a specific <head> with string matching | |
if elem.text == "STOCKS AND SHARES": | |
# go up the tree to the parent <div> | |
for parent in elem.getparent(): | |
# print(parent.sourceline) | |
# search under <div> for <table> | |
for table in parent.getiterator(tag='{*}table'): | |
# open a CSV file for writing | |
with open('1905-07-13.csv', 'w') as csv_file: | |
csv_out = csv.writer(csv_file) | |
# iterate over each <row> in <table> | |
for row in table.iterfind('./{0}row'.format(TEI)): | |
row_contents = [] | |
# iterate each <cell> in <row>, creating a list of text values | |
for cell in row.iterfind('./{0}cell'.format(TEI)): | |
# sometimes text content is in <cell> sometimes deeper. | |
# test for this condition by seeing if <cell> has any children | |
if len(cell) == 0: | |
row_contents.append(cell.text) | |
else: | |
child_text = '' | |
for child_cell in cell.iterchildren(): | |
child_text = child_text + child_cell.text | |
row_contents.append(child_text) | |
# write list of cell text contents to a new row in the CSV file | |
csv_out.writerow(row_contents) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment