Skip to content

Instantly share code, notes, and snippets.

@djpillen
Last active August 29, 2015 14:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save djpillen/380a614136411c7bd2d5 to your computer and use it in GitHub Desktop.
Save djpillen/380a614136411c7bd2d5 to your computer and use it in GitHub Desktop.
Output the text of all <unitdate> tags that are not 'undated' and do not have a 'normal' attribute
import csv
import lxml
from lxml import etree
import os
from os.path import join
import re
# Report every <unitdate> in a directory of EAD files whose text is not
# 'undated'/'Undated' and which has no 'normal' attribute. One CSV row is
# appended per such date: filename, XPath to the element, text of the date.
import sys

path = '/path/to/EADs' # Enter the path to your EAD directory
output_csv = 'non-normalized_dates.csv' # Rows are appended; an existing file is kept

# Regular expression to match 'undated' or 'Undated'. Compiled once here
# rather than once per file, since it never changes.
undated = re.compile('^[Uu](ndated)$')

# The csv module wants a binary file and byte strings on Python 2, but a text
# file opened with newline='' on Python 3, so pick the mode for the interpreter.
PY2 = sys.version_info[0] == 2
if PY2:
    csvfile = open(output_csv, 'ab')
else:
    csvfile = open(output_csv, 'a', newline='', encoding='utf-8')

# Open the output once for the whole run instead of once per matching date.
with csvfile:
    writer = csv.writer(csvfile, dialect='excel')
    # sorted() makes the row order reproducible; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(path)): # Loop through the files in the directory
        full_path = join(path, filename)
        if not os.path.isfile(full_path):
            continue # Subdirectories cannot be parsed as EADs
        try:
            tree = etree.parse(full_path)
        except etree.XMLSyntaxError as err:
            # A stray non-XML file (for example this script's own CSV output)
            # should not abort the whole run; report it and move on.
            print('Skipping %s (not well-formed XML): %s' % (filename, err))
            continue
        # xpath that will find a <unitdate> tag anywhere in the EAD
        for date in tree.xpath('//unitdate'):
            text = date.text
            # Keep dates that have text, are not 'undated', and lack a 'normal' attribute
            if text and not undated.match(text) and 'normal' not in date.attrib:
                if PY2:
                    text = text.encode('utf-8') # Python 2's csv writer needs bytes
                # Write the filename, xpath to the date, and text of the date to the csv
                writer.writerow([filename, tree.getpath(date), text])
        print(filename) # Progress indicator: one line per processed file
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment