Skip to content

Instantly share code, notes, and snippets.

@mrmiguez
Created March 17, 2017 17:28
Show Gist options
  • Save mrmiguez/068a1d49daf5bbdbbddedef0887f5c97 to your computer and use it in GitHub Desktop.
Save mrmiguez/068a1d49daf5bbdbbddedef0887f5c97 to your computer and use it in GitHub Desktop.
import requests
'''
# requests - http://docs.python-requests.org/en/master/
# the simplest HTTP GET/POST utility I've found
'''
from lxml import etree
'''
# http://lxml.de
# an XML library written in C
'''
# All the namespace definitions we'll need, stored as a python dictionary of
# prefix -> namespace URI. The script hands this dict to lxml as the root
# element's nsmap; lxml treats the None key as the default (unprefixed)
# namespace, which is why the MADS URI appears both under None and "mads".
NS = {
    None: "http://www.loc.gov/mads/v2",
    "mads": "http://www.loc.gov/mads/v2",
    "xlink": "http://www.w3.org/1999/xlink",
    "xsi": "http://www.w3.org/2001/XMLSchema-instance",
}
# Build a single <madsCollection> document out of the MADS/XML record that
# lives behind every URI listed (one per line) in exURIs.txt, then write the
# result to exMADSCollection.xml.

# use the lxml.etree.Element method to create our root element
madsRoot = etree.Element('{http://www.loc.gov/mads/v2}madsCollection',
                         version="2.0",  # attributes can be added as keyword=value arguments
                         nsmap=NS)       # apply all namespaces from our dict to this element

# open the list of URIs in 'r'=read mode. the with statement closes the file
# for us when the block is left, even if a request or parse blows up halfway
with open('exURIs.txt', 'r') as in_file:
    for uri in in_file:  # iterate over each line in the file
        clean_uri = uri.strip()  # clean up newline characters and stray whitespace
        if not clean_uri:
            # a blank line would turn into a request for just '.madsxml.xml',
            # which requests rejects with an exception -- skip it instead
            continue

        # here we make the actual http request.
        # since many types of files live at our URI we just add the
        # extension of the one we want to the end, i.e. +'.html', +'.json',
        # +'.rdf', etc.
        # the timeout (in seconds) keeps one stalled server from hanging the
        # whole run; without it requests will wait indefinitely
        MADSrequest = requests.get(clean_uri + '.madsxml.xml', timeout=30)

        if MADSrequest.status_code == 200:  # check if we actually got something by HTTP status code
            # parse the raw bytes (.content), not the decoded text (.text):
            # lxml raises ValueError when handed a str that still carries an
            # XML encoding declaration, and bytes let the parser honour that
            # declaration itself
            madsChild = etree.XML(MADSrequest.content)
            # lxml stores parent/child relationships as lists. to add our new
            # XML as a child to root, we use the list.append() method
            madsRoot.append(madsChild)

# only now open the output, so a failed run does not leave behind an emptied
# exMADSCollection.xml. etree.tostring with an encoding returns bytes that are
# already UTF-8, so we open in 'wb'=write-binary mode and write them
# untouched; going through a text-mode file would re-encode them with the
# platform's default encoding, which may not match the UTF-8 declaration
with open('exMADSCollection.xml', 'wb') as out_file:
    out_file.write(etree.tostring(madsRoot,
                                  pretty_print=True,
                                  xml_declaration=True,
                                  encoding="UTF-8"))

# it's good practice to close everything you open. neither file object needs
# an explicit close() here: both were opened in a with statement, and python
# closes them automatically when the with block is exited
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment