Created
March 17, 2017 17:28
-
-
Save mrmiguez/068a1d49daf5bbdbbddedef0887f5c97 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build a single MADS/XML collection document from a list of authority URIs.
#
# Reads one URI per line from 'exURIs.txt', downloads the MADS/XML
# serialization of each (URI + '.madsxml.xml'), appends every successfully
# fetched record to a <madsCollection> root element, and writes the result
# to 'exMADSCollection.xml'.

import requests
# requests - http://docs.python-requests.org/en/master/
# the simplest HTTP GET/POST utility I've found

from lxml import etree
# http://lxml.de
# an XML library written in C

# All the namespace definitions we'll need, stored as a python dictionary.
# The None key is the default (un-prefixed) namespace.
NS = {
    None: "http://www.loc.gov/mads/v2",
    "mads": "http://www.loc.gov/mads/v2",
    "xlink": "http://www.w3.org/1999/xlink",
    "xsi": "http://www.w3.org/2001/XMLSchema-instance",
}

# Seconds to wait on a single HTTP request before giving up; without a
# timeout one unresponsive server would hang the whole run indefinitely.
REQUEST_TIMEOUT = 30

# Both files are opened by the same `with` statement, so python closes them
# automatically when the block is exited -- even if a request or a parse
# raises part-way through. The input is opened first ('r' = read), so a
# missing URI list fails before the output file is created.
# The output is opened in 'wb' (write, binary) mode because
# lxml.etree.tostring returns bytes when given an encoding; writing those
# bytes directly guarantees the file really is UTF-8, matching its XML
# declaration, regardless of the platform's default text encoding.
with open('exURIs.txt', 'r') as in_file, \
        open('exMADSCollection.xml', 'wb') as out_file:
    # use the lxml.etree.Element method to create our root
    madsRoot = etree.Element(
        '{http://www.loc.gov/mads/v2}madsCollection',
        version="2.0",  # attributes can be added as keyword=value arguments
        nsmap=NS,       # apply all namespaces from our dict to this element
    )

    # iterate over each line in the input file
    for line in in_file:
        uri = line.strip()  # clean up newline characters and stray whitespace
        if not uri:
            # a blank line would otherwise turn into a request for
            # '.madsxml.xml', which is not a valid URL
            continue

        # Here we make the actual http request. Since many types of files
        # live at our URI we just add the extension of the one we want to
        # the end, i.e. +'.html', +'.json', +'.rdf', etc.
        MADSrequest = requests.get(uri + '.madsxml.xml',
                                   timeout=REQUEST_TIMEOUT)

        # check if we actually got something by HTTP status code;
        # anything other than 200 is skipped
        if MADSrequest.status_code == 200:
            # Parse the raw response bytes rather than the decoded text:
            # lxml refuses str input that carries an XML encoding
            # declaration, and the parser can honour the declared encoding
            # itself when handed bytes.
            madsChild = etree.XML(MADSrequest.content)
            # lxml stores parent/child relationships as lists. to add our
            # new XML as a child to root, we use the list.append() method
            madsRoot.append(madsChild)

    # write to our out_file object the results of lxml.etree.tostring,
    # passing in our root element
    out_file.write(etree.tostring(madsRoot,
                                  pretty_print=True,
                                  xml_declaration=True,
                                  encoding="UTF-8"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment