Skip to content

Instantly share code, notes, and snippets.

@wegrata
Last active December 21, 2015 10:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wegrata/6295401 to your computer and use it in GitHub Desktop.
Save wegrata/6295401 to your computer and use it in GitHub Desktop.
import urllib2
import json
import urlparse
import urllib
from pprint import pprint
from lxml import etree
def process_nsdl_dc(envelope):
    """Print the set of document titles found in an NSDL DC envelope.

    Args:
        envelope: mapping whose 'resource_data' value is the NSDL DC XML
            document as a string.

    Any error raised while parsing the XML or running the XPath query
    propagates to the caller.
    """
    # prefix -> namespace URI map used to resolve the prefixes in the query
    ns_map = {"nsdl_dc": "http://ns.nsdl.org/nsdl_dc_v1.02/",
              "dc": "http://purl.org/dc/elements/1.1/",
              "dct": "http://purl.org/dc/terms/"}
    # turn the raw payload into an lxml element tree
    root = etree.fromstring(envelope['resource_data'])
    # select every dc:title directly under the nsdl_dc root element
    title_nodes = root.xpath('/nsdl_dc:nsdl_dc/dc:title',
                             namespaces=ns_map)
    # collapse duplicate titles by collecting the element text into a set
    unique_titles = set(node.text for node in title_nodes)
    pprint(unique_titles)
def process_lom(envelope):
    """Print the set of document titles found in an IEEE LOM envelope.

    Args:
        envelope: mapping whose 'resource_data' value is the LOM XML
            document as a string.

    Any error raised while parsing the XML or running the XPath query
    propagates to the caller.
    """
    # prefix -> namespace URI map used to resolve the prefix in the query
    ns_map = {"lom": "http://ltsc.ieee.org/xsd/LOM"}
    # turn the raw payload into an lxml element tree
    root = etree.fromstring(envelope['resource_data'])
    # select the title strings of every lom element anywhere in the document
    title_nodes = root.xpath("//lom:lom/lom:general/lom:title/lom:string",
                             namespaces=ns_map)
    # collapse duplicate titles by collecting the element text into a set
    unique_titles = set(node.text for node in title_nodes)
    pprint(unique_titles)
def process_comm_para(envelope):
    """Print the set of record IDs found in a comm_para envelope.

    Args:
        envelope: mapping whose 'resource_data' value is the comm_para XML
            document as a string.

    Any error raised while parsing the XML or running the XPath query
    propagates to the caller.
    """
    # prefix -> namespace URI map used to resolve the prefixes in the query
    ns_map = {
        "comm": "http://ns.nsdl.org/ncs/comm_para",
        "oai": "http://www.openarchives.org/OAI/2.0/",
    }
    # turn the raw payload into an lxml element tree
    root = etree.fromstring(envelope['resource_data'])
    # select the recordId of every commParadata element inside oai:metadata
    id_nodes = root.xpath("//oai:metadata/comm:commParadata/comm:recordId",
                          namespaces=ns_map)
    # collapse duplicate record IDs by collecting the element text into a set
    unique_ids = set(node.text for node in id_nodes)
    pprint(unique_ids)
def process_lrmi(envelope):
    """Print the set of item names found in an LRMI envelope.

    The LRMI payload is used as already-decoded JSON, so no XML parsing is
    involved.  Items without a 'name' key are skipped so that one incomplete
    item does not discard the names of all the others, and a list/tuple of
    names contributes each of its entries.

    Args:
        envelope: mapping whose 'resource_data' value is a mapping holding an
            'items' sequence of dict-like items.

    Raises:
        KeyError: if 'resource_data' or 'items' is missing.
    """
    print('lrmi')
    names = set()
    for item in envelope['resource_data']['items']:
        if 'name' not in item:
            # nothing to report for this item; keep going with the rest
            continue
        value = item['name']
        if isinstance(value, (list, tuple)):
            # several names on one item: lists are unhashable, so add each
            # entry instead of the container itself
            names.update(value)
        else:
            names.add(value)
    # print all the names
    pprint(names)
def process_envelope(envelope):
    """Print an envelope's doc_ID and hand it to the matching schema handler.

    Schema names from 'payload_schema' are compared case-insensitively.  Only
    the first matching handler runs, in the precedence order listed below.
    Any exception raised by a handler is printed, followed by
    "Error In Payload", and is not re-raised.

    Args:
        envelope: mapping with a 'doc_ID' key and, optionally, a
            'payload_schema' list of schema name strings.
    """
    print(envelope['doc_ID'])
    # lower-case every declared schema; a missing payload_schema means none
    declared = set(name.lower() for name in envelope.get('payload_schema', []))
    try:
        # (schema name, handler) pairs, highest precedence first
        dispatch = (
            ('lom', process_lom),
            ('nsdl_dc', process_nsdl_dc),
            ('lrmi', process_lrmi),
            ('comm_para 1.0', process_comm_para),
        )
        for schema_name, handler in dispatch:
            if schema_name in declared:
                handler(envelope)
                # only the first matching schema is processed
                break
    except Exception as ex:
        print(ex)
        print("Error In Payload")
def harvest(start_url):
    """Walk the LR harvest listing page by page, starting at *start_url*.

    Each page is fetched with an HTTP GET and decoded as JSON, and the
    envelope of every entry in 'listrecords' is passed to process_envelope().
    When a page carries a resumption token, the URL of the following page is
    queued by replacing the query string of the current URL.

    Args:
        start_url: URL of the harvest 'listrecords' endpoint to start from.

    Raises:
        Whatever urllib2.urlopen raises when a request cannot be made.
        Errors while reading, decoding or processing a page are printed
        together with the page URL instead of being raised.
    """
    # pending page URLs, seeded with the root URL
    urls = [start_url]
    # keep going while there is a page left to fetch
    while urls:
        # take the next pending URL (the list never holds more than one)
        lr_url = urls.pop()
        # make an HTTP GET request to the LR harvest interface
        resp = urllib2.urlopen(lr_url)
        try:
            try:
                body = resp.read()
            finally:
                # close as soon as the body is read so the connection is
                # not leaked
                resp.close()
            # parse json from the response body
            data = json.loads(body)
            # iterate over the results
            for i in data['listrecords']:
                try:
                    # only the LR envelope portion of the harvest result
                    # is of interest here
                    envelope = i['record']['resource_data']
                    process_envelope(envelope)
                except Exception as ex:
                    # a single malformed record must not abort the rest of
                    # the page or stop pagination
                    print(ex)
                    print(lr_url)
            # if there is a resumption token, queue the next page
            token = data.get('resumption_token')
            if token is not None and token != "null":
                # rebuild the current URL with a query string that holds
                # only the resumption_token
                url_parts = urlparse.urlparse(lr_url)
                new_query = urllib.urlencode({"resumption_token": token})
                next_url = urlparse.urlunparse((url_parts.scheme,
                                                url_parts.netloc,
                                                url_parts.path,
                                                url_parts.params,
                                                new_query,
                                                url_parts.fragment))
                urls.append(next_url)
        except Exception as ex:
            print(ex)
            print(lr_url)
def main():
    """Run a harvest against the node01 public listrecords endpoint."""
    root_url = "https://node01.public.learningregistry.net/harvest/listrecords"
    harvest(root_url)


# start harvesting only when executed as a script, not when imported
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment