Last active
December 21, 2015 10:59
-
-
Save wegrata/6295401 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib2 | |
import json | |
import urlparse | |
import urllib | |
from pprint import pprint | |
from lxml import etree | |
def process_nsdl_dc(envelope):
    """Print the set of dc:title values found in an nsdl_dc envelope."""
    # build a DOM from the raw XML payload carried in the envelope
    doc = etree.fromstring(envelope['resource_data'])
    # prefix -> namespace URI map used by the XPath query below
    ns_map = {
        "nsdl_dc": "http://ns.nsdl.org/nsdl_dc_v1.02/",
        "dc": "http://purl.org/dc/elements/1.1/",
        "dct": "http://purl.org/dc/terms/",
    }
    # pull every document title element out of the tree
    title_elements = doc.xpath('/nsdl_dc:nsdl_dc/dc:title', namespaces=ns_map)
    # collapse the elements to their unique text values and print them
    pprint(set(elm.text for elm in title_elements))
def process_lom(envelope):
    """Print the set of title strings found in an IEEE LOM envelope."""
    # build a DOM from the raw XML payload carried in the envelope
    doc = etree.fromstring(envelope['resource_data'])
    # prefix -> namespace URI map used by the XPath query below
    ns_map = {"lom": "http://ltsc.ieee.org/xsd/LOM"}
    # pull every general/title string element out of the tree
    title_elements = doc.xpath("//lom:lom/lom:general/lom:title/lom:string",
                               namespaces=ns_map)
    # collapse the elements to their unique text values and print them
    pprint(set(elm.text for elm in title_elements))
def process_comm_para(envelope):
    """Print the set of recordId values found in a comm_para envelope."""
    # build a DOM from the raw XML payload carried in the envelope
    doc = etree.fromstring(envelope['resource_data'])
    # prefix -> namespace URI map used by the XPath query below
    ns_map = {
        "comm": "http://ns.nsdl.org/ncs/comm_para",
        "oai": "http://www.openarchives.org/OAI/2.0/",
    }
    # pull every record ID element out of the tree
    id_elements = doc.xpath("//oai:metadata/comm:commParadata/comm:recordId",
                            namespaces=ns_map)
    # collapse the elements to their unique text values and print them
    pprint(set(elm.text for elm in id_elements))
def process_lrmi(envelope):
    """Print the set of item names found in an LRMI envelope.

    LRMI payloads are plain JSON (already a dict here), so no XML
    parsing is needed — just walk resource_data['items'].
    """
    # NOTE: the stray debug print('lrmi') was removed; the sibling
    # processors print only their result set, and this one now matches.
    names = set()
    for item in envelope['resource_data']['items']:
        # collect the name from each item into a de-duplicating set
        names.add(item['name'])
    # print all the unique names
    pprint(names)
def process_envelope(envelope): | |
print(envelope['doc_ID']) | |
#normalize casing on all the schemas in the payload_schema array, if payload_schema isn't present use an empty array | |
schemas = {schema.lower() for schema in envelope.get('payload_schema', [])} | |
try: | |
if 'lom' in schemas: | |
process_lom(envelope) | |
elif 'nsdl_dc' in schemas: | |
process_nsdl_dc(envelope) | |
elif 'lrmi' in schemas: | |
process_lrmi(envelope) | |
elif 'comm_para 1.0' in schemas: | |
process_comm_para(envelope) | |
except Exception as ex: | |
print(ex) | |
print("Error In Payload") | |
def harvest(start_url):
    """Harvest every envelope reachable from an LR listrecords URL.

    Follows resumption_token pagination until the service stops
    returning a usable token.  A failure on one page is printed and
    harvesting continues with any remaining queued URLs.
    """
    # stack of harvest-interface URLs still to fetch
    urls = [start_url]
    while urls:
        # take the next URL to pull LR documents from
        # (pop() takes the most recently queued URL — the original
        # comment said "first", but LIFO order is equivalent here
        # since at most one URL is ever queued at a time)
        lr_url = urls.pop()
        # HTTP GET against the LR harvest interface
        resp = urllib2.urlopen(lr_url)
        try:
            # parse JSON from the response body
            data = json.loads(resp.read())
            for result in data['listrecords']:
                # only the LR envelope portion of each result matters here
                process_envelope(result['record']['resource_data'])
            # a usable resumption token means there is another page
            token = data.get('resumption_token')
            if token is not None and token != "null":
                # rebuild the original URL with the token as the new query string
                url_parts = urlparse.urlparse(lr_url)
                new_query = urllib.urlencode({"resumption_token": token})
                next_url = urlparse.urlunparse((url_parts.scheme,
                                                url_parts.netloc,
                                                url_parts.path,
                                                url_parts.params,
                                                new_query,
                                                url_parts.fragment))
                # queue the next page of results
                urls.append(next_url)
        except Exception as ex:
            # report the failing page and keep going
            print(ex)
            print(lr_url)
        finally:
            # always release the HTTP connection (previously leaked)
            resp.close()
def main():
    """Entry point: harvest the public Learning Registry node."""
    root_url = "https://node01.public.learningregistry.net/harvest/listrecords"
    harvest(root_url)
# Standard script entry guard: run the harvester only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment