Last active
December 23, 2015 07:09
-
-
Save hbarrington/6598468 to your computer and use it in GitHub Desktop.
Learning Registry Harvest example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Based on the documentation at https://github.com/LearningRegistry/LearningRegistry/wiki/Consuming-Learning-Registry-Records
import urllib2 | |
import urllib | |
import json | |
import urlparse | |
from pprint import pprint | |
def harvest(start_url):
    """Harvest Learning Registry records page by page, starting at *start_url*.

    Each page is fetched with an HTTP GET and parsed as JSON. The
    ``resource_data`` envelope of every entry in ``listrecords`` is passed to
    ``process_envelope``. When the page carries a usable ``resumption_token``,
    the URL of the next page is queued and the loop continues.

    Errors raised while reading, parsing or processing a page are printed
    together with the page URL; because the next page is only queued on
    success, such an error ends the harvest. Errors raised by opening the URL
    itself are not caught here and propagate to the caller.

    Returns None.
    """
    # start with the root URL as the only URL to harvest from
    urls = [start_url]
    # keep going for as long as there is a URL left to harvest from
    while urls:
        # take the most recently queued URL (at most one is ever pending)
        lr_url = urls.pop()
        # make an HTTP GET request to the LR harvest interface
        resp = urllib2.urlopen(lr_url)
        try:
            # parse json from the response body
            data = json.loads(resp.read())
            # iterate over the results
            for record in data['listrecords']:
                # only the LR envelope portion of the harvest result matters here
                process_envelope(record['record']['resource_data'])
            # the service may send a JSON null or the literal string "null"
            # when there are no more pages
            token = data.get('resumption_token')
            if token is not None and token != "null":
                # reuse the original URL, replacing its query string with the
                # resumption token
                new_query = urllib.urlencode({"resumption_token": token})
                url_parts = urlparse.urlparse(lr_url)
                next_url = urlparse.urlunparse(url_parts._replace(query=new_query))
                # queue the URL for the next page of results
                urls.append(next_url)
        except Exception as ex:
            print(ex)
            print(lr_url)
        finally:
            # always release the HTTP connection, even when the page failed
            resp.close()
def process_envelope(envelope):
    """Dispatch one Learning Registry envelope to the handler for its schema.

    The entries of ``envelope['payload_schema']`` are lower-cased and checked,
    in this order, for ``lom``, ``nsdl_dc``, ``lrmi`` and ``comm_para 1.0``;
    the first match decides which ``process_*`` handler (defined elsewhere)
    receives the envelope. Envelopes with none of these schemas are ignored.

    Any exception, whether from a malformed ``payload_schema`` or from a
    handler, is printed followed by "Error In Payload" and is not re-raised,
    so one bad envelope cannot interrupt the caller's loop.

    Returns None.
    """
    try:
        # Normalize casing on all the schemas in the payload_schema array; a
        # missing or null payload_schema is treated as an empty array.
        #  - https://github.com/LearningRegistry/LearningRegistry/wiki/Common-Data-Schema-Formats-in-Learning-Registry
        # This lives inside the try so that a non-string schema entry is
        # reported as a payload error instead of escaping to the caller.
        schemas = {schema.lower() for schema in envelope.get('payload_schema') or ()}
        if 'lom' in schemas:
            process_lom(envelope)
        elif 'nsdl_dc' in schemas:
            process_nsdl_dc(envelope)
        elif 'lrmi' in schemas:
            process_lrmi(envelope)
        elif 'comm_para 1.0' in schemas:
            process_comm_para(envelope)
    except Exception as ex:
        print(ex)
        print("Error In Payload")
# Kick off a harvest against the public Learning Registry node.
harvest("https://node01.public.learningregistry.net/harvest/listrecords")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment