Skip to content

Instantly share code, notes, and snippets.

@hbarrington
Last active December 23, 2015 07:09
Show Gist options
  • Save hbarrington/6598468 to your computer and use it in GitHub Desktop.
Save hbarrington/6598468 to your computer and use it in GitHub Desktop.
Learning Registry Harvest example
#based on documentation from https://github.com/LearningRegistry/LearningRegistry/wiki/Consuming-Learning-Registry-Records
import urllib2
import urllib
import json
import urlparse
from pprint import pprint
def harvest(start_url):
    """Walk a Learning Registry harvest feed page by page.

    Starting from ``start_url``, each page is fetched and decoded as JSON,
    the ``resource_data`` envelope of every entry under ``listrecords`` is
    handed to ``process_envelope``, and, when the page carries a
    ``resumption_token``, the URL of the following page is queued.

    Any failure while fetching or handling a page is printed together with
    the URL that caused it. That page is then abandoned (its continuation is
    not queued) and the loop carries on with whatever is still queued.

    :param start_url: URL of the harvest ``listrecords`` endpoint to start at.
    :returns: None.
    """
    # URLs still waiting to be fetched, seeded with the root URL.
    pending = [start_url]
    while pending:
        # pop() takes the most recently queued URL. At most one URL is ever
        # queued at a time, so the order makes no practical difference.
        lr_url = pending.pop()
        try:
            # The request itself is inside the try so that a network failure
            # is reported by the handler below instead of aborting the run.
            resp = urllib2.urlopen(lr_url)
            try:
                # Parse JSON from the response body.
                data = json.loads(resp.read())
            finally:
                # Always release the connection, even if decoding fails.
                resp.close()

            for item in data['listrecords']:
                # Only the LR envelope portion of each harvest result is used.
                process_envelope(item['record']['resource_data'])

            # The token may be absent, null, or the literal string "null";
            # all three mean there is no further page.
            token = data.get('resumption_token')
            if token is not None and token != "null":
                # Reuse the original URL, replacing its query string with
                # one that contains only the resumption token.
                url_parts = urlparse.urlparse(lr_url)
                new_query = urllib.urlencode({"resumption_token": token})
                next_url = urlparse.urlunparse(
                    url_parts._replace(query=new_query))
                pending.append(next_url)
        except Exception as ex:
            print(ex)
            print(lr_url)
def process_envelope(envelope):
    """Dispatch a Learning Registry envelope to the handler for its schema.

    The names in the envelope's ``payload_schema`` list are lower-cased and
    checked, in order, for ``lom``, ``nsdl_dc``, ``lrmi`` and
    ``comm_para 1.0``; the first match decides which handler is called.
    Envelopes with none of these schemas are ignored. See
    https://github.com/LearningRegistry/LearningRegistry/wiki/Common-Data-Schema-Formats-in-Learning-Registry

    The handlers (``process_lom``, ``process_nsdl_dc``, ``process_lrmi``,
    ``process_comm_para``) are not defined in this function; they must be
    available at module level. Any exception raised while dispatching or
    handling is printed, followed by "Error In Payload", and not re-raised.

    :param envelope: mapping with an optional ``payload_schema`` entry.
    :returns: None.
    """
    # Normalize casing on all schema names. ``or []`` covers both a missing
    # key and a key that is present but null, which would otherwise raise
    # a TypeError outside the try block below.
    schemas = {schema.lower() for schema in envelope.get('payload_schema') or []}
    try:
        if 'lom' in schemas:
            process_lom(envelope)
        elif 'nsdl_dc' in schemas:
            process_nsdl_dc(envelope)
        elif 'lrmi' in schemas:
            process_lrmi(envelope)
        elif 'comm_para 1.0' in schemas:
            process_comm_para(envelope)
    except Exception as ex:
        print(ex)
        print("Error In Payload")
# Kick off the harvest against the public Learning Registry node's
# listrecords endpoint. This is a module-level call, so it runs whenever
# the file is executed or imported.
harvest("https://node01.public.learningregistry.net/harvest/listrecords")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment