# hbarrington/harvest.py

## harvest.py
#based on documentation from https://github.com/LearningRegistry/LearningRegistry/wiki/Consuming-Learning-Registry-Records

import urllib2
import urllib
import json
import urlparse
from pprint import pprint

def harvest(start_url):
    """Harvest every page of Learning Registry records starting at start_url.

    Each page is fetched with an HTTP GET and decoded as JSON, and the
    ``resource_data`` envelope of every entry under ``listrecords`` is handed
    to ``process_envelope``.  When a page carries a usable
    ``resumption_token`` the URL of the following page is queued, and the
    loop runs until no pages are left.

    Args:
        start_url: URL of the harvest ``listrecords`` endpoint to start from.

    Returns:
        None.

    Raises:
        Whatever ``urllib2.urlopen`` raises for a failed request; the request
        is made outside the ``try`` block.  Errors raised while decoding or
        walking a page are printed together with the page URL and the rest
        of that page is abandoned.
    """
    # Pages still waiting to be fetched, seeded with the root URL.
    urls = [start_url]
    while urls:
        lr_url = urls.pop()
        # Make an HTTP GET request to the LR harvest interface.
        resp = urllib2.urlopen(lr_url)
        try:
            # Parse JSON from the response body.
            data = json.loads(resp.read())
            records = data['listrecords']

            # Queue the following page exactly once per response.  Doing this
            # inside the per-record loop would enqueue the same URL once for
            # every record, so each later page would be harvested repeatedly.
            token = data.get('resumption_token')
            if token is not None and token != "null":
                # Rebuild the current URL with a query string that holds only
                # the resumption token; every other component is kept.
                url_parts = urlparse.urlparse(lr_url)
                new_query = urllib.urlencode({"resumption_token": token})
                next_url = urlparse.urlunparse((url_parts.scheme,
                                                url_parts.netloc,
                                                url_parts.path,
                                                url_parts.params,
                                                new_query,
                                                url_parts.fragment))
                urls.append(next_url)

            for item in records:
                # Only the LR envelope portion of each harvest result is used.
                process_envelope(item['record']['resource_data'])
        except Exception as ex:
            print(ex)
            print(lr_url)
        finally:
            # Release the connection whether or not the page could be read.
            resp.close()


def process_envelope(envelope):
    """Hand an envelope to the handler for the first schema it matches.

    The names listed under ``payload_schema`` are lower-cased (a missing key
    counts as an empty list) and checked, in order, against ``lom``,
    ``nsdl_dc``, ``lrmi`` and ``comm_para 1.0``.  Only the first match is
    dispatched; an envelope matching none of them is ignored.  Any exception
    raised during dispatch is printed, followed by ``Error In Payload``.

    See https://github.com/LearningRegistry/LearningRegistry/wiki/Common-Data-Schema-Formats-in-Learning-Registry

    NOTE(review): the ``process_*`` handlers are not defined in the visible
    part of this file -- confirm where they come from.
    """
    declared = envelope.get('payload_schema', [])
    schemas = set(name.lower() for name in declared)

    # Priority-ordered (schema name, handler call) pairs.  The lambdas defer
    # looking up each handler until its schema has actually been matched.
    dispatch = (
        ('lom', lambda: process_lom(envelope)),
        ('nsdl_dc', lambda: process_nsdl_dc(envelope)),
        ('lrmi', lambda: process_lrmi(envelope)),
        ('comm_para 1.0', lambda: process_comm_para(envelope)),
    )
    try:
        for schema_name, run_handler in dispatch:
            if schema_name in schemas:
                run_handler()
                break
    except Exception as ex:
        print(ex)
        print("Error In Payload")


# Runs at module level: start harvesting from the node01 public Learning
# Registry endpoint as soon as this file is executed or imported.
harvest(start_url="https://node01.public.learningregistry.net/harvest/listrecords")
	#based on documentation from https://github.com/LearningRegistry/LearningRegistry/wiki/Consuming-Learning-Registry-Records

	import urllib2
	import urllib
	import json
	import urlparse
	from pprint import pprint

	def harvest(start_url):
	    """Harvest every page of Learning Registry records starting at start_url.

	    Each page is fetched with an HTTP GET and decoded as JSON, and the
	    ``resource_data`` envelope of every entry under ``listrecords`` is
	    handed to ``process_envelope``.  When a page carries a usable
	    ``resumption_token`` the URL of the following page is queued, and the
	    loop runs until no pages are left.

	    Args:
	        start_url: URL of the harvest ``listrecords`` endpoint to start from.

	    Returns:
	        None.

	    Raises:
	        Whatever ``urllib2.urlopen`` raises for a failed request; the
	        request is made outside the ``try`` block.  Errors raised while
	        decoding or walking a page are printed together with the page URL
	        and the rest of that page is abandoned.
	    """
	    # Pages still waiting to be fetched, seeded with the root URL.
	    urls = [start_url]
	    while urls:
	        lr_url = urls.pop()
	        # Make an HTTP GET request to the LR harvest interface.
	        resp = urllib2.urlopen(lr_url)
	        try:
	            # Parse JSON from the response body.
	            data = json.loads(resp.read())
	            records = data['listrecords']

	            # Queue the following page exactly once per response; queuing
	            # it per record would harvest every later page repeatedly.
	            token = data.get('resumption_token')
	            if token is not None and token != "null":
	                # Rebuild the current URL with a query string that holds
	                # only the resumption token; other components are kept.
	                url_parts = urlparse.urlparse(lr_url)
	                new_query = urllib.urlencode({"resumption_token": token})
	                next_url = urlparse.urlunparse((url_parts.scheme,
	                                                url_parts.netloc,
	                                                url_parts.path,
	                                                url_parts.params,
	                                                new_query,
	                                                url_parts.fragment))
	                urls.append(next_url)

	            for item in records:
	                # Only the LR envelope portion of each result is used.
	                process_envelope(item['record']['resource_data'])
	        except Exception as ex:
	            print(ex)
	            print(lr_url)
	        finally:
	            # Release the connection whether or not the page could be read.
	            resp.close()


	def process_envelope(envelope):
	    """Hand an envelope to the handler for the first schema it matches.

	    The names listed under ``payload_schema`` are lower-cased (a missing
	    key counts as an empty list) and checked, in order, against ``lom``,
	    ``nsdl_dc``, ``lrmi`` and ``comm_para 1.0``.  Only the first match is
	    dispatched; an envelope matching none of them is ignored.  Any
	    exception raised during dispatch is printed, followed by
	    ``Error In Payload``.

	    See https://github.com/LearningRegistry/LearningRegistry/wiki/Common-Data-Schema-Formats-in-Learning-Registry

	    NOTE(review): the ``process_*`` handlers are not defined in the
	    visible part of this file -- confirm where they come from.
	    """
	    # Normalize casing so schema names compare case-insensitively.
	    schemas = {schema.lower() for schema in envelope.get('payload_schema', [])}
	    try:
	        if 'lom' in schemas:
	            process_lom(envelope)
	        elif 'nsdl_dc' in schemas:
	            process_nsdl_dc(envelope)
	        elif 'lrmi' in schemas:
	            process_lrmi(envelope)
	        elif 'comm_para 1.0' in schemas:
	            process_comm_para(envelope)
	    except Exception as ex:
	        print(ex)
	        print("Error In Payload")


	# Runs at module level: start harvesting from the node01 public Learning
	# Registry endpoint as soon as this file is executed or imported.
	harvest(start_url="https://node01.public.learningregistry.net/harvest/listrecords")