wegrata/gist:6295401

## gistfile1.py
import urllib2
import json
import urlparse
import urllib
from pprint import pprint
from lxml import etree

def process_nsdl_dc(envelope):
    """Print the set of Dublin Core titles found in an NSDL DC envelope.

    Args:
        envelope: dict whose ``resource_data`` entry holds the NSDL DC XML
            document as a text or byte string.

    Raises:
        KeyError: if the envelope has no ``resource_data`` entry.
        lxml.etree.XMLSyntaxError: if the payload is not well-formed XML.
    """
    raw_xml = envelope['resource_data']
    if isinstance(raw_xml, bytes):
        dom = etree.fromstring(raw_xml)
    else:
        # Payloads decoded from JSON are text strings, and lxml refuses a text
        # string that carries an XML encoding declaration. Hand it UTF-8 bytes
        # and override whatever encoding the declaration names.
        dom = etree.fromstring(raw_xml.encode('utf-8'),
                               parser=etree.XMLParser(encoding='utf-8'))
    # XML namespace prefixes used by the XPath query below
    dc_namespaces = {
        "nsdl_dc": "http://ns.nsdl.org/nsdl_dc_v1.02/",
        "dc": "http://purl.org/dc/elements/1.1/",
        "dct": "http://purl.org/dc/terms/",
    }
    # select every dc:title directly under the nsdl_dc root element
    titles = dom.xpath('/nsdl_dc:nsdl_dc/dc:title',
                       namespaces=dc_namespaces)
    # a set collapses duplicate title strings before printing
    pprint({elm.text for elm in titles})

def process_lom(envelope):
    """Print the set of title strings found in an IEEE LOM envelope.

    Args:
        envelope: dict whose ``resource_data`` entry holds the LOM XML
            document as a text or byte string.

    Raises:
        KeyError: if the envelope has no ``resource_data`` entry.
        lxml.etree.XMLSyntaxError: if the payload is not well-formed XML.
    """
    raw_xml = envelope['resource_data']
    if isinstance(raw_xml, bytes):
        dom = etree.fromstring(raw_xml)
    else:
        # Payloads decoded from JSON are text strings, and lxml refuses a text
        # string that carries an XML encoding declaration. Hand it UTF-8 bytes
        # and override whatever encoding the declaration names.
        dom = etree.fromstring(raw_xml.encode('utf-8'),
                               parser=etree.XMLParser(encoding='utf-8'))
    # XML namespace prefix used by the XPath query below
    lom_namespaces = {
        "lom": "http://ltsc.ieee.org/xsd/LOM",
    }
    # select every general/title/string under any lom element in the document
    titles = dom.xpath("//lom:lom/lom:general/lom:title/lom:string",
                       namespaces=lom_namespaces)
    # a set collapses duplicate title strings before printing
    pprint({elm.text for elm in titles})

def process_comm_para(envelope):
    """Print the set of record IDs found in a comm_para envelope.

    Args:
        envelope: dict whose ``resource_data`` entry holds the comm_para XML
            document as a text or byte string.

    Raises:
        KeyError: if the envelope has no ``resource_data`` entry.
        lxml.etree.XMLSyntaxError: if the payload is not well-formed XML.
    """
    raw_xml = envelope['resource_data']
    if isinstance(raw_xml, bytes):
        dom = etree.fromstring(raw_xml)
    else:
        # Payloads decoded from JSON are text strings, and lxml refuses a text
        # string that carries an XML encoding declaration. Hand it UTF-8 bytes
        # and override whatever encoding the declaration names.
        dom = etree.fromstring(raw_xml.encode('utf-8'),
                               parser=etree.XMLParser(encoding='utf-8'))
    # XML namespace prefixes used by the XPath query below
    comm_namespaces = {
        "comm": "http://ns.nsdl.org/ncs/comm_para",
        "oai": "http://www.openarchives.org/OAI/2.0/",
    }
    # select every commParadata/recordId found under an oai:metadata element
    record_ids = dom.xpath("//oai:metadata/comm:commParadata/comm:recordId",
                           namespaces=comm_namespaces)
    # a set collapses duplicate record IDs before printing
    pprint({elm.text for elm in record_ids})

def process_lrmi(envelope):
    """Print the set of item names found in an LRMI envelope.

    Prints the marker line ``lrmi`` first, then the pretty-printed set.

    Args:
        envelope: dict whose ``resource_data`` entry is already-decoded JSON
            with an ``items`` list; every item must have a ``name`` entry
            holding either a single name or a list/tuple of names.

    Raises:
        KeyError: if ``resource_data``, ``items`` or an item's ``name`` is
            missing.
    """
    print('lrmi')
    names = set()
    # LRMI payloads are JSON, so no XML parsing is needed
    for item in envelope['resource_data']['items']:
        name = item['name']
        if isinstance(name, (list, tuple)):
            # a list is unhashable, so names.add() would raise TypeError;
            # fold each of the item's names into the set instead
            names.update(name)
        else:
            names.add(name)
    pprint(names)

def process_envelope(envelope):
    """Print an envelope's doc_ID and hand it to the matching schema handler.

    The entries of ``payload_schema`` are lower-cased and compared exactly
    against the known schema names. The first match in the order lom,
    nsdl_dc, lrmi, comm_para 1.0 decides which handler runs; an envelope
    with no known schema is only identified, not processed. Any exception
    raised by a handler is printed, followed by ``Error In Payload``.

    Args:
        envelope: dict with a ``doc_ID`` entry and, optionally, a
            ``payload_schema`` list of schema name strings.

    Raises:
        KeyError: if the envelope has no ``doc_ID`` entry.
    """
    print(envelope['doc_ID'])
    # a missing payload_schema is treated as "no schemas declared"
    declared = envelope.get('payload_schema', [])
    schemas = set(name.lower() for name in declared)
    try:
        if 'lom' in schemas:
            process_lom(envelope)
            return
        if 'nsdl_dc' in schemas:
            process_nsdl_dc(envelope)
            return
        if 'lrmi' in schemas:
            process_lrmi(envelope)
            return
        if 'comm_para 1.0' in schemas:
            process_comm_para(envelope)
    except Exception as ex:
        # a bad payload is reported and skipped so the caller can carry on
        print(ex)
        print("Error In Payload")

def harvest(start_url):
    """Fetch every page of a harvest listing and process its envelopes.

    Starting at ``start_url``, each page is downloaded and parsed as JSON,
    every record's envelope is passed to ``process_envelope``, and the
    page's ``resumption_token`` (if any) is used to build the URL of the
    next page. Errors are printed together with the page URL rather than
    raised.

    Args:
        start_url: URL of the ``listrecords`` harvest endpoint to start from.
    """
    # URLs still to fetch; each page contributes at most one follow-up URL
    urls = [start_url]
    while urls:
        lr_url = urls.pop()
        try:
            # the request is made inside the try so that a network failure is
            # reported like any other page error instead of aborting the run
            resp = urllib2.urlopen(lr_url)
            try:
                data = json.loads(resp.read())
            finally:
                # always release the connection, even if reading/parsing fails
                resp.close()
            for result in data['listrecords']:
                try:
                    # only the LR envelope portion of a harvest result is used
                    process_envelope(result['record']['resource_data'])
                except Exception as ex:
                    # one malformed record must not stop the remaining records
                    # or prevent the resumption token from being followed
                    print(ex)
                    print(lr_url)
            token = data.get('resumption_token')
            if token is not None and token != "null":
                # rebuild the current URL with a query string that carries
                # only the resumption token
                url_parts = urlparse.urlparse(lr_url)
                new_query = urllib.urlencode({"resumption_token": token})
                next_url = urlparse.urlunparse((url_parts.scheme,
                                                url_parts.netloc,
                                                url_parts.path,
                                                url_parts.params,
                                                new_query,
                                                url_parts.fragment))
                # queue the next page of results
                urls.append(next_url)
        except Exception as ex:
            print(ex)
            print(lr_url)


def main(start_url="https://node01.public.learningregistry.net/harvest/listrecords"):
    """Harvest a node's listrecords feed and print what is found.

    Args:
        start_url: harvest ``listrecords`` URL to begin from; defaults to
            the node01 public endpoint this script originally hard-coded.
    """
    harvest(start_url)

# run the harvest only when executed as a script, not when imported
if __name__ == "__main__":
    main()
	import urllib2
	import json
	import urlparse
	import urllib
	from pprint import pprint
	from lxml import etree

	def process_nsdl_dc(envelope):
	    """Print the set of Dublin Core titles found in an NSDL DC envelope.

	    Args:
	        envelope: dict whose ``resource_data`` entry holds the NSDL DC XML
	            document as a text or byte string.

	    Raises:
	        KeyError: if the envelope has no ``resource_data`` entry.
	        lxml.etree.XMLSyntaxError: if the payload is not well-formed XML.
	    """
	    raw_xml = envelope['resource_data']
	    if isinstance(raw_xml, bytes):
	        dom = etree.fromstring(raw_xml)
	    else:
	        # Payloads decoded from JSON are text strings, and lxml refuses a
	        # text string that carries an XML encoding declaration. Hand it
	        # UTF-8 bytes and override whatever encoding the declaration names.
	        dom = etree.fromstring(raw_xml.encode('utf-8'),
	                               parser=etree.XMLParser(encoding='utf-8'))
	    # XML namespace prefixes used by the XPath query below
	    dc_namespaces = {
	        "nsdl_dc": "http://ns.nsdl.org/nsdl_dc_v1.02/",
	        "dc": "http://purl.org/dc/elements/1.1/",
	        "dct": "http://purl.org/dc/terms/",
	    }
	    # select every dc:title directly under the nsdl_dc root element
	    titles = dom.xpath('/nsdl_dc:nsdl_dc/dc:title',
	                       namespaces=dc_namespaces)
	    # a set collapses duplicate title strings before printing
	    pprint({elm.text for elm in titles})

	def process_lom(envelope):
	    """Print the set of title strings found in an IEEE LOM envelope.

	    Args:
	        envelope: dict whose ``resource_data`` entry holds the LOM XML
	            document as a text or byte string.

	    Raises:
	        KeyError: if the envelope has no ``resource_data`` entry.
	        lxml.etree.XMLSyntaxError: if the payload is not well-formed XML.
	    """
	    raw_xml = envelope['resource_data']
	    if isinstance(raw_xml, bytes):
	        dom = etree.fromstring(raw_xml)
	    else:
	        # Payloads decoded from JSON are text strings, and lxml refuses a
	        # text string that carries an XML encoding declaration. Hand it
	        # UTF-8 bytes and override whatever encoding the declaration names.
	        dom = etree.fromstring(raw_xml.encode('utf-8'),
	                               parser=etree.XMLParser(encoding='utf-8'))
	    # XML namespace prefix used by the XPath query below
	    lom_namespaces = {
	        "lom": "http://ltsc.ieee.org/xsd/LOM",
	    }
	    # select every general/title/string under any lom element
	    titles = dom.xpath("//lom:lom/lom:general/lom:title/lom:string",
	                       namespaces=lom_namespaces)
	    # a set collapses duplicate title strings before printing
	    pprint({elm.text for elm in titles})

	def process_comm_para(envelope):
	    """Print the set of record IDs found in a comm_para envelope.

	    Args:
	        envelope: dict whose ``resource_data`` entry holds the comm_para
	            XML document as a text or byte string.

	    Raises:
	        KeyError: if the envelope has no ``resource_data`` entry.
	        lxml.etree.XMLSyntaxError: if the payload is not well-formed XML.
	    """
	    raw_xml = envelope['resource_data']
	    if isinstance(raw_xml, bytes):
	        dom = etree.fromstring(raw_xml)
	    else:
	        # Payloads decoded from JSON are text strings, and lxml refuses a
	        # text string that carries an XML encoding declaration. Hand it
	        # UTF-8 bytes and override whatever encoding the declaration names.
	        dom = etree.fromstring(raw_xml.encode('utf-8'),
	                               parser=etree.XMLParser(encoding='utf-8'))
	    # XML namespace prefixes used by the XPath query below
	    comm_namespaces = {
	        "comm": "http://ns.nsdl.org/ncs/comm_para",
	        "oai": "http://www.openarchives.org/OAI/2.0/",
	    }
	    # select every commParadata/recordId found under an oai:metadata element
	    record_ids = dom.xpath("//oai:metadata/comm:commParadata/comm:recordId",
	                           namespaces=comm_namespaces)
	    # a set collapses duplicate record IDs before printing
	    pprint({elm.text for elm in record_ids})

	def process_lrmi(envelope):
	    """Print the set of item names found in an LRMI envelope.

	    Prints the marker line ``lrmi`` first, then the pretty-printed set.

	    Args:
	        envelope: dict whose ``resource_data`` entry is already-decoded
	            JSON with an ``items`` list; every item must have a ``name``
	            entry holding either a single name or a list/tuple of names.

	    Raises:
	        KeyError: if ``resource_data``, ``items`` or an item's ``name`` is
	            missing.
	    """
	    print('lrmi')
	    names = set()
	    # LRMI payloads are JSON, so no XML parsing is needed
	    for item in envelope['resource_data']['items']:
	        name = item['name']
	        if isinstance(name, (list, tuple)):
	            # a list is unhashable, so names.add() would raise TypeError;
	            # fold each of the item's names into the set instead
	            names.update(name)
	        else:
	            names.add(name)
	    pprint(names)

	def process_envelope(envelope):
	    """Print an envelope's doc_ID and hand it to the matching schema handler.

	    The entries of ``payload_schema`` are lower-cased and compared exactly
	    against the known schema names. The first match in the order lom,
	    nsdl_dc, lrmi, comm_para 1.0 decides which handler runs; an envelope
	    with no known schema is only identified, not processed. Any exception
	    raised by a handler is printed, followed by ``Error In Payload``.

	    Args:
	        envelope: dict with a ``doc_ID`` entry and, optionally, a
	            ``payload_schema`` list of schema name strings.

	    Raises:
	        KeyError: if the envelope has no ``doc_ID`` entry.
	    """
	    print(envelope['doc_ID'])
	    # a missing payload_schema is treated as "no schemas declared"
	    schemas = {schema.lower() for schema in envelope.get('payload_schema', [])}
	    try:
	        if 'lom' in schemas:
	            process_lom(envelope)
	        elif 'nsdl_dc' in schemas:
	            process_nsdl_dc(envelope)
	        elif 'lrmi' in schemas:
	            process_lrmi(envelope)
	        elif 'comm_para 1.0' in schemas:
	            process_comm_para(envelope)
	    except Exception as ex:
	        # a bad payload is reported and skipped so the caller can carry on
	        print(ex)
	        print("Error In Payload")

	def harvest(start_url):
	    """Fetch every page of a harvest listing and process its envelopes.

	    Starting at ``start_url``, each page is downloaded and parsed as JSON,
	    every record's envelope is passed to ``process_envelope``, and the
	    page's ``resumption_token`` (if any) is used to build the URL of the
	    next page. Errors are printed together with the page URL rather than
	    raised.

	    Args:
	        start_url: URL of the ``listrecords`` harvest endpoint to start
	            from.
	    """
	    # URLs still to fetch; each page contributes at most one follow-up URL
	    urls = [start_url]
	    while urls:
	        lr_url = urls.pop()
	        try:
	            # the request is made inside the try so that a network failure
	            # is reported like any other page error
	            resp = urllib2.urlopen(lr_url)
	            try:
	                data = json.loads(resp.read())
	            finally:
	                # always release the connection, even if parsing fails
	                resp.close()
	            for result in data['listrecords']:
	                try:
	                    # only the LR envelope portion of a result is used
	                    process_envelope(result['record']['resource_data'])
	                except Exception as ex:
	                    # one malformed record must not stop the remaining
	                    # records or the follow-up page
	                    print(ex)
	                    print(lr_url)
	            token = data.get('resumption_token')
	            if token is not None and token != "null":
	                # rebuild the current URL with a query string that carries
	                # only the resumption token
	                url_parts = urlparse.urlparse(lr_url)
	                new_query = urllib.urlencode({"resumption_token": token})
	                next_url = urlparse.urlunparse((url_parts.scheme,
	                                                url_parts.netloc,
	                                                url_parts.path,
	                                                url_parts.params,
	                                                new_query,
	                                                url_parts.fragment))
	                # queue the next page of results
	                urls.append(next_url)
	        except Exception as ex:
	            print(ex)
	            print(lr_url)


	def main(start_url="https://node01.public.learningregistry.net/harvest/listrecords"):
	    """Harvest a node's listrecords feed and print what is found.

	    Args:
	        start_url: harvest ``listrecords`` URL to begin from; defaults to
	            the node01 public endpoint this script originally hard-coded.
	    """
	    harvest(start_url)

	# run the harvest only when executed as a script, not when imported
	if __name__ == "__main__":
	    main()