Skip to content

Instantly share code, notes, and snippets.

@rdhyee
Created November 9, 2012 14:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rdhyee/4045946 to your computer and use it in GitHub Desktop.
dp.la API query method
# Goal: feed a bunch of search terms to try to get at some collections
# something to compare to: https://gist.github.com/4046626
# API doc: https://github.com/dpla/platform/wiki
# data sources: http://dp.la/wiki/Platform_test_data_sources
import requests
import json
import urllib
from itertools import islice
# Retrieve an item by ID
# http://api.dp.la/v1/items/a4e2346032cae75b0832abe064c14bcb
# Retrieve multiple items by ID
# http://api.dp.la/v1/items/a4e2346032cae75b0832abe0644e9b26,a4e2346032cae75b0832abe064c14bcb
def dpla_query(**kw_input):
    """Generator that queries the dp.la items API, yielding (doc, total_count).

    Keyword arguments are passed straight through as API query parameters.
    The special keyword 'extras' may hold a dict of additional parameters
    whose keys cannot be spelled as Python identifiers (e.g.
    'spatial.coordinates').  Defaults: page_size=20, page=1, sort_order='asc'.
    Pages through results until the API returns an empty 'docs' list.

    Other API features not implemented here but documented upstream:
      * text search fields: title, description, dplaContributor, creator,
        type, publisher, format, rights, contributor, spatial
      * temporal filters, e.g.
        http://api.dp.la/v1/items?temporal.after=1963-11-01&temporal.before=1963-11-30
      * location search
    """
    kwargs = {"page_size": 20, "page": 1, "sort_order": "asc"}
    # fudgy -- allow an extra parameter to allow for ones that can fit
    # kw_input -- e.g., spatial.coordinates
    extras = kw_input.pop('extras', {})
    kw_input.update(extras)
    kwargs.update(kw_input)
    # drop parameters explicitly passed as None so they aren't urlencoded
    kwargs = dict((k, v) for (k, v) in kwargs.items() if v is not None)
    while True:
        r = requests.get("http://api.dp.la/v1/items?" + urllib.urlencode(kwargs))
        content = json.loads(r.content)
        if not content["docs"]:
            # empty page -> no more items
            return
        for doc in content["docs"]:
            yield (doc, content["count"])
        # walk pages backward when sorting descending, forward otherwise
        if kwargs['sort_order'] == 'desc':
            kwargs['page'] -= 1
            if kwargs['page'] < 1:
                # guard: page 0 is invalid; stop instead of requesting it
                return
        else:
            kwargs['page'] += 1
# search terms to feed in
SEARCH_TERMS = ["Bach", "tree", "horse", "cow", "Gore"]

# collect the distinct collection titles ('isPartOf' -> 'title') seen
# across all searches; docs with no 'isPartOf' contribute None
collections = set()
for term in SEARCH_TERMS:
    # cap each search at 100 docs; iterate the islice directly instead of
    # materializing it into a list first
    for (doc, count) in islice(dpla_query(q=term), 100):
        collections.add(doc.get('isPartOf', {'title': None}).get('title'))

# single-argument print() form runs under both Python 2 and Python 3
print(len(collections))
for collection in collections:
    print(collection)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment