OU search results in JSON(-P) / version 1.1 / Scraperwiki.
"""
OU Search Results in JSON(-P), v1.1.
Copyright 2013-03-07 Nick Freear, The Open University. All rights reserved.
License:
Usage: https://views.scraperwiki.com/run/ou_search_v1/?callback=FN&format=json&where=openlearn&limit=5&q=Bang&filter=%2BTop/mime/text%23html
Source: <https://gist.github.com/nfreear/5111848>
Ideas: <http://stackoverflow.com/questions/2081586/web-scraping-with-python>
(See the example client at the foot of this file.)
"""
import os, cgi, re, json, urllib, urllib2
import scraperwiki
#from BeautifulSoup import BeautifulSoup
# or if you're using BeautifulSoup4:
from bs4 import BeautifulSoup
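
# Response skeleton -- these defaults are overridden by any matching
# query-string parameters parsed below ('q', 'limit', 'where', 'filter');
# the 'filter_ex*' entries merely illustrate the filter syntax.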
search_rsp = {
    'url' : 'http://search.open.ac.uk/',
    'where': 'public',
    'query': 'Bang',
    'limit': '5',
    'total': '0',
    'range': '1-1',
    'page' : '1',
    'filter' : None,
    'filter_ex': '+Top/mime/text#html',
    'filter_ex_encode': '%2BTop%2Fmime%2Ftext%23html',
    'results': []
}
search_url2 = '/search/results?'
p_search_where = re.compile(r'^(public|openlearn)$')
p_no_results = re.compile(r'No results for')
p_cb = re.compile(r'^[\$a-zA-Z_][\w\._]+$')
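
# Note: p_cb whitelists JSON-P callback names -- a JavaScript identifier
# or dotted path (e.g. "jQuery.handler_1"); anything else is rejected
# below, guarding against script injection via the 'callback' parameter.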
try:
    qsenv = dict(cgi.parse_qsl(os.getenv("QUERY_STRING")))
    if 'q' in qsenv:
        search_rsp['query'] = qsenv['q']
    #else:
    #    print "Error, 'q' search query parameter is missing."
    #    exit(-2)
    if 'limit' in qsenv:
        search_rsp['limit'] = qsenv['limit']
    if 'where' in qsenv:
        search_rsp['where'] = qsenv['where']
        m_where = p_search_where.match(search_rsp['where'])
        if not m_where:
            print "Error, invalid 'where' parameter: ", search_rsp['where']
            exit(-400)
    if 'filter' in qsenv:
        search_rsp['filter'] = qsenv['filter']
    if 'callback' in qsenv:
        callback = qsenv['callback']
        m_cb = p_cb.match(callback)
        if not m_cb:
            print "Error, invalid 'callback' parameter: ", callback
            exit(-4)
except:
    print "Error, missing parameters (exception)."
    exit(-1)
search_rsp['url'] += search_rsp['where'] + search_url2 + urllib.urlencode({ 'q': search_rsp['query'], 'hf': search_rsp['limit'] })
if search_rsp['filter']:
    search_rsp['url'] += '&' + urllib.urlencode({ 'r' : search_rsp['filter'] })
soup = BeautifulSoup(urllib2.urlopen(search_rsp['url']).read())
try:
    results_info = soup('span', {'id' : 'resultsInfo'})[0]
except:
    print "Error, unknown search error."
    exit(-1)
if results_info.find(text = p_no_results):
    #print "Warning, no results for: ", search_rsp['query']
    search_rsp['total'] = '0'
else:
    # Pull the total / range / page figures out of the child nodes
    # of the 'resultsInfo' span.
    search_rsp['total'] = results_info.contents[3].string
    search_rsp['range'] = results_info.contents[1].string
    search_rsp['page'] = results_info.contents[7].string
for a in soup.find_all('a', class_='openDocLink'):
    search_rsp['results'].append({
        # http://stackoverflow.com/questions/4488836/beautifulsoup-get-contents-as-a-single-string
        'text': ''.join(str(it) for it in a.contents),
        'text_plain': ' '.join(a.stripped_strings),
        #'text': str(a.renderContents()),
        #'text_plain': a.get_text(),
        'url' : a['href'],
        'type': ''.join(a.parent.find('a', href = re.compile('mime')).stripped_strings)
    })
scraperwiki.utils.httpresponseheader("Content-Type", "application/json; charset=utf-8")
scraperwiki.utils.httpresponseheader("Content-Disposition", "inline; filename=ou-search-v1.json")
# Not allowed :( -- ..httpresponseheader("Access-Control-Allow-Origin", "*")
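# With a valid ?callback=FN parameter the view emits JSON-P, e.g.
# (illustrative):  FN ( {"query": "Bang", ...} );
# which a browser can load via a <script src="..."> tag -- useful here,
# since the CORS header above cannot be set on ScraperWiki views.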
if 'callback' in qsenv:
    print callback, '(', json.dumps(search_rsp), ');'
else:
    print json.dumps(search_rsp)
exit(0)
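
Example client -- a minimal sketch, not part of the original view. It assumes the ScraperWiki view URL from the docstring's "Usage" line is live; omitting the 'callback' parameter returns plain JSON, which the Python 2 standard library can consume directly:

import json, urllib, urllib2

# Build the query string the view expects ('q', 'where', 'limit').
params = urllib.urlencode({'q': 'Bang', 'where': 'openlearn', 'limit': '5'})
rsp = json.load(urllib2.urlopen(
    'https://views.scraperwiki.com/run/ou_search_v1/?' + params))

# 'total' / 'range' and per-result 'url' / 'text_plain' keys come from
# the search_rsp structure the view serializes above.
print "%s results (showing %s)" % (rsp['total'], rsp['range'])
for result in rsp['results']:
    print result['url'], '-', result['text_plain']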