Skip to content

Instantly share code, notes, and snippets.

@atbradley
Created April 2, 2015 12:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save atbradley/03afd8fb366eb7592a29 to your computer and use it in GitHub Desktop.
Save atbradley/03afd8fb366eb7592a29 to your computer and use it in GitHub Desktop.
"Read" a Wordpress blog and update documents indexed in Solr with links to relevant posts, based on tags. Written to index the catalog for the Hall-Hoag collection (http://library.brown.edu/collatoz/info.php?id=62)
import json
from lxml import etree
import requests
from datetime import date
def findId(xml):
    """Return the catalog identifier attached to a feed item, if any.

    Scans the direct ``category`` children of *xml* and returns the text of
    the first one that starts with ``HH_``.

    Args:
        xml: An element exposing ``findall`` (here, an RSS ``item``).

    Returns:
        The matching category text, or ``False`` when no category matches.
    """
    for cat in xml.findall('category'):
        text = cat.text
        # An empty <category/> element has text None; skip it rather than
        # failing on None.startswith.
        if text and text.startswith('HH_'):
            return text
    return False
# Location of the Solr update handler that receives the document updates.
# The commit=true query parameter is part of the request URI, so every
# update posted to SOLR_URL carries it.
SOLR_HOST = 'localhost'
SOLR_PORT = 8983
SOLR_URI = '/solr/blacklight-core/update?commit=true'
SOLR_URL = "http://{}:{}{}".format(SOLR_HOST, SOLR_PORT, SOLR_URI)
# Echo the target so a run's output records which Solr instance was updated.
# The parenthesised single-argument form prints identically on Python 2 and 3.
print(SOLR_URL)
def solrUpdate(data, timeout=None):
    """POST one document update to Solr and return the HTTP response.

    *data* is wrapped in a one-element list, JSON-encoded, and sent to
    ``SOLR_URL`` with a ``Content-Type: application/json`` header. The
    encoded payload is printed to stdout once the request has returned.

    Args:
        data: JSON-serialisable update document.
        timeout: Optional timeout forwarded to ``requests.post``. The
            default ``None`` applies no timeout, matching the behaviour of
            calling ``requests.post`` without one.

    Returns:
        The response object returned by ``requests.post``.
    """
    msg = json.dumps([data])
    resp = requests.post(
        SOLR_URL,
        data=msg,
        headers={'Content-Type': 'application/json'},
        timeout=timeout,
    )
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print(msg)
    return resp
# Walk the blog's monthly archive feeds from January 2013 up to the current
# month. For every feed item carrying an HH_ category, post an update to Solr
# that adds the post's link to the matching document's source_link field.
yr = 2013
mt = 1
cdate = date(yr, mt, 1)
while cdate < date.today():
    url = "http://blogs.brown.edu/hallhoag/{}/{:02}/feed/".format(yr, mt)
    try:
        rss = etree.parse(url)
    except (IOError, etree.XMLSyntaxError) as err:
        # One unreadable or malformed monthly feed should not abort the whole
        # run (presumably this happens for months without posts -- TODO
        # confirm). Report it and move on to the next month.
        print("Skipping {}: {}".format(url, err))
        items = []
    else:
        items = rss.findall("//channel/item")
    for item in items:
        doc_id = findId(item)
        if doc_id:
            params = {
                'id': "US-RPB-{}".format(doc_id),
                # {'add': value} is sent as-is; it looks like Solr's
                # atomic-update "add" modifier -- confirm against the schema.
                'source_link': {'add': item.findtext('link')},
            }
            resp = solrUpdate(params)
            print(resp.status_code)
            print(resp.text)
    # Advance to the first day of the next month, rolling over the year.
    mt += 1
    if mt > 12:
        mt = 1
        yr += 1
    cdate = date(yr, mt, 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment