Skip to content

Instantly share code, notes, and snippets.

@rlskoeser
Last active December 12, 2015 01:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rlskoeser/4693891 to your computer and use it in GitHub Desktop.
Save rlskoeser/4693891 to your computer and use it in GitHub Desktop.
Script to generate GeoRSS from CSV files generated by NameDropper lookup-names script. See http://disc.library.emory.edu/networkingbelfast/places-in-around-the-world-in-80-days/
#!/usr/bin/env python
# to get dependencies: pip install unicodecsv eulxml namedropper
# as of 2013/02/01 this requires the development version of namedropper,
# but should work with the 0.3 version once it is released
from collections import OrderedDict
import glob
import unicodecsv
from datetime import datetime
from os.path import basename, splitext
import sys
import logging
from eulxml import xmlmap
from namedropper.spotlight import DBpediaResource
logging.basicConfig(level=logging.INFO)
def main():
filelist = glob.iglob('./chapter??.csv')
# order based on when a name first occurs
ids = OrderedDict()
file_count = 0
for csv_filename in filelist:
file_label, ext = splitext(basename(csv_filename))
# get number from filename (chapter##)
chapter_number = int(file_label[len('chapter'):]) + 1
print >> sys.stderr, "Chapter %d" % chapter_number
file_count += 1
with open(csv_filename, 'rb') as csvfile:
# infer field names from header
csvreader = unicodecsv.DictReader(csvfile)
for row in csvreader:
uri = row['URI']
if uri not in ids:
ids[uri] = {
'count': 1,
'label': row['Name'], # store surface form in case label lookup fails
'text': '<p><a href="%s">DBpedia record</a></p>' \
% (uri)
}
# verbose mode
print >> sys.stderr, uri
else:
ids[uri]['count'] += 1
# highlight surface form of the annotation in context
txt = row['Context'].replace(row['Name'],
'<b>%s</b>' % row['Name'])
ids[uri]['text'] += '<p><i>Chapter %d</i>: ..%s..</p>\n' % \
(chapter_number, txt)
print >> sys.stderr, 'Found %s unique ids in %s files' % \
(len(ids.keys()), file_count)
feed = GeoRSSFeed(version='2.0',
title='Places in "Around the World in 80 Days"',
description='Places mentioned in the text of Jules Verne\'s' + \
'"Around the World in 80 Days" as identified by DBpedia Spotlight',
pub_date=datetime.now()
)
for uri, data in ids.iteritems():
res = DBpediaResource(uri)
feed.items.append(GeoRSSEntry(
title=res.label or data['label'], # note: could include count of occurrences in label...
description=data['text'],
latitude=res.latitude,
longitude=res.longitude
))
print feed.serialize(pretty=True)
# georss xml objects
class GeoRSSEntry(xmlmap.XmlObject):
    """A single GeoRSS ``<item>``: a place with title, link,
    description, and W3C Basic Geo coordinates."""
    # namespace prefix for the geo:lat / geo:long elements below
    ROOT_NAMESPACES = {
        'geo': 'http://www.w3.org/2003/01/geo/wgs84_pos#'
    }
    ROOT_NAME = 'item'
    title = xmlmap.StringField('title')
    link = xmlmap.StringField('link')
    # coordinates are mapped as strings; values come straight from
    # DBpediaResource in main()
    latitude = xmlmap.StringField('geo:lat')
    longitude = xmlmap.StringField('geo:long')
    description = xmlmap.StringField('description')
class GeoRSSFeed(xmlmap.XmlObject):
    """Top-level GeoRSS feed: an ``<rss>`` document whose single
    ``<channel>`` holds the feed metadata and the list of items."""
    # same W3C Basic Geo namespace as GeoRSSEntry so nested items
    # serialize with a consistent prefix
    ROOT_NAMESPACES = {
        'geo': 'http://www.w3.org/2003/01/geo/wgs84_pos#'
    }
    ROOT_NAME = 'rss'
    # RSS version attribute on the root element (set to '2.0' in main())
    version = xmlmap.StringField('@version')
    title = xmlmap.StringField('channel/title')
    description = xmlmap.StringField('channel/description')
    link = xmlmap.StringField('channel/link')
    pub_date = xmlmap.DateTimeField('channel/pubDate')
    items = xmlmap.NodeListField('channel/item', GeoRSSEntry)
# run only when executed as a script, not when imported
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment