Skip to content

Instantly share code, notes, and snippets.

@rlskoeser
Last active December 12, 2015 01:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rlskoeser/4693891 to your computer and use it in GitHub Desktop.
Save rlskoeser/4693891 to your computer and use it in GitHub Desktop.
Script to generate GeoRSS from CSV files generated by NameDropper lookup-names script. See http://disc.library.emory.edu/networkingbelfast/places-in-around-the-world-in-80-days/
#!/usr/bin/env python
# to get dependencies: pip install unicodecsv eulxml namedropper
# as of 2013/02/01 this requires the development version of namedropper,
# but should work with the 0.3 version once it is released
from collections import OrderedDict
import glob
import unicodecsv
from datetime import datetime
from os.path import basename, splitext
import sys
import logging
from eulxml import xmlmap
from namedropper.spotlight import DBpediaResource
logging.basicConfig(level=logging.INFO)
def main():
filelist = glob.iglob('./chapter??.csv')
# order based on when a name first occurs
ids = OrderedDict()
file_count = 0
for csv_filename in filelist:
file_label, ext = splitext(basename(csv_filename))
# get number from filename (chapter##)
chapter_number = int(file_label[len('chapter'):]) + 1
print >> sys.stderr, "Chapter %d" % chapter_number
file_count += 1
with open(csv_filename, 'rb') as csvfile:
# infer field names from header
csvreader = unicodecsv.DictReader(csvfile)
for row in csvreader:
uri = row['URI']
if uri not in ids:
ids[uri] = {
'count': 1,
'label': row['Name'], # store surface form in case label lookup fails
'text': '<p><a href="%s">DBpedia record</a></p>' \
% (uri)
}
# verbose mode
print >> sys.stderr, uri
else:
ids[uri]['count'] += 1
# highlight surface form of the annotation in context
txt = row['Context'].replace(row['Name'],
'<b>%s</b>' % row['Name'])
ids[uri]['text'] += '<p><i>Chapter %d</i>: ..%s..</p>\n' % \
(chapter_number, txt)
print >> sys.stderr, 'Found %s unique ids in %s files' % \
(len(ids.keys()), file_count)
feed = GeoRSSFeed(version='2.0',
title='Places in "Around the World in 80 Days"',
description='Places mentioned in the text of Jules Verne\'s' + \
'"Around the World in 80 Days" as identified by DBpedia Spotlight',
pub_date=datetime.now()
)
for uri, data in ids.iteritems():
res = DBpediaResource(uri)
feed.items.append(GeoRSSEntry(
title=res.label or data['label'], # note: could include count of occurrences in label...
description=data['text'],
latitude=res.latitude,
longitude=res.longitude
))
print feed.serialize(pretty=True)
# georss xml objects
class GeoRSSEntry(xmlmap.XmlObject):
    """A single GeoRSS ``<item>``: a place with title, link,
    description, and W3C Basic Geo coordinates."""
    # namespace prefix for the geo:lat / geo:long elements below
    ROOT_NAMESPACES = {
        'geo': 'http://www.w3.org/2003/01/geo/wgs84_pos#'
    }
    ROOT_NAME = 'item'
    title = xmlmap.StringField('title')
    link = xmlmap.StringField('link')
    # coordinates are mapped as strings; values come straight from
    # DBpediaResource in main()
    latitude = xmlmap.StringField('geo:lat')
    longitude = xmlmap.StringField('geo:long')
    description = xmlmap.StringField('description')
class GeoRSSFeed(xmlmap.XmlObject):
    """Top-level GeoRSS feed: an ``<rss>`` document whose single
    ``<channel>`` holds the feed metadata and the list of items."""
    # same W3C Basic Geo namespace as GeoRSSEntry so nested items
    # serialize with a consistent prefix
    ROOT_NAMESPACES = {
        'geo': 'http://www.w3.org/2003/01/geo/wgs84_pos#'
    }
    ROOT_NAME = 'rss'
    # RSS version attribute on the root element (set to '2.0' in main())
    version = xmlmap.StringField('@version')
    title = xmlmap.StringField('channel/title')
    description = xmlmap.StringField('channel/description')
    link = xmlmap.StringField('channel/link')
    pub_date = xmlmap.DateTimeField('channel/pubDate')
    items = xmlmap.NodeListField('channel/item', GeoRSSEntry)
# run only when executed as a script, not when imported
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment