rlskoeser/zotero-rdf-tags.py

## zotero-rdf-tags.py
#! /usr/bin/env python

# simple script to pull data from a Zotero library RDF export
# and generate a CSV file with identifier, type of item, title,
# date, and the number of tags
#
# Only supports book and bookSection item types, all other items are ignored
#
# The CSV file will be generated with the same base name as the
# RDF input file.

# to install dependencies:
#    pip install rdflib unicodecsv
# example usage:
#    python zotero-rdf-tags.py zotero-export.rdf

import argparse
import codecs
import os
import rdflib
from rdflib.namespace import DC, FOAF, RDF, DCTERMS
from unicodecsv import DictWriter


# Command-line interface: one positional argument naming the RDF export file.
parser = argparse.ArgumentParser(description='Generate CSV from Zotero RDF')
parser.add_argument('filename', metavar='FILE',
                    help='Path to the Zotero RDF export')

args = parser.parse_args()

# Namespace used to look up Zotero-specific predicates such as itemType.
ZOTERO_NS = rdflib.Namespace("http://www.zotero.org/namespaces/export#")

# Load the export into an rdflib graph so it can be queried below.
g = rdflib.Graph()
g.parse(args.filename)

# Rows for the CSV output: one dict per supported item.
items = []


for book, itemtype in g.subject_objects(ZOTERO_NS.itemType):
    # for now, only handle books and book sections
    if str(itemtype) not in ("book", "bookSection"):
        continue
    tags = [str(t) for t in g.objects(book, DC.subject)]
    # use rdf about for id
    rdfid = str(book)
    # handle ids that are not full uris: when the id contains the input
    # file name, keep only the local item id after the anchor marker.
    # The '#' check keeps an id without an anchor unchanged instead of
    # raising IndexError on the split.
    if args.filename in rdfid and '#' in rdfid:
        rdfid = rdfid.split('#', 1)[1]

    items.append({
        'identifier': rdfid,
        'type': itemtype,
        'title': g.value(book, DC.title),
        'date': g.value(book, DC.date) or '',   # not all have dates
        'tags': ', '.join(tags),
        '# tags': len(tags),
        '# tags ending in Y': sum(1 for t in tags if t.endswith('Y')),
    })

# Items with the most tags ending in Y come first; the sort is stable, so
# items with equal counts keep their original relative order.
items.sort(key=lambda k: k['# tags ending in Y'], reverse=True)

# generate csv file name based on input file: same base name, .csv
# extension, written to the current working directory
filebase = os.path.splitext(os.path.basename(args.filename))[0]
csv_filename = '%s.csv' % filebase

# Open in binary mode: codecs.BOM_UTF8 is a bytes constant and unicodecsv
# encodes each row to bytes itself, so a text-mode handle fails on
# Python 3 and may translate newlines on Windows.
with open(csv_filename, 'wb') as csvfile:
    # write the utf-8 byte-order-mark at the very start of the file
    # (presumably so spreadsheet applications detect the encoding)
    csvfile.write(codecs.BOM_UTF8)
    fieldnames = ['identifier', 'type', 'title', 'date', '# tags',
                  '# tags ending in Y', 'tags']
    writer = DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(items)
	#! /usr/bin/env python

	# simple script to pull data from a Zotero library RDF export
	# and generate a CSV file with identifier, type of item, title,
	# date, and the number of tags
	#
	# Only supports book and bookSection item types, all other items are ignored
	#
	# The CSV file will be generated with the same base name as the
	# RDF input file.

	# to install dependencies:
	# pip install rdflib unicodecsv
	# example usage:
	# python zotero-rdf-tags.py zotero-export.rdf

	import argparse
	import codecs
	import os
	import rdflib
	from rdflib.namespace import DC, FOAF, RDF, DCTERMS
	from unicodecsv import DictWriter


	# Command-line interface: one positional argument naming the RDF export file.
	parser = argparse.ArgumentParser(description='Generate CSV from Zotero RDF')
	parser.add_argument('filename', metavar='FILE',
	                    help='Path to the Zotero RDF export')

	args = parser.parse_args()

	# Namespace used to look up Zotero-specific predicates such as itemType.
	ZOTERO_NS = rdflib.Namespace("http://www.zotero.org/namespaces/export#")

	# Load the export into an rdflib graph so it can be queried below.
	g = rdflib.Graph()
	g.parse(args.filename)

	# Rows for the CSV output: one dict per supported item.
	items = []


	for book, itemtype in g.subject_objects(ZOTERO_NS.itemType):
	    # for now, only handle books and book sections
	    if str(itemtype) not in ("book", "bookSection"):
	        continue
	    tags = [str(t) for t in g.objects(book, DC.subject)]
	    # use rdf about for id
	    rdfid = str(book)
	    # handle ids that are not full uris: when the id contains the input
	    # file name, keep only the local item id after the anchor marker.
	    # The '#' check keeps an id without an anchor unchanged instead of
	    # raising IndexError on the split.
	    if args.filename in rdfid and '#' in rdfid:
	        rdfid = rdfid.split('#', 1)[1]

	    items.append({
	        'identifier': rdfid,
	        'type': itemtype,
	        'title': g.value(book, DC.title),
	        'date': g.value(book, DC.date) or '',  # not all have dates
	        'tags': ', '.join(tags),
	        '# tags': len(tags),
	        '# tags ending in Y': sum(1 for t in tags if t.endswith('Y')),
	    })

	# Items with the most tags ending in Y come first; the sort is stable, so
	# items with equal counts keep their original relative order.
	items.sort(key=lambda k: k['# tags ending in Y'], reverse=True)

	# generate csv file name based on input file: same base name, .csv
	# extension, written to the current working directory
	filebase = os.path.splitext(os.path.basename(args.filename))[0]
	csv_filename = '%s.csv' % filebase

	# Open in binary mode: codecs.BOM_UTF8 is a bytes constant and unicodecsv
	# encodes each row to bytes itself, so a text-mode handle fails on
	# Python 3 and may translate newlines on Windows.
	with open(csv_filename, 'wb') as csvfile:
	    # write the utf-8 byte-order-mark at the very start of the file
	    # (presumably so spreadsheet applications detect the encoding)
	    csvfile.write(codecs.BOM_UTF8)
	    fieldnames = ['identifier', 'type', 'title', 'date', '# tags',
	                  '# tags ending in Y', 'tags']
	    writer = DictWriter(csvfile, fieldnames=fieldnames)
	    writer.writeheader()
	    writer.writerows(items)