Skip to content

Instantly share code, notes, and snippets.

@rlskoeser
Created November 3, 2016 16:28
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rlskoeser/d18c19a8351d97ca933b64fd26048b98 to your computer and use it in GitHub Desktop.
Save rlskoeser/d18c19a8351d97ca933b64fd26048b98 to your computer and use it in GitHub Desktop.
Python script to generate CSV summary data from a Zotero RDF library export
#! /usr/bin/env python
# simple script to pull data from a Zotero library RDF export
# and generate a CSV file with identifier, type of item, title,
# date, and the number of tags
#
# Only supports book and bookSection item types, all other items are ignored
#
# The CSV file will be generated with the same base name as the
# RDF input file.
# to install dependencies:
# pip install rdflib unicodecsv
# example usage:
# python zotero-rdf-tags.py zotero-export.rdf
import argparse
import codecs
import os
import rdflib
from rdflib.namespace import DC, FOAF, RDF, DCTERMS
from unicodecsv import DictWriter
# Command-line interface: one positional argument naming the RDF export file.
parser = argparse.ArgumentParser(description='Generate CSV from Zotero RDF')
parser.add_argument('filename', metavar='FILE',
                    help='Path to the Zotero RDF export')
args = parser.parse_args()

# Namespace for Zotero-specific export properties (itemType is looked up in it).
ZOTERO_NS = rdflib.Namespace("http://www.zotero.org/namespaces/export#")

# Parse the export file into an rdflib graph that the rest of the script queries.
g = rdflib.Graph()
g.parse(args.filename)
# Build one summary row (a dict keyed by CSV column name) per supported item.
items = []
for book, itemtype in g.subject_objects(ZOTERO_NS.itemType):
    # for now, only handle books and book sections
    if str(itemtype) not in ("book", "bookSection"):
        continue

    tags = [str(t) for t in g.objects(book, DC.subject)]

    # use rdf about for id
    rdfid = str(book)
    # handle ids that are not full uris: these contain the input file name,
    # so keep only the local item id after the anchor marker. Guard against
    # an id without '#', which previously raised IndexError.
    if args.filename in rdfid:
        _, marker, local_id = rdfid.partition('#')
        if marker:
            rdfid = local_id

    items.append({
        'identifier': rdfid,
        'type': itemtype,
        'title': g.value(book, DC.title),
        'date': g.value(book, DC.date) or '',  # not all have dates
        'tags': ', '.join(tags),
        '# tags': len(tags),
        '# tags ending in Y': sum(1 for t in tags if t.endswith('Y')),
    })

# Items with the most tags ending in "Y" come first; sorted() is stable, so
# ties keep the order in which the graph yielded them.
items = sorted(items, key=lambda k: k['# tags ending in Y'], reverse=True)
# generate csv file name based on input file: same base name, .csv extension,
# written to the current working directory
filebase, ext = os.path.splitext(os.path.basename(args.filename))
csv_filename = '%s.csv' % filebase

# Column order of the generated CSV.
fieldnames = ['identifier', 'type', 'title', 'date', '# tags',
              '# tags ending in Y', 'tags']

# Open in binary mode: codecs.BOM_UTF8 is a byte string and unicodecsv emits
# encoded bytes, so a text-mode handle fails on Python 3 and allows newline
# translation on Windows under Python 2.
with open(csv_filename, 'wb') as csvfile:
    # write byte-order-mark so that programs relying on it (e.g. spreadsheet
    # applications) recognise the file as utf-8 when opening it
    csvfile.write(codecs.BOM_UTF8)
    writer = DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for item in items:
        writer.writerow(item)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment