# retorquere/cited.py

## cited.py
#!/usr/bin/env python

from zipfile import ZipFile
import os
import csv
import sys
import re
import json
from pprint import pprint
import xml.etree.ElementTree as ET

# Text of a Word field code holding a Zotero citation: a fixed marker followed
# by the citation as a JSON object that runs to the end of the field text.
ZOTERO_CITATION = re.compile(r'ADDIN ZOTERO_ITEM CSL_CITATION (\{.*\})$')
WORD_NAMESPACES = { 'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main' }


def load_library(library_path):
  """Read a CSV library export and map each item key to a display title.

  The CSV needs a header row naming the 'Key', 'Title', 'Author' and
  'Publication Year' columns. The display title is the item title followed,
  when available, by '(first author, year)'.

  Raises KeyError when one of the columns is missing from the header.
  """
  library = {}
  # newline='' lets the csv module handle line breaks inside quoted fields.
  with open(library_path, encoding='utf-8-sig', newline='') as csvfile:
    for row in csv.DictReader(csvfile):
      key = row['Key']
      if not key: continue # short/ragged row without a usable key

      title = row['Title'] or ''
      # Authors are ';'-separated; only the first one is shown.
      author = (row['Author'] or '').split(';')[0].strip()
      year = row['Publication Year'] or ''
      author_year = ', '.join(part for part in (author, year) if part)
      if author_year: title = f'{title} ({author_year})'

      library[key] = title
  return library


def find_cited(documents, library):
  """Collect the library items cited in the given .docx files.

  Returns a dict keyed by item key; each value holds the item's display
  'title' and the list of 'documents' citing it. Each document is listed once
  per item, however often it cites that item. Citations of keys that are not
  in `library` are ignored.
  """
  cited = {}
  for docx in documents:
    with ZipFile(docx) as archive, archive.open('word/document.xml') as xml:
      root = ET.parse(xml)

    for elem in root.findall('.//w:instrText', WORD_NAMESPACES):
      # An empty <w:instrText/> element has text None.
      match = ZOTERO_CITATION.search((elem.text or '').strip())
      if not match: continue
      citation = json.loads(match.group(1))

      for item in citation['citationItems']:
        # NOTE(review): falls back to 'uris' when 'uri' is absent -- confirm
        # which key the Zotero versions in use actually write.
        for uri in item.get('uri') or item.get('uris') or ():
          key = uri.split('/')[-1] # the item key is the last URI segment

          if key not in library: continue
          entry = cited.setdefault(key, { 'title': library[key], 'documents': [] })
          if docx not in entry['documents']:
            entry['documents'].append(docx)
  return cited


def write_cited(cited, output_path='cited.csv'):
  """Write one CSV row per cited item: its title, then the citing documents."""
  # Explicit UTF-8 so non-ASCII titles do not depend on the locale encoding;
  # newline='' keeps the csv module in charge of line endings.
  with open(output_path, 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
    for item in cited.values():
      writer.writerow([item['title'], *item['documents']])


def main(argv=None):
  """Command-line entry point: cited.py LIBRARY.csv [DOCUMENT.docx ...].

  The first '.csv' argument is the library export; every '.docx' argument is
  scanned for citations. The result is written to 'cited.csv' in the current
  directory.
  """
  args = sys.argv[1:] if argv is None else list(argv)
  library_paths = [f for f in args if f.lower().endswith('.csv')]
  if not library_paths:
    sys.exit('usage: cited.py LIBRARY.csv [DOCUMENT.docx ...]')
  documents = [f for f in args if f.lower().endswith('.docx')]

  library = load_library(library_paths[0])
  write_cited(find_cited(documents, library))


if __name__ == '__main__':
  main()
	#!/usr/bin/env python

from zipfile import ZipFile
import os
import csv
import sys
import re
import json
from pprint import pprint
import xml.etree.ElementTree as ET

# Text of a Word field code holding a Zotero citation: a fixed marker followed
# by the citation as a JSON object that runs to the end of the field text.
ZOTERO_CITATION = re.compile(r'ADDIN ZOTERO_ITEM CSL_CITATION (\{.*\})$')
WORD_NAMESPACES = { 'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main' }


def load_library(library_path):
  """Read a CSV library export and map each item key to a display title.

  The CSV needs a header row naming the 'Key', 'Title', 'Author' and
  'Publication Year' columns. The display title is the item title followed,
  when available, by '(first author, year)'.

  Raises KeyError when one of the columns is missing from the header.
  """
  library = {}
  # newline='' lets the csv module handle line breaks inside quoted fields.
  with open(library_path, encoding='utf-8-sig', newline='') as csvfile:
    for row in csv.DictReader(csvfile):
      key = row['Key']
      if not key: continue # short/ragged row without a usable key

      title = row['Title'] or ''
      # Authors are ';'-separated; only the first one is shown.
      author = (row['Author'] or '').split(';')[0].strip()
      year = row['Publication Year'] or ''
      author_year = ', '.join(part for part in (author, year) if part)
      if author_year: title = f'{title} ({author_year})'

      library[key] = title
  return library


def find_cited(documents, library):
  """Collect the library items cited in the given .docx files.

  Returns a dict keyed by item key; each value holds the item's display
  'title' and the list of 'documents' citing it. Each document is listed once
  per item, however often it cites that item. Citations of keys that are not
  in `library` are ignored.
  """
  cited = {}
  for docx in documents:
    with ZipFile(docx) as archive, archive.open('word/document.xml') as xml:
      root = ET.parse(xml)

    for elem in root.findall('.//w:instrText', WORD_NAMESPACES):
      # An empty <w:instrText/> element has text None.
      match = ZOTERO_CITATION.search((elem.text or '').strip())
      if not match: continue
      citation = json.loads(match.group(1))

      for item in citation['citationItems']:
        # NOTE(review): falls back to 'uris' when 'uri' is absent -- confirm
        # which key the Zotero versions in use actually write.
        for uri in item.get('uri') or item.get('uris') or ():
          key = uri.split('/')[-1] # the item key is the last URI segment

          if key not in library: continue
          entry = cited.setdefault(key, { 'title': library[key], 'documents': [] })
          if docx not in entry['documents']:
            entry['documents'].append(docx)
  return cited


def write_cited(cited, output_path='cited.csv'):
  """Write one CSV row per cited item: its title, then the citing documents."""
  # Explicit UTF-8 so non-ASCII titles do not depend on the locale encoding;
  # newline='' keeps the csv module in charge of line endings.
  with open(output_path, 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
    for item in cited.values():
      writer.writerow([item['title'], *item['documents']])


def main(argv=None):
  """Command-line entry point: cited.py LIBRARY.csv [DOCUMENT.docx ...].

  The first '.csv' argument is the library export; every '.docx' argument is
  scanned for citations. The result is written to 'cited.csv' in the current
  directory.
  """
  args = sys.argv[1:] if argv is None else list(argv)
  library_paths = [f for f in args if f.lower().endswith('.csv')]
  if not library_paths:
    sys.exit('usage: cited.py LIBRARY.csv [DOCUMENT.docx ...]')
  documents = [f for f in args if f.lower().endswith('.docx')]

  library = load_library(library_paths[0])
  write_cited(find_cited(documents, library))


if __name__ == '__main__':
  main()