Skip to content

Instantly share code, notes, and snippets.

@mhermans
Last active August 29, 2015 14:19
Show Gist options
  • Save mhermans/05e0c537f13073044dc4 to your computer and use it in GitHub Desktop.
Save mhermans/05e0c537f13073044dc4 to your computer and use it in GitHub Desktop.
Python script to match Zotero group library metadata with Box-hosted ebook files
from boxsdk import Client, OAuth2 # https://github.com/box/box-python-sdk
from boxsdk.exception import BoxAPIException
from pyzotero import zotero # https://github.com/urschrei/pyzotero
import os.path
"""
This script scans a Box input-folder and a Zotero group-library, and matches the ebook PDFs
in the Box-folder with the metadata in the Zotero-library.
Matches are based on the SHA1 file checksum, which Box calculates and tracks automatically
for all files and which is added manually to the "Extra"-field of the Zotero-items.
Linking items between Box and Zotero happens by adding the public url for the Zotero item in
the Box file metadata, and adding the (company-restricted) public link to the url-field of the
Zotero item.
Additionally the ebook-file in the Box input-folder is renamed based on the Zotero-metadata,
and moved to a permanent Box ebook-folder.
End result is that ebooks and their metadata can be managed through Zotero, with the files
themselves hosted and accessible through Box and the Zotero web or offline interface.
Maarten Hermans | www.mhermans.net | maarten AT mhermans DOT net
"""
def zitem_ebook_hash(item):
    """Return the SHA1 ebook hash stored in a Zotero item's "extra" field.

    The hash is expected as an ``ebook_sha1:<hash>`` entry. The entry may
    appear anywhere in the field, so other notes in "extra" (which can
    contain colons of their own) do not disturb the lookup.

    Args:
        item: Zotero item dict with a 'data' mapping.

    Returns:
        The hash string with surrounding whitespace removed, or None when
        there is no "extra" text, no ``ebook_sha1`` entry, or no value
        after the colon.
    """
    extra = item['data'].get('extra')
    if not extra:
        return None

    # Only look at the text that follows the marker, so a colon earlier in
    # the field (e.g. "DOI: ...") cannot be mistaken for the separator.
    _, marker, rest = extra.partition('ebook_sha1')
    if not marker:
        return None

    between, colon, value = rest.partition(':')
    if not colon or between.strip():
        # marker without its own "<marker>:<hash>" separator
        return None

    # The hash is the first whitespace-delimited token after the colon;
    # anything on following lines belongs to other entries.
    tokens = value.split()
    return tokens[0] if tokens else None
def _creator_name(creator):
    """Return a display name for one Zotero creator entry."""
    # Two-field creators carry 'lastName'; fall back to the single-field
    # 'name' key, and to a placeholder so the filename can still be built.
    return creator.get('lastName') or creator.get('name') or 'Unknown'


def construct_filename(zotero_item, extention=None):
    """Construct a new filename based on zotero item metadata.

    The result has the form ``<authors> - <date> - <title>[.<ext>]`` where
    <authors> is "A", "A and B" or "A et al." depending on the number of
    creators. The title is cut to 55 characters first and then further so
    that the name without extension stays within 80 characters.

    Args:
        zotero_item: Zotero item dict; 'title', 'date' and 'creators' are
            read from its 'data' mapping. Missing or empty values are
            left out of the filename instead of raising.
        extention: optional file extension, with or without leading dot.

    Returns:
        The filename as a string.
    """
    data = zotero_item['data']
    title = (data.get('title') or '')[0:55].strip().strip('.')
    yr = data.get('date') or ''  # TODO more robust year selection?
    creators = data.get('creators') or []

    if len(creators) == 1:
        authors = _creator_name(creators[0])
    elif len(creators) == 2:
        authors = ' '.join([_creator_name(creators[0]), 'and',
                            _creator_name(creators[1])])
    elif creators:
        authors = ' '.join([_creator_name(creators[0]), 'et al.'])
    else:
        authors = ''

    # "<authors> - <date> -", skipping whichever part is absent
    prefix = ''.join(part + ' - ' for part in (authors, yr) if part).rstrip()

    # never let the slice bound go negative when the prefix is very long
    room = max(0, 80 - len(prefix))
    title = title[0:room].strip(' .:')

    if prefix and title:
        fn = ' '.join([prefix, title])
    elif title:
        fn = title
    else:
        fn = prefix[:-2]  # no title: drop the dangling " -"

    ext = extention.strip('. ') if extention else ''
    if ext:
        fn = '.'.join([fn, ext])
    return fn
def link_items(zotero_client, zot_item, box_item):
    """Link/set metadata between matching Zotero and Box items.

    Three steps are performed, in this order:
      1. the Zotero item's url-field is set to the Box file's
         company-restricted shared link and the item is updated;
      2. the Zotero web url is stored on the Box file as the
         'zotero_web_url' metadata field, unless that field is already set;
      3. the Box file is renamed to the filename derived from the Zotero
         metadata, keeping the original extension.

    Args:
        zotero_client: client object providing update_item().
        zot_item: Zotero item dict ('data' and 'links' are used).
        box_item: Box file object matching zot_item.

    Raises:
        BoxAPIException: propagated from the Box create/rename calls.
    """
    # set zotero url-field to company-restricted url
    # ----------------------------------------------
    zot_item['data']['url'] = box_item.get_shared_link('company')
    zotero_client.update_item(zot_item)

    # add box-file metadata-field for zotero-url
    # ------------------------------------------
    try:
        # use the box_item argument, not a module-level variable
        box_item_metadata = box_item.metadata().get()
    except BoxAPIException:
        # apparently error on get() if no metadata present -> create
        box_item_metadata = None

    # => only set if not present (TODO update?)
    # NOTE(review): when metadata exists but lacks the field, create() may
    # be rejected by Box; an update call is probably needed - confirm.
    if not box_item_metadata or not box_item_metadata.get('zotero_web_url'):
        box_item.metadata().create({
            'zotero_web_url': zot_item['links']['alternate']['href']})

    # rename box-file based on zotero metadata
    # ----------------------------------------
    extension = os.path.splitext(box_item['name'])[1]  # get original ext.
    fn_new = construct_filename(zot_item, extension)
    if box_item['name'] != fn_new:
        # TODO check if file already exists (currently throws box-error)
        box_item.rename(fn_new)
# Read in Zotero, Box authentication credentials
# ==============================================
# NOTE(review): secrets should not live in source control. Every value below
# can be supplied through the environment variable of the same (upper-case)
# name; the literals are kept only as fallback so existing runs keep working.

# Zotero API credentials
zotero_api_key = os.environ.get('ZOTERO_API_KEY', 'Eme6vXpAaJd0p0')
zotero_user_id = os.environ.get('ZOTERO_USER_ID', '216')
zotero_group_id = os.environ.get('ZOTERO_GROUP_ID', '39')

# Box OAuth credentials (not SSO)
# https://developers.box.com/
# https://kuleuven.app.box.com/developers/services/edit/123613l
box_access_token = os.environ.get(
    'BOX_ACCESS_TOKEN', 'lfCPYI1z2t06llPWFHI6')  # expires in 1h
box_client_id = os.environ.get('BOX_CLIENT_ID', '66gutzmxttjneszyvzx')
box_client_secret = os.environ.get('BOX_CLIENT_SECRET', 'SalwRViciDWVH3RGPZp')

# Box folder ids: input folder that is scanned, and permanent ebooks folder
box_input_folder_id = os.environ.get('BOX_INPUT_FOLDER_ID', '33282')
box_ebooks_folder_id = os.environ.get('BOX_EBOOKS_FOLDER_ID', '33230')
# Authenticate with Zotero and Box, initialize clients
# =====================================================

# Zotero client bound to the group library
zot = zotero.Zotero(
    library_id=zotero_group_id,
    library_type='group',
    api_key=zotero_api_key,
)

# Box client built on the OAuth2 credentials defined above
oauth = OAuth2(
    access_token=box_access_token,
    client_id=box_client_id,
    client_secret=box_client_secret,
)
box = Client(oauth)
# Fetch items and their SHA1-file hash
# ====================================

# Zotero: index the top-level items by ebook hash, skipping items
# that carry no hash
zot_items = zot.top()
zot_items_dict = {}
for item in zot_items:
    item_hash = zitem_ebook_hash(item)
    if item_hash:
        zot_items_dict[item_hash] = item

# Box: handles for the input folder and the permanent ebooks folder
box_input_folder = box.folder(folder_id=box_input_folder_id)
box_ebooks_folder = box.folder(folder_id=box_ebooks_folder_id)

# Box: list the input folder, then fetch the additional file-data of each
# entry; the entries are indexed by their 'sha1' value
# NOTE(review): only one page (limit=100, offset=0) is requested.
box_items = [
    entry.get()
    for entry in box_input_folder.get_items(limit=100, offset=0)
]
box_items_dict = dict((entry['sha1'], entry) for entry in box_items)
# Match and link Zotero and Box items
# ===================================

# get matches on sha1-hash between box and zotero
matched_hashes = set(box_items_dict.keys()).intersection(zot_items_dict.keys())
print(len(matched_hashes))

# iterate over all matches, and link up items between zotero and box
for item_hash in matched_hashes:
    matched_zot_item = zot_items_dict[item_hash]
    matched_box_item = box_items_dict[item_hash]

    # single-argument print() calls behave the same on Python 2 and 3
    print(matched_zot_item['data']['title'])
    print(matched_box_item['name'])

    # link both items: set box-url, zotero-url, etc.
    link_items(zot, matched_zot_item, matched_box_item)

    # move renamed file from input folder to ebooks folder
    matched_box_item.move(box_ebooks_folder)
# list unmatched files (sha1 hash + filename) in input folder
# ===========================================================
# Each line is printed as "ebook_sha1:<hash> <filename>".
for file_hash, item in box_items_dict.items():
    if file_hash in matched_hashes:
        # matched files were handled in the loop above; not "unmatched"
        continue
    # one pre-joined string so print() behaves the same on Python 2 and 3
    print(' '.join([':'.join(['ebook_sha1', file_hash]), item['name']]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment