Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Code to delete Zotero duplicate attachments from your library. (Finds duplicates that share an md5 signature!)
# Depends on the pyzotero library
# Also depends on the humanize library, but this is only to pretty print the space savings.
import os
import time

from pyzotero import zotero

# Every group of duplicate attachment keys is also written to this file so
# there is a record of what the run touched.
logfile = open('log.txt', 'w')

# Replace with YOUR_USERID and YOUR_KEY, or export ZOTERO_USER_ID and
# ZOTERO_API_KEY so the credentials do not have to live in the source file.
# NOTE(review): the fallback values below are credentials committed to the
# file; if the key is real it should be revoked and the fallback removed.
zot = zotero.Zotero(
    os.environ.get('ZOTERO_USER_ID', '5798092'),
    'user',
    os.environ.get('ZOTERO_API_KEY', '91Ua0ENhWIb8YOKCqiCwsKrP'),
    preserve_json_order=True,
)

t0 = time.time()
# The argument to everything() was missing in this copy of the script, which
# left the call unterminated. top() is used because the code that follows asks
# each retrieved item for its children.
# NOTE(review): confirm zot.top() is the query the author intended.
items = zot.everything(zot.top())
# items = zot.top(limit=20)  # for testing
t1 = time.time()
# The values must go through format(); passing them as extra print() arguments
# leaves the {} placeholders unfilled.
print('Time to retrieve full library: {} ({} items)'.format(t1 - t0, len(items)))
# About 140 s for ~2K entries
# md5 signature -> list of (attachment key, title, enclosure length, parent key)
# for every imported PDF attachment found under the retrieved items.
md5_dict = {}
for i in items:
    # Only parents reporting more than one child are queried.
    # NOTE(review): a PDF that is the sole child of its parent is therefore
    # never compared against the rest of the library - confirm that is intended.
    if i['meta']['numChildren'] > 1:
        children = zot.children(i['key'])
        for ch in children:
            data = ch['data']
            # .get() because these fields are only meaningful on attachments.
            if (data['itemType'] == 'attachment') and \
                    (data.get('linkMode') == 'imported_file') and \
                    (data.get('contentType') == 'application/pdf'):
                md5 = data.get('md5')
                if not md5:
                    # Attachments without a signature cannot be compared; if
                    # they were grouped under a shared empty key they would be
                    # treated as duplicates of one another.
                    continue
                # Fall back to 0 when the enclosure link is absent so one odd
                # record does not abort the whole scan.
                size = ch.get('links', {}).get('enclosure', {}).get('length', 0)
                md5_dict.setdefault(md5, []).append(
                    (data['key'], data['title'], size, i['key']))
t2 = time.time()
num_copies = [len(val) for val in md5_dict.values()]
# max(..., 1) avoids a ZeroDivisionError when no PDF attachment was recorded.
print('Time for children accesses: {} ({} per child)'.format(
    t2 - t1, (t2 - t1) / max(sum(num_copies), 1)))
# About 1200 s for ~4000 attachments.
# Note that we have to retrieve non-PDF attachments. Maybe there's a way to avoid that?
import humanize  # only used to render the total in a human-readable form

# The first entry of each md5 group is left out, so only the second and later
# copies contribute their size (third tuple field) to the total.
size_to_be_erased = []
for copies in md5_dict.values():
    size_to_be_erased.extend(entry[2] for entry in copies[1:])
total_size = sum(size_to_be_erased)
print('Total size to be erased: {}'.format(humanize.naturalsize(total_size, gnu=True)))
# About 6 GB for me! Woohoo!
# Safety switch. The script as found printed "Deleting" for every duplicate but
# never issued a delete request, so the default keeps that non-destructive
# behaviour. Set to False to actually remove the duplicates from the library.
DRY_RUN = True

for md5, attachment in md5_dict.items():
    if len(attachment) > 1:
        keys = [a[0] for a in attachment]
        # Report the whole group on screen and in the log file.
        print('Keys: [{}]'.format(', '.join(keys)))
        print('Keys: [{}]'.format(', '.join(keys)), file=logfile)
        # The first key of the group is kept; every later one is a duplicate.
        for tbd in keys[1:]:
            # Fetch the current item record; it is what delete_item() is given.
            item = zot.item(tbd)
            print('Deleting: {}'.format(item['key']))
            if not DRY_RUN:
                zot.delete_item(item)
# Nothing is written to the log after this point.
logfile.close()
t3 = time.time()
num_erased = [len(attachment_list) - 1 for attachment_list in md5_dict.values()]
# max(..., 1) avoids a ZeroDivisionError when there was nothing to erase.
print('Time for deletions: {} ({} per child)'.format(
    t3 - t2, (t3 - t2) / max(sum(num_erased), 1)))
# 2353 s for 2401 deletions or just about 1 per
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment