Skip to content

Instantly share code, notes, and snippets.

@ckemere
Last active April 15, 2024 21:14
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ckemere/44178399f9b104d9fb35e2b8d7c7cd20 to your computer and use it in GitHub Desktop.
Save ckemere/44178399f9b104d9fb35e2b8d7c7cd20 to your computer and use it in GitHub Desktop.
Code to delete Zotero duplicate attachments from your library. (Finds duplicates that share an md5 signature!)
#!/usr/bin/python3
# Depends on the pyzotero library
# Also depends on the humanize library, but this is only to pretty print the space savings.
#%%
import os
import time

from pyzotero import zotero

# Credentials are read from the environment so that no real API key lives in
# the script. Set ZOTERO_USER_ID and ZOTERO_API_KEY before running.
# NOTE(review): an API key was previously hard-coded on this line; if it was a
# real key it has been published and should be revoked.
try:
    zotero_user_id = os.environ['ZOTERO_USER_ID']
    zotero_api_key = os.environ['ZOTERO_API_KEY']
except KeyError as missing:
    raise SystemExit(
        'Missing environment variable {}: set ZOTERO_USER_ID and '
        'ZOTERO_API_KEY before running.'.format(missing.args[0]))

# Log of the duplicate key groups. It stays open at module level because the
# deletion cell further down writes to it.
logfile = open('log.txt','w')

zot = zotero.Zotero(zotero_user_id, 'user', zotero_api_key, preserve_json_order=True)

# Fetch every top-level item of the library and time how long that takes.
t0 = time.time()
items = zot.everything(zot.top())
# items = zot.top(limit=100) # for testing
t1 = time.time()
print('Time to retrieve full library: {} ({} items)'.format(t1-t0, len(items)))
# About 140 s for ~2K entries
#%%
# Group every imported PDF attachment by the md5 of its file.
# md5_dict maps md5 -> list of
#   (attachment key, attachment title, enclosure length, parent item key).
md5_dict = {}
for parent in items:
    # Duplicates need at least two children under the same parent to be
    # worth a children request; a missing count is treated as zero.
    if (parent['meta'].get('numChildren') or 0) <= 1:
        continue
    children = zot.children(parent['key'])
    print(parent['data'].get('title', parent['key']))
    for child in children:
        data = child['data']
        if data.get('itemType') != 'attachment':
            continue
        if data.get('linkMode') != 'imported_file':
            continue
        if data.get('contentType') != 'application/pdf':
            continue
        md5 = data.get('md5')
        if not md5:
            # No checksum available: such attachments must not be grouped
            # together under a shared None key and mistaken for duplicates.
            continue
        # The enclosure link may be absent; fall back to a length of 0 so the
        # size report still works.
        length = child.get('links', {}).get('enclosure', {}).get('length', 0)
        md5_dict.setdefault(md5, []).append(
            (data['key'], data.get('title', ''), length, parent['key']))
t2 = time.time()
num_copies = [len(copies) for copies in md5_dict.values()]
total_copies = sum(num_copies)
# Avoid a ZeroDivisionError when no PDF attachment was collected.
per_child = (t2-t1)/total_copies if total_copies else 0.0
print('Time for children accesses: {} ({} per child)'.format(t2-t1, per_child))
# About 1200 s for ~4000 attachments.
# Note that we have to retrieve non-PDF attachments. Maybe there's a way to avoid that?
#%%
import humanize  # only used to render the byte total in a readable form

# For each md5 group, every entry after the first is a redundant copy.
# Index 2 of an entry is the length recorded for that attachment.
redundant_copies = (copies[1:] for copies in md5_dict.values())
size_to_be_erased = [entry[2] for extras in redundant_copies for entry in extras]
total_to_be_erased = sum(size_to_be_erased)
pretty_total = humanize.naturalsize(total_to_be_erased, gnu=True)
print('Total size to be erased: {}'.format(pretty_total))
# About 6 GB for me! Woohoo!
#%%
# Delete every attachment after the first in each group sharing an md5.
#
# DRY_RUN defaults to True, which only reports what would be removed (the
# behaviour this cell had before, since it never issued a delete request).
# Set it to False to actually delete the redundant attachments.
DRY_RUN = True

# Only groups with a real checksum and more than one copy are duplicates; a
# falsy md5 key would otherwise lump unrelated attachments together.
duplicate_groups = [copies for md5, copies in md5_dict.items()
                    if md5 and len(copies) > 1]
try:
    for attachment in duplicate_groups:
        keys = [a[0] for a in attachment]
        print('Keys: [{}]'.format(', '.join(keys)))
        print('Keys: [{}]'.format(', '.join(keys)), file=logfile)
        # keys[0] is the copy that is kept.
        for tbd in keys[1:]:
            # Fetch the current item so the delete request carries up-to-date
            # item data.
            item = zot.item(tbd)
            if DRY_RUN:
                print('Would delete: {}'.format(item['key']))
                continue
            print('Deleting: {}'.format(item['key']))
            zot.delete_item(item)
finally:
    # The log file was opened at the top of the script and is not used after
    # this cell.
    logfile.close()
t3 = time.time()
num_erased = [len(attachment_list) - 1 for attachment_list in duplicate_groups]
total_erased = sum(num_erased)
# Avoid a ZeroDivisionError when the library has no duplicates.
per_deletion = (t3-t2)/total_erased if total_erased else 0.0
print('Time for deletions: {} ({} per child)'.format(t3-t2, per_deletion))
# 2353 s for 2401 deletions or just about 1 per
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment