Skip to content

Instantly share code, notes, and snippets.

@mhoban
Created June 27, 2022 00:00
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mhoban/3564f789a934028f9898b0a316588dd1 to your computer and use it in GitHub Desktop.
Save mhoban/3564f789a934028f9898b0a316588dd1 to your computer and use it in GitHub Desktop.
Python script to fix inconsistent author names in a Zotero library (adapted from https://gist.github.com/douglasrizzo/b1d324d0698120ebf8b1c0c91d8c251c)
#!/usr/bin/env python3
"""
Usage: fix_authors.py (((-g | -u) (-k <api-key> -i <lib-id>)) | (-c <config-file>)) [-f -q -s <config-file> -n <to-ignore> ...]
Options:
-g --group Library is a group library
-u --user Library is a user library
-k --key <api-key> Zotero API key
-i --id <lib-id> Zotero library ID
-c --config <config-file> Specify config file (json) with library/API details
-s --save-config <config-file> (optional) Save specified options as a config file
-n --ignore <to-ignore> (optional) author names to ignore
-f --fix-names (optional) try to fix messed up author names
-q --quit (optional) quit after fixing author names
Details:
* When passing library details on the command line, only -g OR -u may be specified (not both)
* Similarly, either specify library details OR pass a config file
* Some journals (lookin' at you, Science!) export authors as a single 'name' field rather
than a last and first/initial combo, which messes things up. The fix-names option will
attempt to fix this.
"""
import sys, os
import unicodedata
import Levenshtein
import json
from pyzotero import zotero
from docopt import docopt
from nameparser import HumanName
def eprint(*values, **options):
    """Print to standard error; takes the same arguments as ``print``."""
    # Look the stream up on every call so a redirected stderr is honoured.
    stream = sys.stderr
    print(*values, file=stream, **options)
# tries to fix names, modifies pubs list
def fix_names(pubs, exceptions=None):
    """Interactively split single-field creator names into last/first names.

    Every creator that only has a ``name`` field (and is not listed in
    ``exceptions``) is parsed on the assumption that it reads
    "Last First [Middle]". The proposed split is shown and the user confirms
    it: "y" or Enter accepts, "a" accepts this and every remaining name of the
    current publication, anything else skips. Accepted creators are rewritten
    in place with ``firstName``/``lastName`` instead of ``name``.

    Args:
        pubs: list of item dicts; creators are read from
            ``item['data']['creators']`` and modified in place.
        exceptions: names to leave untouched. ``None`` means no exceptions.

    Returns:
        Set of indices into ``pubs`` for the items that were modified.
    """
    # No shared mutable default; callers may also legitimately pass None.
    ignored = exceptions if exceptions is not None else ()
    modified = set()
    for pub_index, pub in enumerate(pubs):
        data = pub['data']
        pub_printed = False
        all_authors = False
        # Items such as notes have no 'creators' key at all.
        for author in data.get('creators', []):
            if 'name' not in author or author['name'] in ignored:
                continue
            old_name = author['name']
            # A single token cannot be split into last/first; slicing it
            # around a missing space would only garble it.
            if ' ' not in old_name.strip():
                continue
            if not pub_printed:
                print(data.get('title', ''))
                pub_printed = True
            try:
                last_token, _, rest = old_name.strip().partition(' ')
                name = HumanName(f"{last_token}, {rest}")
                # Join only the parts that exist so no trailing space is stored.
                first = ' '.join(part for part in (name.first, name.middle) if part)
                proposal = f"\t\"{old_name}\" => \"{name.last}\", \"{first}\""
                if all_authors:
                    answer = 'y'
                    print(proposal)
                else:
                    answer = input(f"{proposal}? [Yna]").strip().lower()
                if answer in ('y', '', 'a'):
                    if answer == 'a':
                        all_authors = True
                    del author['name']
                    author['firstName'] = first
                    author['lastName'] = name.last
                    modified.add(pub_index)
            except KeyboardInterrupt:
                print("\nquitting")
                sys.exit(1)
            except Exception as e:
                # Best effort: report the problem and move on to the next name.
                print(f"something went wrong: {e}")
    return modified
def only_uppers(s):
    """Return the uppercase characters of ``s``, in order, as one string."""
    return ''.join(filter(str.isupper, s))
def remove_accents(s):
    """Return ``s`` with accents (combining marks) removed.

    The string is canonically decomposed so each accent becomes a separate
    combining character, those characters are dropped, and the result is
    recomposed so text without accents comes back in its usual composed form.
    """
    decomposed = unicodedata.normalize('NFD', s)
    stripped = ''.join(c for c in decomposed if not unicodedata.combining(c))
    return unicodedata.normalize('NFC', stripped)
def fullname(person):
    """Return "Last, First" for a creator, else its single ``name``, else "".

    Only a missing key or a non-subscriptable ``person`` triggers the
    fallbacks; other exceptions (e.g. KeyboardInterrupt) propagate.
    """
    try:
        return f"{person['lastName']}, {person['firstName']}"
    except (KeyError, TypeError):
        pass
    try:
        return person['name']
    except (KeyError, TypeError):
        return ""
def firstname(person):
    """Return a creator's ``firstName``, else its single ``name``, else "".

    Only a missing key or a non-subscriptable ``person`` triggers the
    fallbacks; other exceptions (e.g. KeyboardInterrupt) propagate.
    """
    try:
        return person['firstName']
    except (KeyError, TypeError):
        pass
    try:
        return person['name']
    except (KeyError, TypeError):
        return ""
def lastname(person):
    """Return a creator's ``lastName``, or "" when it has none.

    Only a missing key or a non-subscriptable ``person`` yields ""; other
    exceptions (e.g. KeyboardInterrupt) propagate.
    """
    try:
        return person['lastName']
    except (KeyError, TypeError):
        return ""
def same_person_different_name(a, b):
    """Tell whether two creators look like one person spelled two ways.

    Returns True only when all three hold: the full names differ, the
    uppercase letters of the full names are identical, and the last names are
    equal once passed through ``remove_accents``.
    """
    full_a, full_b = fullname(a), fullname(b)
    # Identical spellings are not a "different name" at all.
    if full_a == full_b:
        return False
    # The initials (uppercase letters) have to line up.
    if only_uppers(full_a) != only_uppers(full_b):
        return False
    return remove_accents(lastname(a)) == remove_accents(lastname(b))
def _accent_count(s):
    """Count the combining marks in ``s`` after canonical decomposition."""
    return sum(1 for c in unicodedata.normalize('NFD', s) if unicodedata.combining(c))


def best_name(a, b):
    """Pick the best of two creator dicts to represent one person.

    The creator with the longer full name (presumably the unabbreviated one)
    is used as the base, and its last name is replaced by whichever of the two
    last names carries more accents; on a tie ``b``'s last name is used.

    Returns a new dict. Neither ``a`` nor ``b`` is modified, so creator dicts
    shared with other publications are not changed behind the caller's back.
    """
    # TODO: figure out what kind of person these are
    longest = a if len(fullname(a)) >= len(fullname(b)) else b
    best_person = dict(longest)
    a_last_name = lastname(a)
    b_last_name = lastname(b)
    # Count the marks directly: more marks means more accents, which is the
    # spelling this function is documented to prefer.
    if _accent_count(a_last_name) > _accent_count(b_last_name):
        preferred_last = a_last_name
    else:
        preferred_last = b_last_name
    # Never attach an empty lastName (e.g. to a single-field 'name' creator).
    if preferred_last:
        best_person['lastName'] = preferred_last
    return best_person
def _as_list(value):
    """Normalise an option value (None, one string, or a sequence) to a list."""
    if value is None:
        return []
    if isinstance(value, str):
        return [value]
    return list(value)


def _load_config(opt):
    """Build the library configuration from the config file or the command line."""
    if opt['config'] is None:
        return {
            "lib_id": opt['id'],
            "lib_type": 'group' if opt['group'] else 'user',
            "api_key": opt['key'],
            "exceptions": _as_list(opt['ignore']),
        }
    if not os.path.exists(opt['config']):
        eprint("config file does not exist")
        sys.exit(1)
    try:
        with open(opt['config'], "rt") as f:
            config = json.load(f)
    except Exception as e:
        eprint(f"json error: {e}")
        sys.exit(1)
    if not isinstance(config, dict):
        eprint("json error: config file must contain a JSON object")
        sys.exit(1)
    # Names given with -n extend (rather than get dropped in favour of) the
    # exceptions stored in the file; a file without the key means "none".
    ignored = _as_list(config.get('exceptions'))
    ignored.extend(name for name in _as_list(opt['ignore']) if name not in ignored)
    config['exceptions'] = ignored
    return config


def _with_name_of(creator, chosen):
    """Return a copy of ``creator`` whose name fields come from ``chosen``.

    Everything else (for instance the creator's role) stays as it was.
    """
    merged = {key: value for key, value in creator.items()
              if key not in ('name', 'firstName', 'lastName')}
    if 'name' in chosen:
        merged['name'] = chosen['name']
    else:
        for key in ('firstName', 'lastName'):
            if key in chosen:
                merged[key] = chosen[key]
    return merged


def main():
    """Command-line entry point.

    Reads the options, connects to the Zotero library, optionally runs the
    interactive single-field name fixer, then makes the spelling of each
    person's name consistent across publications and, after confirmation,
    writes the edited items back.
    """
    opt = {arg.lstrip('-'): value for arg, value in docopt(__doc__).items()}
    config = _load_config(opt)

    if opt['save-config'] is not None:
        with open(opt['save-config'], 'w') as f:
            json.dump(config, f)

    try:
        zot = zotero.Zotero(config['lib_id'], config['lib_type'], config['api_key'])
    except Exception as e:
        eprint(f"Bad Zotero configuration: {e}")
        # Without a client nothing below can work.
        sys.exit(1)

    edited_pubs = set()
    try:
        print("fetching pubs from zotero library...")
        # top() alone returns a single page; everything() follows the paging.
        pubs = zot.everything(zot.top())
    except Exception as e:
        eprint(f"Zotero API error: {e}")
        sys.exit(1)

    if opt['fix-names']:
        print("attempting to fix author names...")
        edited_pubs = fix_names(pubs, exceptions=config['exceptions'])
        if opt['quit']:
            if edited_pubs:
                to_edit = [pubs[i] for i in sorted(edited_pubs)]
                print("updating edited authors...")
                success = zot.update_items(to_edit)
                if success:
                    print("success!")
                else:
                    print("something went wrong")
            else:
                print("all names seem fine!")
            sys.exit(0)
        print(f"fixed names in {len(edited_pubs)} pubs")

    # Collect every distinct spelling once, in order of first appearance.
    authors = []
    seen = set()
    for pub in pubs:
        for author in pub['data'].get('creators', []):
            key = (author.get('name'), author.get('firstName'), author.get('lastName'))
            if key not in seen:
                seen.add(key)
                authors.append(author)

    for pub_index, pub in enumerate(pubs):
        # Items such as notes have no 'creators' key at all.
        creators = pub['data'].get('creators', [])
        for author_index, current_author in enumerate(creators):
            chosen_name = current_author
            for new_author in authors:
                try:
                    if same_person_different_name(chosen_name, new_author):
                        # Hand over copies so dicts that belong to other
                        # publications are never modified from here.
                        candidate = best_name(dict(new_author), dict(chosen_name))
                        if fullname(candidate) != fullname(chosen_name):
                            print(f"{fullname(chosen_name)} / {fullname(new_author)} => {fullname(candidate)}")
                        chosen_name = candidate
                except Exception as e:
                    eprint(f"something went wrong with the names: {e}")
            updated = _with_name_of(current_author, chosen_name)
            # Only items whose creators really changed are sent back.
            if updated != current_author:
                creators[author_index] = updated
                edited_pubs.add(pub_index)

    to_edit = [pubs[i] for i in sorted(edited_pubs)]
    if to_edit:
        yn = input(f"commit edits to {len(to_edit)} pubs (y/n)? ")
        if yn.strip().lower() == 'y':
            success = zot.update_items(to_edit)
            if success:
                print("success!")
            else:
                print("something went wrong with the update")
    else:
        print("all names seem fine and authors appear to be consistent")
# Run the command-line tool only when this file is executed as a script.
if __name__ == "__main__":
    main()
@xishansnow
Copy link

Is there any documentation about the config file format?

@mhoban
Copy link
Author

mhoban commented Sep 28, 2022

Here's an example, it's a JSON file:

{
  "lib_id": "<library ID>",
  "lib_type": "<library type>",
  "api_key": "<api key>",
  "exceptions": [
    "<exception1>",
    "<exception2>"    
  ]
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment