Instantly share code, notes, and snippets.
Created
June 27, 2022 00:00
-
Save mhoban/3564f789a934028f9898b0a316588dd1 to your computer and use it in GitHub Desktop.
Python script to fix inconsistent author names in a Zotero library (adapted from https://gist.github.com/douglasrizzo/b1d324d0698120ebf8b1c0c91d8c251c)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Usage: fix_authors.py (((-g | -u) (-k <api-key> -i <lib-id>)) | (-c <config-file>)) [-f -q -s <config-file> -n <to-ignore> ...] | |
Options: | |
-g --group Library is a group library | |
-u --user Library is a user library | |
-k --key <api-key> Zotero API key | |
-i --id <lib-id> Zotero library ID | |
-c --config <config-file> Specify config file (json) with library/API details | |
-s --save-config <config-file> (optional) Save specified options as a config file | |
-n --ignore <to-ignore> (optional) author names to ignore | |
-f --fix-names (optional) try to fix messed up author names | |
-q --quit (optional) quit after fixing author names | |
Details: | |
* When passing library details on the command line, only -g OR -u may be specified (not both) | |
* Similarly, either specify library details OR pass a config file | |
* Some journals (lookin' at you, Science!) export authors as a single 'name' field rather | |
than a last and first/initial combo, which messes things up. The fix-names option will | |
attempt to fix this. | |
""" | |
import sys, os | |
import unicodedata | |
import Levenshtein | |
import json | |
from pyzotero import zotero | |
from docopt import docopt | |
from nameparser import HumanName | |
def eprint(*args, **kwargs):
    """Print to standard error.

    Accepts the same positional and keyword arguments as the built-in
    ``print``. The ``file`` argument is supplied here, so callers must not
    pass it themselves.
    """
    print(*args, **kwargs, file=sys.stderr)
def fix_names(pubs, exceptions=None):
    """Interactively split single-field creator names into last/first names.

    Walks every creator of every item in ``pubs``. A creator stored as a
    single ``'name'`` field is re-parsed as "<first word>, <rest>" with
    ``HumanName`` and, after confirmation on stdin, rewritten in place as
    separate ``'lastName'`` / ``'firstName'`` fields.

    Answers at the prompt: ``y`` or empty accepts, ``a`` accepts this and all
    remaining names of the current publication, anything else skips the name.

    Args:
        pubs: list of item dicts, each with a ``'data'`` dict. The creator
            dicts inside are modified in place.
        exceptions: optional iterable of names to leave untouched. ``None``
            is treated as "no exceptions".

    Returns:
        Set of indexes into ``pubs`` whose creators were modified.
    """
    ignored = tuple(exceptions or ())
    modified = set()
    for pub_index, pub in enumerate(pubs):
        data = pub['data']
        title_shown = False
        accept_all = False
        # Some item types (e.g. notes) carry no 'creators' field at all.
        for creator in data.get('creators', []):
            old_name = creator.get('name')
            if old_name is None or old_name in ignored:
                continue
            last, sep, rest = old_name.strip().partition(' ')
            if not sep:
                # A single-word name has nothing to split; inserting a comma
                # into it would only mangle it.
                continue
            if not title_shown:
                print(data.get('title', ''))
                title_shown = True
            try:
                name = HumanName(f"{last}, {rest}")
                # Join only the parts that exist, so no trailing blank is
                # stored when there is no middle name.
                given = ' '.join(part for part in (name.first, name.middle) if part)
                proposal = f"\t\"{old_name}\" => \"{name.last}\", \"{given}\""
                if accept_all:
                    print(proposal)
                    answer = 'y'
                else:
                    answer = input(proposal + "? [Yna]").strip().lower()
                if answer not in ('y', '', 'a'):
                    continue
                if answer == 'a':
                    accept_all = True
                del creator['name']
                creator['firstName'] = given
                creator['lastName'] = name.last
                modified.add(pub_index)
            except KeyboardInterrupt:
                print("\nquitting")
                sys.exit(1)
            except Exception as e:
                # Best effort: report the problem and move on to the next name.
                print(f"something went wrong: {e}")
    return modified
def only_uppers(s):
    """Return the uppercase characters of ``s``, in order, as one string."""
    return ''.join(filter(str.isupper, s))
def remove_accents(s):
    """Replace accented characters with their unaccented counterparts.

    The string is decomposed (NFD) so that each accent becomes a separate
    combining character, the combining characters are dropped, and the result
    is recomposed (NFC). Letters that have no canonical decomposition (for
    example ``ø``) are returned unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    stripped = ''.join(c for c in decomposed if not unicodedata.combining(c))
    return unicodedata.normalize('NFC', stripped)
def fullname(person):
    """Return a display name for a creator dict.

    Gives ``"<lastName>, <firstName>"`` when both fields are present, falls
    back to the single ``'name'`` field otherwise, and returns ``""`` when
    neither form is available (including when ``person`` is not a mapping).
    """
    # Only lookup failures are treated as "field missing"; anything else
    # (e.g. KeyboardInterrupt) is allowed to propagate.
    try:
        return f"{person['lastName']}, {person['firstName']}"
    except (KeyError, TypeError):
        pass
    try:
        return person['name']
    except (KeyError, TypeError):
        return ""
def firstname(person):
    """Return the creator's ``'firstName'``, else its ``'name'``, else ``""``.

    ``""`` is also returned when ``person`` is not a mapping.
    """
    # Only lookup failures are treated as "field missing"; anything else
    # (e.g. KeyboardInterrupt) is allowed to propagate.
    try:
        return person['firstName']
    except (KeyError, TypeError):
        pass
    try:
        return person['name']
    except (KeyError, TypeError):
        return ""
def lastname(person):
    """Return the creator's ``'lastName'``, or ``""`` when it has none.

    ``""`` is also returned when ``person`` is not a mapping.
    """
    # Only lookup failures are treated as "field missing"; anything else
    # (e.g. KeyboardInterrupt) is allowed to propagate.
    try:
        return person['lastName']
    except (KeyError, TypeError):
        return ""
def same_person_different_name(a, b):
    """Tell whether two creators look like the same person spelled differently.

    Returns True only when all three conditions hold:
      * the two full names are not identical,
      * the uppercase letters of the two full names are identical, and
      * the last names are equal once passed through ``remove_accents``.
    """
    name_a, name_b = fullname(a), fullname(b)
    if name_a == name_b:
        # Identical spelling: nothing to reconcile.
        return False
    if only_uppers(name_a) != only_uppers(name_b):
        return False
    return remove_accents(lastname(a)) == remove_accents(lastname(b))
def best_name(a, b):
    """Pick the best of two creator dicts to represent one person.

    The creator with the longer full name (i.e. the less abbreviated one) is
    used as the base; on a tie ``a`` wins. Its last name is then replaced by
    whichever of the two last names carries more accents; on a tie ``b``'s
    last name is used.

    Returns:
        A new dict. Neither ``a`` nor ``b`` is modified, because the same
        creator dict may be shared with other publications.
    """
    # TODO: figure out what kind of person these are
    longest = a if len(fullname(a)) >= len(fullname(b)) else b
    best_person = dict(longest)
    if 'lastName' not in best_person:
        # Single-field ('name') creator: adding a 'lastName' next to 'name'
        # would mix the two creator forms in one dict.
        return best_person
    a_last_name = lastname(a)
    b_last_name = lastname(b)
    # Similarity between a last name and its accent-free form: 1.0 means the
    # name has no accents, and the value drops as the number of accents grows.
    a_plain_ratio = Levenshtein.ratio(remove_accents(a_last_name), a_last_name)
    b_plain_ratio = Levenshtein.ratio(remove_accents(b_last_name), b_last_name)
    # Lower similarity == more accents, which is the spelling to keep.
    chosen_last_name = a_last_name if a_plain_ratio < b_plain_ratio else b_last_name
    if chosen_last_name:
        best_person['lastName'] = chosen_last_name
    return best_person
def main():
    """Command-line entry point.

    Steps:
      1. Parse the options described in the module docstring with docopt.
      2. Build the library configuration from a JSON config file (``-c``) or
         from the individual command-line options, and optionally save it
         (``-s``).
      3. Fetch the top-level items of the Zotero library.
      4. With ``-f``, interactively fix single-field author names; with
         ``-q`` as well, upload those fixes and exit.
      5. Look for authors that appear under different spellings, pick the
         best spelling for each, and upload the edited items after
         confirmation.

    Exits with status 1 on configuration or API errors.
    """
    opt = {arg.lstrip('-'): value for arg, value in docopt(__doc__).items()}
    config = {"lib_id": None, "lib_type": None, "api_key": None, "exceptions": None}

    if opt['config'] is not None:
        if not os.path.exists(opt['config']):
            eprint("config file does not exist")
            sys.exit(1)
        try:
            with open(opt['config'], "rt") as f:
                loaded = json.load(f)
        except Exception as e:
            eprint(f"json error: {e}")
            sys.exit(1)
        if not isinstance(loaded, dict):
            eprint("json error: config file must contain a JSON object")
            sys.exit(1)
        # Merge into the defaults so that a key missing from the file reads
        # as None instead of raising KeyError later on.
        config.update(loaded)
        # Names given with -n are honoured in addition to those in the file.
        config['exceptions'] = list(config['exceptions'] or []) + list(opt['ignore'] or [])
    else:
        config['lib_id'] = opt['id']
        config['lib_type'] = 'group' if opt['group'] else 'user'
        config['api_key'] = opt['key']
        config['exceptions'] = opt['ignore']

    if opt['save-config'] is not None:
        with open(opt['save-config'], 'w') as f:
            json.dump(config, f)

    try:
        zot = zotero.Zotero(config['lib_id'], config['lib_type'], config['api_key'])
    except Exception as e:
        eprint(f"Bad Zotero configuration: {e}")
        # Without a client nothing below can work.
        sys.exit(1)

    try:
        print("fetching pubs from zotero library...")
        # top() on its own returns a single page of results; everything()
        # keeps requesting pages until the whole library has been read.
        pubs = zot.everything(zot.top())
    except Exception as e:
        eprint(f"Zotero API error: {e}")
        sys.exit(1)

    edited_pubs = set()
    if opt['fix-names']:
        print("attempting to fix author names...")
        edited_pubs = fix_names(pubs, exceptions=config['exceptions'] or [])
        if opt['quit']:
            if edited_pubs:
                to_edit = [pubs[i] for i in sorted(edited_pubs)]
                print("updating edited authors...")
                if zot.update_items(to_edit):
                    print("success!")
                else:
                    print("something went wrong")
            else:
                print("all names seem fine!")
            sys.exit(0)
        # edited_pubs holds publication indexes, not individual names.
        print(f"fixed names in {len(edited_pubs)} pubs")

    # Every distinct creator found in the library. Some item types have no
    # 'creators' field at all, hence .get().
    authors = []
    for pub in pubs:
        for author in pub['data'].get('creators', []):
            if author not in authors:
                authors.append(author)

    for pub_index, pub in enumerate(pubs):
        creators = pub['data'].get('creators', [])
        for author_index, current_author in enumerate(creators):
            # Snapshot taken up front so that a real change can be detected
            # even if the creator dict is altered in place along the way.
            before = dict(current_author)
            chosen_name = current_author
            for new_author in authors:
                try:
                    if same_person_different_name(chosen_name, new_author):
                        previous = fullname(chosen_name)
                        candidate = fullname(new_author)
                        chosen_name = best_name(new_author, chosen_name)
                        print(f"{previous} / {candidate} => {fullname(chosen_name)}")
                except Exception as e:
                    eprint(f"something went wrong with the names: {e}")
            updated = dict(chosen_name)
            if 'creatorType' in before:
                # The chosen spelling may come from another publication where
                # the person has a different role; keep this item's role.
                updated['creatorType'] = before['creatorType']
            if updated != before:
                creators[author_index] = updated
                edited_pubs.add(pub_index)

    to_edit = [pubs[i] for i in sorted(edited_pubs)]
    if to_edit:
        yn = input(f"commit edits to {len(to_edit)} pubs (y/n)? ")
        if yn.lower() == 'y':
            if zot.update_items(to_edit):
                print("success!")
            else:
                print("something went wrong with the update")
    else:
        print("all names seem fine and authors appear to be consistent")
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
Here's an example; it's a JSON file:
{
"lib_id": "<library ID>",
"lib_type": "<library type>",
"api_key": "<api key>",
"exceptions": [
"<exception1>",
"<exception2>"
]
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Is there any documentation about the config file?