mhoban/fix_authors.py

## fix_authors.py
#!/usr/bin/env python3
"""
Usage: fix_authors.py (((-g | -u) (-k <api-key> -i <lib-id>)) | (-c <config-file>)) [-f -q -s <config-file> -n <to-ignore> ...]

Options:
  -g --group                      Library is a group library
  -u --user                       Library is a user library
  -k --key <api-key>              Zotero API key
  -i --id <lib-id>                Zotero library ID
  -c --config <config-file>       Specify config file (json) with library/API details
  -s --save-config <config-file>  (optional) Save specified options as a config file
  -n --ignore <to-ignore>         (optional) author names to ignore
  -f --fix-names                  (optional) try to fix messed up author names
  -q --quit                       (optional) quit after fixing author names

Details:
  * When passing library details on the command line, only -g OR -u may be specified (not both)
  * Similarly, either specify library details OR pass a config file
  * Some journals (lookin' at you, Science!) export authors as a single 'name' field rather
    than a last and first/initial combo, which messes things up. The fix-names option will
    attempt to fix this.
"""

import sys, os
import unicodedata
import Levenshtein
import json
from pyzotero import zotero
from docopt import docopt
from nameparser import HumanName


def eprint(*args, **kwargs):
  """Print to standard error.

  Takes the same positional and keyword arguments as the built-in print().
  Output goes to sys.stderr unless the caller passes an explicit `file=`.
  """
  # setdefault instead of a hard-coded file= so that a caller-supplied file=
  # does not raise "got multiple values for keyword argument 'file'"
  kwargs.setdefault('file', sys.stderr)
  print(*args, **kwargs)

# tries to fix names, modifies pubs list
def fix_names(pubs,exceptions=None):
  """Interactively split single-field creator names into last/first name.

  Every creator that carries only a 'name' field (and whose name is not in
  `exceptions`) is read as "<last> <given names>", parsed with HumanName and
  shown to the user for confirmation. Accepted suggestions replace the
  'name' field with 'firstName'/'lastName' in place.

  Prompt answers (case-insensitive): 'y' or an empty answer accepts, 'a'
  accepts this and every remaining suggestion for the current publication,
  anything else skips the name.

  Args:
    pubs: list of item dicts; creators are read from
      pub['data']['creators'] and modified in place.
    exceptions: names to leave untouched. None is treated as empty.

  Returns:
    Set of indices into `pubs` whose creators were modified.
  """
  ignored = exceptions if exceptions is not None else ()
  modified = set()
  for pub_index, pub in enumerate(pubs):
    title_printed = False
    accept_all = False  # 'a' only applies to the current publication
    # items without a creators field simply have nothing to fix
    for creator in pub['data'].get('creators', []):
      if 'name' not in creator or creator['name'] in ignored:
        continue
      try:
        old_name = creator['name']
        last_part, separator, given_part = old_name.strip().partition(' ')
        if not separator:
          # A single token (e.g. an institution) cannot be split; indexing
          # with find(' ') == -1 used to chop off its last character.
          continue
        if not title_printed:
          print(pub['data'].get('title', ''))
          title_printed = True
        name = HumanName(f"{last_part}, {given_part}")
        # join only the non-empty parts so that a missing middle name does
        # not leave a trailing space in the stored first name
        given = ' '.join(part for part in (name.first, name.middle) if part)
        suggestion = f"\t\"{old_name}\" => \"{name.last}\", \"{given}\""
        if accept_all:
          answer = 'y'
          print(suggestion)
        else:
          answer = input(suggestion + "? [Yna]").strip().lower()
        if answer in ('y', '', 'a'):
          if answer == 'a':
            accept_all = True
          del creator['name']
          creator['firstName'] = given
          creator['lastName'] = name.last
          modified.add(pub_index)
      except KeyboardInterrupt:
        print("\nquitting")
        sys.exit(1)
      except Exception as e:
        # best effort: report the problem and move on to the next creator
        print(f"something went wrong: {e}")
  return modified


def only_uppers(s):
  """Return the uppercase characters of `s`, in order, joined into one string."""
  return ''.join(filter(str.isupper, s))

def remove_accents(s):
  """Replace accented characters by their unaccented counterparts.

  The string is decomposed (NFD) so that every accented letter becomes a
  base letter followed by combining marks, the combining marks are dropped,
  and the result is recomposed (NFC). Characters without a canonical
  decomposition (e.g. 'ø') are returned unchanged.
  """
  decomposed = unicodedata.normalize('NFD', s)
  stripped = ''.join(c for c in decomposed if not unicodedata.combining(c))
  # recompose so that scripts which NFD splits into non-combining pieces
  # (e.g. Hangul syllables) come back in their original form
  return unicodedata.normalize('NFC', stripped)

def fullname(person):
  """Return a display name for a creator.

  Gives "<lastName>, <firstName>" when both keys are present, otherwise the
  single 'name' field, otherwise the empty string. A value that cannot be
  indexed by key (e.g. None) also yields the empty string.
  """
  # Only "key missing" / "not a mapping" are expected here; a bare except
  # would also swallow KeyboardInterrupt and SystemExit.
  try:
    return f"{person['lastName']}, {person['firstName']}"
  except (KeyError, TypeError):
    pass
  try:
    return person['name']
  except (KeyError, TypeError):
    return ""

def firstname(person):
  """Return the creator's 'firstName'.

  Falls back to the single 'name' field when there is no 'firstName', and to
  the empty string when neither key is available or `person` cannot be
  indexed by key (e.g. None).
  """
  # Only "key missing" / "not a mapping" are expected here; a bare except
  # would also swallow KeyboardInterrupt and SystemExit.
  try:
    return person['firstName']
  except (KeyError, TypeError):
    pass
  try:
    return person['name']
  except (KeyError, TypeError):
    return ""

def lastname(person):
  """Return the creator's 'lastName'.

  Yields the empty string when the key is missing or `person` cannot be
  indexed by key (e.g. None).
  """
  # Only "key missing" / "not a mapping" are expected here; a bare except
  # would also swallow KeyboardInterrupt and SystemExit.
  try:
    return person['lastName']
  except (KeyError, TypeError):
    return ""

def same_person_different_name(a, b):
  """Tell whether two creators look like spelling variants of one person.

  Returns True only when all of the following hold: the two full names are
  different, the uppercase letters of the two full names are the same, and
  the two last names are equal after being passed through remove_accents().
  """
  full_a, last_a = fullname(a), lastname(a)
  full_b, last_b = fullname(b), lastname(b)

  # identical spelling: nothing to reconcile
  if full_a == full_b:
    return False
  # different initials: treated as different people
  if only_uppers(full_a) != only_uppers(full_b):
    return False
  return remove_accents(last_a) == remove_accents(last_b)

def best_name(a, b):
  """Pick the best of two creator records that represent the same person.

  The record with the longer full name (ties go to `a`) is used as the base,
  since the longer form is the one that is not abbreviated. Its last name is
  then replaced by whichever of the two last names carries more accents
  (ties go to `b`).

  Args:
    a, b: creator dicts as read by fullname() / lastname().

  Returns:
    A new dict; neither `a` nor `b` is modified, so records that are shared
    with other publications are not changed behind the caller's back.
  """
  best_person = dict(a if len(fullname(a)) >= len(fullname(b)) else b)

  a_last_name = lastname(a)
  b_last_name = lastname(b)

  # The ratio against the accent-stripped form is 1.0 for a name without
  # accents and drops as accents are added, so the LOWER ratio marks the
  # name with more accents.
  a_accents_pct = Levenshtein.ratio(remove_accents(a_last_name), a_last_name)
  b_accents_pct = Levenshtein.ratio(remove_accents(b_last_name), b_last_name)

  best_last_name = a_last_name if a_accents_pct < b_accents_pct else b_last_name
  # Creators that only have a 'name' field have no last name; do not attach
  # an empty 'lastName' to them.
  if best_last_name:
    best_person['lastName'] = best_last_name

  return best_person

def main():
  """Command-line entry point.

  Parses the options described in the module docstring, builds the library
  configuration (from the command line or from a JSON config file),
  optionally saves it, fetches the library's items via zot.top(), optionally
  splits single-field author names (--fix-names), then unifies spelling
  variants of the same author across publications and, after confirmation,
  sends the edited items back with zot.update_items().

  Exits with status 1 on configuration or API errors.
  """
  opt = {arg.lstrip('-') : value for arg, value in docopt(__doc__).items()}
  config = {"lib_id": None, "lib_type": None, "api_key": None, "exceptions": None}

  if opt['config'] is not None:
    if not os.path.exists(opt['config']):
      eprint("config file does not exist")
      sys.exit(1)
    try:
      with open(opt['config'],"rt") as f:
        config = json.load(f)
    except Exception as e:
      eprint(f"json error: {e}")
      sys.exit(1)
  else:
    config['lib_id'] = opt['id']
    config['lib_type'] = 'group' if opt['group'] else 'user'
    config['api_key'] = opt['key']
    config['exceptions'] = opt['ignore']

  if opt['save-config'] is not None:
    with open(opt['save-config'],'w') as f:
      json.dump(config,f)

  try:
    zot = zotero.Zotero(config['lib_id'],config['lib_type'],config['api_key'])
  except Exception as e:
    eprint(f"Bad Zotero configuration: {e}")
    # nothing below can work without a client; previously execution went on
    # and failed later with a misleading "name 'zot' is not defined"
    sys.exit(1)

  edited_pubs = set()

  try:
    print("fetching pubs from zotero library...")
    # top() on its own returns a single page of results; everything()
    # follows the pagination so the whole library is processed
    pubs = zot.everything(zot.top())
  except Exception as e:
    eprint(f"Zotero API error: {e}")
    sys.exit(1)

  if opt['fix-names']:
    print("attempting to fix author names...")
    # a hand-written config file may have no "exceptions" entry at all
    edited_pubs = fix_names(pubs,exceptions=config.get('exceptions') or [])
    if opt['quit']:
      if edited_pubs:
        to_edit = [pubs[i] for i in edited_pubs]
        print("updating edited authors...")
        success = zot.update_items(to_edit)
        if success:
          print("success!")
        else:
          print("something went wrong")
      else:
        print("all names seem fine!")
      sys.exit(0)
    print(f"fixed {len(edited_pubs)} names")

  # every distinct creator record in the library (dicts are not hashable,
  # hence a list with a membership test rather than a set)
  authors = []
  for pub in pubs:
    # some items have no creators field
    for author in pub['data'].get('creators', []):
      if author not in authors:
        authors.append(author)

  for pub_index, pub in enumerate(pubs):
    creators = pub['data'].get('creators', [])
    for author_index, current_author in enumerate(creators):
      chosen_name = current_author
      for new_author in authors:
        try:
          if same_person_different_name(chosen_name,new_author):
            # capture both spellings before merging so that the log line
            # shows what was actually replaced
            previous_name = fullname(chosen_name)
            other_name = fullname(new_author)
            chosen_name = best_name(new_author,chosen_name)
            edited_pubs.add(pub_index)
            print(f"{previous_name} / {other_name} => {fullname(chosen_name)}")
        except Exception as e:
          # best effort: report and keep going with the remaining authors
          eprint(f"something went wrong with the names: {e}")
      creators[author_index] = chosen_name

  to_edit = [pubs[i] for i in edited_pubs]
  if to_edit:
    yn = input(f"commit edits to {len(to_edit)} pubs (y/n)? ")
    if yn.lower() == 'y':
      success = zot.update_items(to_edit)
      if success:
        print("success!")
      else:
        print("something went wrong with the update")
  else:
    print("all names seem fine and authors appear to be consistent")

# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
  main()
	#!/usr/bin/env python3
	"""
	Usage: fix_authors.py (((-g | -u) (-k <api-key> -i <lib-id>)) | (-c <config-file>)) [-f -q -s <config-file> -n <to-ignore> ...]

	Options:
	  -g --group                      Library is a group library
	  -u --user                       Library is a user library
	  -k --key <api-key>              Zotero API key
	  -i --id <lib-id>                Zotero library ID
	  -c --config <config-file>       Specify config file (json) with library/API details
	  -s --save-config <config-file>  (optional) Save specified options as a config file
	  -n --ignore <to-ignore>         (optional) author names to ignore
	  -f --fix-names                  (optional) try to fix messed up author names
	  -q --quit                       (optional) quit after fixing author names

	Details:
	* When passing library details on the command line, only -g OR -u may be specified (not both)
	* Similarly, either specify library details OR pass a config file
	* Some journals (lookin' at you, Science!) export authors as a single 'name' field rather
	than a last and first/initial combo, which messes things up. The fix-names option will
	attempt to fix this.
	"""

	import sys, os
	import unicodedata
	import Levenshtein
	import json
	from pyzotero import zotero
	from docopt import docopt
	from nameparser import HumanName


	def eprint(*args, **kwargs):
	  """Print to standard error.

	  Takes the same positional and keyword arguments as the built-in print().
	  Output goes to sys.stderr unless the caller passes an explicit `file=`.
	  """
	  # setdefault instead of a hard-coded file= so that a caller-supplied file=
	  # does not raise "got multiple values for keyword argument 'file'"
	  kwargs.setdefault('file', sys.stderr)
	  print(*args, **kwargs)

	# tries to fix names, modifies pubs list
	def fix_names(pubs,exceptions=None):
	  """Interactively split single-field creator names into last/first name.

	  Every creator that carries only a 'name' field (and whose name is not in
	  `exceptions`) is read as "<last> <given names>", parsed with HumanName and
	  shown to the user for confirmation. Accepted suggestions replace the
	  'name' field with 'firstName'/'lastName' in place.

	  Prompt answers (case-insensitive): 'y' or an empty answer accepts, 'a'
	  accepts this and every remaining suggestion for the current publication,
	  anything else skips the name.

	  Args:
	    pubs: list of item dicts; creators are read from
	      pub['data']['creators'] and modified in place.
	    exceptions: names to leave untouched. None is treated as empty.

	  Returns:
	    Set of indices into `pubs` whose creators were modified.
	  """
	  ignored = exceptions if exceptions is not None else ()
	  modified = set()
	  for pub_index, pub in enumerate(pubs):
	    title_printed = False
	    accept_all = False  # 'a' only applies to the current publication
	    # items without a creators field simply have nothing to fix
	    for creator in pub['data'].get('creators', []):
	      if 'name' not in creator or creator['name'] in ignored:
	        continue
	      try:
	        old_name = creator['name']
	        last_part, separator, given_part = old_name.strip().partition(' ')
	        if not separator:
	          # A single token (e.g. an institution) cannot be split; indexing
	          # with find(' ') == -1 used to chop off its last character.
	          continue
	        if not title_printed:
	          print(pub['data'].get('title', ''))
	          title_printed = True
	        name = HumanName(f"{last_part}, {given_part}")
	        # join only the non-empty parts so that a missing middle name does
	        # not leave a trailing space in the stored first name
	        given = ' '.join(part for part in (name.first, name.middle) if part)
	        suggestion = f"\t\"{old_name}\" => \"{name.last}\", \"{given}\""
	        if accept_all:
	          answer = 'y'
	          print(suggestion)
	        else:
	          answer = input(suggestion + "? [Yna]").strip().lower()
	        if answer in ('y', '', 'a'):
	          if answer == 'a':
	            accept_all = True
	          del creator['name']
	          creator['firstName'] = given
	          creator['lastName'] = name.last
	          modified.add(pub_index)
	      except KeyboardInterrupt:
	        print("\nquitting")
	        sys.exit(1)
	      except Exception as e:
	        # best effort: report the problem and move on to the next creator
	        print(f"something went wrong: {e}")
	  return modified


	def only_uppers(s):
	  """Return the uppercase characters of `s`, in order, joined into one string."""
	  return ''.join(c for c in s if c.isupper())

	def remove_accents(s):
	  """Replace accented characters by their unaccented counterparts.

	  The string is decomposed (NFD) so that every accented letter becomes a
	  base letter followed by combining marks, the combining marks are dropped,
	  and the result is recomposed (NFC). Characters without a canonical
	  decomposition (e.g. 'ø') are returned unchanged.
	  """
	  decomposed = unicodedata.normalize('NFD', s)
	  stripped = ''.join(c for c in decomposed if not unicodedata.combining(c))
	  # recompose so that scripts which NFD splits into non-combining pieces
	  # (e.g. Hangul syllables) come back in their original form
	  return unicodedata.normalize('NFC', stripped)

	def fullname(person):
	  """Return a display name for a creator.

	  Gives "<lastName>, <firstName>" when both keys are present, otherwise the
	  single 'name' field, otherwise the empty string. A value that cannot be
	  indexed by key (e.g. None) also yields the empty string.
	  """
	  # Only "key missing" / "not a mapping" are expected here; a bare except
	  # would also swallow KeyboardInterrupt and SystemExit.
	  try:
	    return f"{person['lastName']}, {person['firstName']}"
	  except (KeyError, TypeError):
	    pass
	  try:
	    return person['name']
	  except (KeyError, TypeError):
	    return ""

	def firstname(person):
	  """Return the creator's 'firstName'.

	  Falls back to the single 'name' field when there is no 'firstName', and to
	  the empty string when neither key is available or `person` cannot be
	  indexed by key (e.g. None).
	  """
	  # Only "key missing" / "not a mapping" are expected here; a bare except
	  # would also swallow KeyboardInterrupt and SystemExit.
	  try:
	    return person['firstName']
	  except (KeyError, TypeError):
	    pass
	  try:
	    return person['name']
	  except (KeyError, TypeError):
	    return ""

	def lastname(person):
	  """Return the creator's 'lastName'.

	  Yields the empty string when the key is missing or `person` cannot be
	  indexed by key (e.g. None).
	  """
	  # Only "key missing" / "not a mapping" are expected here; a bare except
	  # would also swallow KeyboardInterrupt and SystemExit.
	  try:
	    return person['lastName']
	  except (KeyError, TypeError):
	    return ""

	def same_person_different_name(a, b):
	  """Tell whether two creators look like spelling variants of one person.

	  Returns True only when all of the following hold: the two full names are
	  different, the uppercase letters of the two full names are the same, and
	  the two last names are equal after being passed through remove_accents().
	  """
	  a_full_name = fullname(a)
	  a_last_name = lastname(a)
	  b_full_name = fullname(b)
	  b_last_name = lastname(b)

	  return (a_full_name != b_full_name
	          and only_uppers(a_full_name) == only_uppers(b_full_name)
	          and remove_accents(a_last_name) == remove_accents(b_last_name))

	def best_name(a, b):
	  """Pick the best of two creator records that represent the same person.

	  The record with the longer full name (ties go to `a`) is used as the base,
	  since the longer form is the one that is not abbreviated. Its last name is
	  then replaced by whichever of the two last names carries more accents
	  (ties go to `b`).

	  Args:
	    a, b: creator dicts as read by fullname() / lastname().

	  Returns:
	    A new dict; neither `a` nor `b` is modified, so records that are shared
	    with other publications are not changed behind the caller's back.
	  """
	  best_person = dict(a if len(fullname(a)) >= len(fullname(b)) else b)

	  a_last_name = lastname(a)
	  b_last_name = lastname(b)

	  # The ratio against the accent-stripped form is 1.0 for a name without
	  # accents and drops as accents are added, so the LOWER ratio marks the
	  # name with more accents.
	  a_accents_pct = Levenshtein.ratio(remove_accents(a_last_name), a_last_name)
	  b_accents_pct = Levenshtein.ratio(remove_accents(b_last_name), b_last_name)

	  best_last_name = a_last_name if a_accents_pct < b_accents_pct else b_last_name
	  # Creators that only have a 'name' field have no last name; do not attach
	  # an empty 'lastName' to them.
	  if best_last_name:
	    best_person['lastName'] = best_last_name

	  return best_person

	def main():
	  """Command-line entry point.

	  Parses the options described in the module docstring, builds the library
	  configuration (from the command line or from a JSON config file),
	  optionally saves it, fetches the library's items via zot.top(), optionally
	  splits single-field author names (--fix-names), then unifies spelling
	  variants of the same author across publications and, after confirmation,
	  sends the edited items back with zot.update_items().

	  Exits with status 1 on configuration or API errors.
	  """
	  opt = {arg.lstrip('-') : value for arg, value in docopt(__doc__).items()}
	  config = {"lib_id": None, "lib_type": None, "api_key": None, "exceptions": None}

	  if opt['config'] is not None:
	    if not os.path.exists(opt['config']):
	      eprint("config file does not exist")
	      sys.exit(1)
	    try:
	      with open(opt['config'],"rt") as f:
	        config = json.load(f)
	    except Exception as e:
	      eprint(f"json error: {e}")
	      sys.exit(1)
	  else:
	    config['lib_id'] = opt['id']
	    config['lib_type'] = 'group' if opt['group'] else 'user'
	    config['api_key'] = opt['key']
	    config['exceptions'] = opt['ignore']

	  if opt['save-config'] is not None:
	    with open(opt['save-config'],'w') as f:
	      json.dump(config,f)

	  try:
	    zot = zotero.Zotero(config['lib_id'],config['lib_type'],config['api_key'])
	  except Exception as e:
	    eprint(f"Bad Zotero configuration: {e}")
	    # nothing below can work without a client; previously execution went on
	    # and failed later with a misleading "name 'zot' is not defined"
	    sys.exit(1)

	  edited_pubs = set()

	  try:
	    print("fetching pubs from zotero library...")
	    # top() on its own returns a single page of results; everything()
	    # follows the pagination so the whole library is processed
	    pubs = zot.everything(zot.top())
	  except Exception as e:
	    eprint(f"Zotero API error: {e}")
	    sys.exit(1)

	  if opt['fix-names']:
	    print("attempting to fix author names...")
	    # a hand-written config file may have no "exceptions" entry at all
	    edited_pubs = fix_names(pubs,exceptions=config.get('exceptions') or [])
	    if opt['quit']:
	      if edited_pubs:
	        to_edit = [pubs[i] for i in edited_pubs]
	        print("updating edited authors...")
	        success = zot.update_items(to_edit)
	        if success:
	          print("success!")
	        else:
	          print("something went wrong")
	      else:
	        print("all names seem fine!")
	      sys.exit(0)
	    print(f"fixed {len(edited_pubs)} names")

	  # every distinct creator record in the library (dicts are not hashable,
	  # hence a list with a membership test rather than a set)
	  authors = []
	  for pub in pubs:
	    # some items have no creators field
	    for author in pub['data'].get('creators', []):
	      if author not in authors:
	        authors.append(author)

	  for pub_index, pub in enumerate(pubs):
	    creators = pub['data'].get('creators', [])
	    for author_index, current_author in enumerate(creators):
	      chosen_name = current_author
	      for new_author in authors:
	        try:
	          if same_person_different_name(chosen_name,new_author):
	            # capture both spellings before merging so that the log line
	            # shows what was actually replaced
	            previous_name = fullname(chosen_name)
	            other_name = fullname(new_author)
	            chosen_name = best_name(new_author,chosen_name)
	            edited_pubs.add(pub_index)
	            print(f"{previous_name} / {other_name} => {fullname(chosen_name)}")
	        except Exception as e:
	          # best effort: report and keep going with the remaining authors
	          eprint(f"something went wrong with the names: {e}")
	      creators[author_index] = chosen_name

	  to_edit = [pubs[i] for i in edited_pubs]
	  if to_edit:
	    yn = input(f"commit edits to {len(to_edit)} pubs (y/n)? ")
	    if yn.lower() == 'y':
	      success = zot.update_items(to_edit)
	      if success:
	        print("success!")
	      else:
	        print("something went wrong with the update")
	  else:
	    print("all names seem fine and authors appear to be consistent")

	# Run the command-line tool only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	  main()