Skip to content

Instantly share code, notes, and snippets.

@alexandrehuat
Last active January 14, 2022 02:22
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alexandrehuat/6d3263f73ccae87d0107977978316c02 to your computer and use it in GitHub Desktop.
Save alexandrehuat/6d3263f73ccae87d0107977978316c02 to your computer and use it in GitHub Desktop.
Mendeley-exported BibTeX cleaner
import sys
import re
from collections import Counter
import bibtexparser as bp
# ==============================================================================
# Process
# ==============================================================================
# Maps each lowercase three-letter month abbreviation to its month number (1-12).
MONTH_INT = {
    abbrev: number
    for number, abbrev in enumerate(
        ("jan", "feb", "mar", "apr", "may", "jun",
         "jul", "aug", "sep", "oct", "nov", "dec"),
        start=1,
    )
}
def mapchain(funcs, iterable):
    """Pipe every item of `iterable` through the functions of `funcs`, in order.

    Each function is layered on with ``map``, so nothing is computed until the
    result is consumed. When `funcs` is empty, `iterable` itself is returned.
    """
    chained = iterable
    for func in funcs or ():
        chained = map(func, chained)
    return chained
def rm_fields(entry, fields=("abstract", "keywords", "annote", "url")):
    """Delete the given fields from a BibTeX entry, in place.

    Args:
        entry: Mapping of field names to values for one entry.
        fields: Names of the fields to drop. Fields absent from `entry` are
            ignored. The default is an immutable tuple so that it cannot be
            altered between calls.

    Returns:
        The same `entry` object, to allow chaining.
    """
    for name in fields:
        entry.pop(name, None)
    return entry
def rm_httpsdoi(entry):
    """Strip DOI resolver URL prefixes from the ``doi`` field, in place.

    Removes ``https://doi.org/`` as well as its ``http://`` and ``dx.doi.org``
    variants (matched case-insensitively), leaving only the bare DOI. Entries
    without a ``doi`` field are left untouched.

    Returns:
        The same `entry` object, to allow chaining.
    """
    try:
        doi = entry["doi"]
    except KeyError:
        return entry
    entry["doi"] = re.sub(r"https?://(?:dx\.)?doi\.org/", '', doi,
                          flags=re.IGNORECASE)
    return entry
def correct_pages(entry):
    """Normalise the page-range separator of the ``pages`` field to ``--``.

    A value without any ``--`` gets each ``-`` doubled; a spaced ``" -- "``
    separator is then tightened to ``--``. Entries without a ``pages`` field
    are returned unchanged.
    """
    if "pages" not in entry:
        return entry
    pages = entry["pages"]
    if "--" not in pages:
        pages = pages.replace("-", "--")
    entry["pages"] = pages.replace(" -- ", "--")
    return entry
def correct_title(entry):
    r"""
    Remove end point if it exists. Escape '&'.

    A trailing ``.`` is dropped, as is a ``.`` sitting just before a closing
    ``}``. Every ``&`` not already preceded by a backslash is rewritten as
    ``\&``, so running the function twice does not double the escape.
    Entries without a ``title`` field, and empty titles, are left as they are.

    Returns:
        The same `entry` object, to allow chaining.
    """
    try:
        title = entry["title"]
    except KeyError:
        return entry
    # endswith() is safe on an empty string, unlike indexing with [-1].
    if title.endswith('.'):
        title = title[:-1]
    elif title.endswith('.}'):
        title = title[:-2] + '}'
    # The negative lookbehind skips ampersands that are already escaped.
    entry["title"] = re.sub(r"(?<!\\)&", r"\\&", title)
    return entry
def rm_escape_codes(word):
    r"""Reduce a LaTeX-escaped word to plain characters.

    Braces are removed first. The letter commands ``\i``, ``\j``, ``\l`` and
    ``\o`` are then replaced by the plain letters ``i``, ``j``, ``l`` and
    ``o`` so that the letter is not lost from the result. Finally the accent
    commands (a backslash followed by one of ``` ` ' ^ " H ~ c k = b . d r u v t```)
    are deleted, leaving the accented base letter in place.

    Args:
        word: Text possibly containing LaTeX escapes, e.g. ``M{\"u}ller``.

    Returns:
        The cleaned string; `word` itself is not modified.
    """
    cleaned = word.replace('{', '').replace('}', '')
    # Letter commands stand for a character of their own: keep that letter.
    for letter in "ijlo":
        cleaned = cleaned.replace("\\" + letter, letter)
    # Accent commands only decorate the following letter: drop them.
    for code in "`'^\"H~ck=b.druvt":
        cleaned = cleaned.replace("\\" + code, '')
    return cleaned
def _int_or(value, default):
    """Return ``int(value)``, or `default` when `value` cannot be converted."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return default


def _alias_sort_key(entry):
    """Build a string key that orders entries by year, month, day, then title.

    Missing ``year``/``month``/``day`` fields fall back to 9999/99/99 and a
    missing ``title`` to ``"zzzzzzzzzz"``. A date field whose value is not an
    integer (for instance a month name or a day range) uses the same fallback
    instead of raising ``ValueError``.
    """
    return "{:04}{:02}{:02}{}".format(
        _int_or(entry.get("year", 9999), 9999),
        _int_or(entry.get("month", 99), 99),
        _int_or(entry.get("day", 99), 99),
        entry.get("title", 10 * 'z'),
    )
def format_aliases_authyear(entries):
    """Rewrite each entry's ``ID`` as ``<FirstAuthor><year>``, in place.

    The name is the text of the ``author`` field before the first ``" and "``
    and before the first ``", "``, with spaces and LaTeX escapes removed and
    its first character upper-cased. Entries are visited in the order given
    by ``_alias_sort_key``. The first entry to produce a given ID keeps it
    bare; the second gets the suffix ``a``, the third ``b``, and so on.

    Entries that have no ``author`` or no ``year``, or whose name is empty
    after cleaning, keep their existing ``ID`` instead of raising.

    Returns:
        The same `entries` object, in its original order.
    """
    alias_counts = Counter()
    for entry in sorted(entries, key=_alias_sort_key):
        author = entry.get("author")
        year = entry.get("year")
        if not author or not year:
            continue
        first_author = author.split(" and ")[0]
        name = rm_escape_codes(first_author.split(", ")[0].replace(' ', ''))
        if not name:
            continue
        alias = name[0].upper() + name[1:] + year
        alias_counts[alias] += 1
        seen = alias_counts[alias]
        if seen > 1:
            # Second occurrence -> 'a', third -> 'b', ...
            alias += chr(ord('a') + seen - 2)
        entry["ID"] = alias
    return entries
def month2num(entry):
    """Replace a month name in the ``month`` field by its number, in place.

    The field is searched, case-insensitively, for each abbreviation in
    ``MONTH_INT``; on the first hit the whole field becomes that month's
    number as a string. Values containing no abbreviation, and entries
    without a ``month`` field, are left unchanged.

    Returns:
        The same `entry` object, to allow chaining.
    """
    try:
        month = entry["month"].lower()
    except KeyError:
        return entry
    for abbrev, number in MONTH_INT.items():
        if abbrev in month:
            entry["month"] = str(number)
            break
    return entry
def correct_issn(entry):
    """Reduce the ``issn`` field to a single ``NNNN-NNNC`` value, in place.

    The first run of seven digits followed by a digit or ``X`` (with an
    optional hyphen after the fourth digit) is extracted, and the hyphen is
    inserted when absent. A value in which no such run is found is left
    unchanged, as are entries without an ``issn`` field.

    Returns:
        The same `entry` object, to allow chaining.
    """
    try:
        raw = entry["issn"]
    except KeyError:
        return entry
    # Digits only (plus the X check character) so ordinary words never match.
    found = re.search(r"[0-9]{4}-?[0-9]{3}[0-9Xx]", raw)
    if found:
        digits = found.group(0).replace('-', '')
        entry["issn"] = digits[:4] + '-' + digits[4:]
    return entry
# ==============================================================================
# Tests
# ==============================================================================
def strfentry(entry):
    """Render an entry as text, one ``field value`` line per field.

    Field names are padded to 16 characters and the lines appear in the
    reverse of the entry's iteration order.
    """
    lines = []
    for field, value in entry.items():
        lines.append("{:16}{}".format(field, value))
    lines.reverse()
    return '\n'.join(lines)
def print_fail(message, entry, fatal=True):
    """Print a failure report for `entry`.

    The first line is `message` prefixed with ``ERROR:`` when `fatal` is true
    and ``WARNING:`` otherwise; it is followed by the entry, formatted by
    ``strfentry``, between two 80-character dashed rules.
    """
    if fatal:
        label = "ERROR:"
    else:
        label = "WARNING:"
    rule = '-' * 80
    print(label, message)
    print("Entry:", rule, strfentry(entry), rule, sep='\n')
def test_rm_fields(entry, fields=["abstract", "keywords", "annote", "url"]):
for f in fields:
if f in entry.keys():
print_fail("Field '{}' not removed.".format(f), entry)
return False
return True
def test_month2num(entry):
try:
if not 1 <= int(entry["month"]) <= 12:
print_fail("month is out of range [1, 12].", entry)
return False
except KeyError:
pass
except ValueError:
print_fail("month is not an integer.", entry)
return False
return True
def test_no_title_endp(entry):
try:
if '.' in entry["title"][-2:]:
print_fail("Title has an end punct.", entry)
return False
except KeyError:
pass
return True
def test_issn(entry):
try:
if not re.match("\w\w\w\w-\w\w\w\w$", entry["issn"]):
print_fail("Non-standard ISSN format.", entry, fatal=False)
except KeyError:
pass
return True
# ==============================================================================
# Main
# ==============================================================================
def main(argv=None):
    """Clean a Mendeley-exported BibTeX file and write the result.

    Args:
        argv: Command line as ``[program, input_file, output_file]``.
            Defaults to ``sys.argv``.

    Returns:
        The process exit status: 0 when the cleaned file was written, 1 when
        a check failed and nothing was written, 2 on a usage error.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 3:
        print("ERROR: Wrong number of arguments.")
        print("Usage: python3 mendeley_bibtex_cleaner.py input_file.bib output_file.bib")
        return 2
    input_file, output_file = argv[1:3]
    # Read
    # Explicit encoding so the result does not depend on the platform default.
    with open(input_file, encoding="utf-8") as f:
        db = bp.load(f)
    # Clean
    db.entries = list(mapchain([rm_fields,
                                month2num,
                                rm_httpsdoi,
                                correct_pages,
                                correct_title,
                                correct_issn], db.entries))
    db.entries = format_aliases_authyear(db.entries)
    # Test cleaning
    success = True
    for e in db.entries:
        # A list (not a generator) is passed to all() on purpose: every check
        # runs and prints its report even after an earlier one has failed.
        success &= all([test_rm_fields(e),
                        test_month2num(e),
                        test_no_title_endp(e),
                        test_issn(e)])
    if not success:
        print(20 * "*~%^" + "\nCleaning failed. No file was output.")
        return 1
    # Write
    with open(output_file, 'w', encoding="utf-8") as f:
        bp.dump(db, f)
    print("Cleaning successful.")
    return 0


if __name__ == "__main__":
    sys.exit(main())
@alexandrehuat
Copy link
Author

alexandrehuat commented Apr 11, 2019

Usage

python3 path/to/mendeley_bibtex_cleaner.py input_file.bib output_file.bib

Suggested alias to add to your .bashrc or .zshrc:

alias cleanmbib="python3 path/to/mendeley_bibtex_cleaner.py"

Requirements

  • bibtexparser

Install on Linux (system-wide)

sudo apt install -y python3-pip && sudo -H pip3 install bibtexparser

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment