Skip to content

Instantly share code, notes, and snippets.

@adefossez
Last active March 3, 2023 14:33
Show Gist options
  • Save adefossez/85cd1b2183f63088aea463a0f36b3a2c to your computer and use it in GitHub Desktop.
Save adefossez/85cd1b2183f63088aea463a0f36b3a2c to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# Author: Alexandre Défossez, 2020
# This is free and unencumbered software released into the public domain.
# For more information, please refer to <http://unlicense.org/>
"""
Merge multiple bibfiles, remove duplicates and unused references, matching bibtex entries
based on the 'title' field. Rewrite all the .tex files in the current directory
to reflect the elimination of duplicates.
Finally, this will rewrite all the arXiv references to use the @unpublished category.
To use this, go to the main() function where everything is hardcoded and update it
to suit your case :D
IMPORTANT: Make a copy of your files before running this, as this script will
overwrite all the .tex files and I can't guarantee it is bug-proof.
"""
import glob
import re
import sys
def colorize(text, color):
    """
    Return `text` wrapped in ANSI color code sequence, with the given color.
    `color` should be a string holding the numeric ANSI code (e.g. "1" for
    bold, "31" for red), see the following link for a reference:
    https://stackoverflow.com/questions/4842424/list-of-ansi-color-escape-sequences
    """
    code = f"\033[{color}m"
    # Reset sequence: restores the terminal default attributes after `text`.
    restore = "\033[0m"
    return "".join([code, text, restore])
def bold(text):
    """
    Return `text` wrapped so that the terminal displays it in bold
    (ANSI attribute "1").
    """
    bold_code = "1"
    return colorize(text, bold_code)
def fatal(in_bold, *args):
    """
    Report an error on stderr and terminate the program with exit status 1.
    `in_bold` is printed in bold, followed by every value in `args`
    (space separated, as `print` does).
    """
    headline = bold(in_bold)
    print(headline, *args, file=sys.stderr)
    sys.exit(1)
def consumer(buf):
    """
    Returns a `consume` function over the given string `buf`.
    The returned function `consume` can be repeatedly called with
    a single regex pattern as argument. It will match the pattern
    at the start of the current value of `buf` (with `re.MULTILINE` and
    `re.DOTALL` enabled) and if it gets a match, will remove
    the matched prefix from `buf` and return the match. Otherwise, return
    `None` and does not change `buf`.
    """
    flags = re.MULTILINE | re.DOTALL

    def consume(pattern):
        # `buf` lives in the enclosing scope: each successful call shrinks it.
        nonlocal buf
        match = re.match(pattern, buf, flags)
        if match is None:
            return None
        buf = buf[match.end():]
        return match

    return consume
def consume_block(consume):
    """
    Consume a block of the form `content}` from a bibtex consumer.
    Inner blocks like `a {inner} b}` are also matched and added
    verbatim to the output.
    Note that this function will not match the opening `{`, and that the
    closing `}` is consumed but not part of the returned string.
    Abort through `fatal` if the buffer ends before the block is closed.
    """
    parts = []
    # Number of inner `{` currently open; the block ends on a `}` at depth 0.
    depth = 0
    while True:
        match = consume("([^{}]*)([{}])")
        if not match:
            fatal("Missing }", consume(r".{,50}").group(0))
        parts.append(match.group(1))
        if match.group(2) == "{":
            depth += 1
            parts.append("{")
        elif depth == 0:
            return "".join(parts)
        else:
            depth -= 1
            parts.append("}")
def parse_bib(bib):
    """
    Parse a bibtex from the string `bib`.
    Return a list of dict, with the following keys:
    - kind: lowercased version of what is after the @.
    - alias: the name of the bibtex entry.
    - title, author, year, etc.: any field in the bibtex entry is also added,
      with a lowercased key. Values given as `{...}` keep their outer braces.
    Comments are only supported when at the beginning of the line and are
    added as a special entry of kind "comment" and with a key "content"
    containing the value of the comment without the initial '%'.
    Comments can also start with '#'.
    Values that are not surrounded by braces must be a single word
    (e.g. `year = 2020`); anything else aborts the program through `fatal`,
    as does any leftover text that cannot be parsed.
    """
    content = []
    consume = consumer(bib)
    # The consumer owns its own copy of the buffer, so the loop can only
    # stop when no further entry can be matched.
    while True:
        match = consume(r"\s*^\s*[%#]([^\n]*)$")
        if match:
            content.append({"kind": "comment", "content": match.group(1)})
            continue
        match = consume(r"\s*@(\w+)\{\s*([^\s,]+)")
        if not match:
            break
        entry = {
            "alias": match.group(2),
            "kind": match.group(1).lower(),
        }
        content.append(entry)
        while True:
            # Either the closing `}` of the entry (optionally preceded by a
            # trailing comma), or `, key =` optionally followed by `{`.
            match = consume(r"\s*(?:,?\s*(})|,\s*(\w+)\s*=\s*({?))")
            if not match:
                fatal("Parsing error, next chars in buffer:",
                      consume(r".{,50}").group(0))
            if match.group(1) == "}":
                break
            key = match.group(2)
            if match.group(3) == "{":
                value = "{" + consume_block(consume) + "}"
            else:
                # The value is not surrounded by {},
                match = consume(r"\s*(\w+)")
                if not match:
                    fatal("Missing value for", entry['alias'], key)
                value = match.group(1)
            entry[key.lower()] = value
    consume(r"\s*")
    remaining = consume(r".{,50}").group(0)
    if remaining:
        fatal("Parsing error, unparsed leftovers:", remaining)
    return content
def find_used(root):
    """
    Search all tex files under `root` (recursively) for citations made
    with cite, citet and citep.
    Multiple comma separated citations are supported.
    Return the set of all cited names, stripped of surrounding whitespace.
    """
    pattern = re.compile(r'\\cite[tp]?{([^}]+)}')
    out = set()
    for tex in glob.glob(root + '/**/*.tex', recursive=True):
        # Close each file as soon as it has been read.
        with open(tex) as handle:
            buf = handle.read()
        for match in pattern.finditer(buf):
            out.update(name.strip() for name in match.group(1).split(","))
    return out
def replace(root, replacements):
    """
    Parse all the .tex files under the root folder, search for cite[tp]
    and apply replacement rules from `replacements`.
    `replacements` should be a dict `old_name: new_name`.
    Names are matched after stripping surrounding whitespace, so that
    `\\cite{a, b}` is handled like `\\cite{a,b}`; the original spacing is kept.
    .tex files are modified inplace, and only rewritten when a citation
    actually changed.
    """
    def rewrite(match):
        names = []
        for raw in match.group(2).split(","):
            name = raw.strip()
            if name and name in replacements:
                # Swap only the name itself, keeping the whitespace around it.
                raw = raw.replace(name, replacements[name], 1)
            names.append(raw)
        return "\\" + match.group(1) + "{" + ",".join(names) + "}"

    pattern = re.compile(r'\\(cite[tp]?){([^}]+)}')
    for tex in glob.glob(root + '/**/*.tex', recursive=True):
        with open(tex) as handle:
            original = handle.read()
        updated = pattern.sub(rewrite, original)
        if updated != original:
            with open(tex, "w") as handle:
                handle.write(updated)
def remove_unsused(entries, used):
    """
    Remove from `entries` (in place) every titled entry whose alias is not
    in `used`, and return the list of removed aliases, in order.
    """
    removed = []
    kept = []
    for entry in entries:
        # only remove things with titles, the rest could be comments
        # or other weird bibtex
        if 'title' in entry and entry['alias'] not in used:
            removed.append(entry['alias'])
        else:
            kept.append(entry)
    # Slice assignment keeps the caller's list object and updates it in place.
    entries[:] = kept
    return removed
def normalize(title):
    """
    Return a canonical form of a bibtex `title`, used to compare entries:
    runs of whitespace (including line breaks in wrapped titles) are
    collapsed to a single space, surrounding braces and spaces are dropped
    and the result is lowercased.
    """
    collapsed = " ".join(title.split())
    return collapsed.strip("{} ").lower()
def is_same(a, b):
    """
    Tell whether the entries `a` and `b` carry the same title,
    once both titles went through `normalize`.
    """
    title_a = normalize(a['title'])
    title_b = normalize(b['title'])
    return title_a == title_b
def find_duplicates(entries, used):
    """
    Detect used entries whose title duplicates an earlier entry.
    For each title, the first entry of `entries` carrying it is the one kept.
    Every later entry with the same (normalized) title whose alias is in
    `used` is removed from `entries` (in place), and the alias of the kept
    entry is added to the set `used`.
    Return a dict `old_alias: kept_alias` of the citations to rewrite.
    Unused duplicates are left alone here.
    """
    # First entry seen for each normalized title: it wins over later ones.
    first_by_title = {}
    for entry in entries:
        if 'title' in entry:
            first_by_title.setdefault(normalize(entry['title']), entry)

    replacements = {}
    duplicate_ids = set()
    for entry in entries:
        if 'title' not in entry:
            continue
        # `used` grows while iterating, the check must stay inside the loop.
        if entry['alias'] not in used:
            continue
        candidate = first_by_title[normalize(entry['title'])]
        if candidate is entry:
            continue
        if entry['alias'] != candidate['alias']:
            # Same alias on both sides would be a no-op rewrite rule.
            replacements[entry['alias']] = candidate['alias']
        duplicate_ids.add(id(entry))
        used.add(candidate['alias'])

    if duplicate_ids:
        # Drop by identity, so an equal-looking kept entry is never removed.
        entries[:] = [e for e in entries if id(e) not in duplicate_ids]
    return replacements
def dumps_bib(entries):
    """
    Format the given entries to bibtex and return the resulting string.
    Entries of kind "comment" are written as a `%` line; for any other entry,
    every key besides `kind` and `alias` is written as a `key=value` field,
    the value being emitted verbatim. `entries` is left untouched.
    """
    out = []
    for entry in entries:
        if entry['kind'] == 'comment':
            out.append('%' + entry['content'] + '\n\n')
            continue
        # Work on a copy so that popping does not alter the caller's entry.
        fields = dict(entry)
        kind = fields.pop('kind')
        alias = fields.pop('alias')
        lines = [f'@{kind}{{{alias},\n']
        lines.extend(f'\t{k}={v},\n' for k, v in fields.items())
        lines.append('}\n\n')
        out.append("".join(lines))
    return "".join(out)
def replace_arxiv(entries):
    """
    Rewrite, in place, the arXiv references of `entries` to the
    `unpublished` kind, with a `note` field holding the arXiv number.
    Two forms are recognized:
    - `techreport` entries whose institution is `{arXiv}` and which have
      a `number` field (entries without a number are left unchanged),
    - `article` entries whose journal looks like
      `arXiv preprint arXiv:1234.5678`, with an optional version suffix
      such as `v2`.
    Entries without a title are skipped.
    """
    for entry in entries:
        if 'title' not in entry:
            continue
        number = None
        if entry['kind'] == 'techreport' and entry.get('institution') == '{arXiv}':
            raw_number = entry.get('number')
            # Without a number there is nothing to build the note from.
            if raw_number is not None:
                number = raw_number.strip('{}')
                del entry['institution']
        elif entry['kind'] == 'article' and 'journal' in entry:
            match = re.match(
                r'{*(?:arXiv )?preprint arXiv:([\d.]+(?:v\d+)?)}*$',
                entry['journal'])
            if match is not None:
                del entry['journal']
                number = match.group(1)
        if number is not None:
            entry['kind'] = 'unpublished'
            entry['note'] = '{Preprint on arXiv:' + number + '}'
def main():
    """
    Merge the hardcoded bib files, drop duplicated and unused references,
    rewrite the citations of all the .tex files under the current directory
    and write the cleaned `clean_audio.bib` and `clean_optim.bib` files.
    Everything is hardcoded, update it to match your own files.
    """
    # Add all the bib files you ever used
    # Order is important, in case of duplicates, the first alias matching a title
    # will be kept as the final one.
    bibfiles = ["ref_optim.bib", "adam/references.bib", "adabatch/references.bib",
                "ref_audio.bib", "sing/references.bib", "demucs/references.bib"]
    refs = []
    for bibfile in bibfiles:
        with open(bibfile) as handle:
            entries = parse_bib(handle.read())
        # Header comments; each one must be a distinct dict, as `_source`
        # is set on, and later popped from, every entry individually.
        header = [
            {"kind": "comment", "content": "%" * 79},
            {"kind": "comment",
             "content": " The following references were extracted from " + bibfile},
            {"kind": "comment", "content": "%" * 79},
        ]
        entries = header + entries
        for entry in entries:
            entry['_source'] = bibfile
        refs += entries

    # Find all used references in the current folder, and sub directories
    used = find_used(".")
    print(bold("Used references:"), list(used))

    # set up replacement rules for duplicates, also update used aliases
    replacements = find_duplicates(refs, used)
    print(bold("Duplicates found:"))
    for old, new in replacements.items():
        print(old, "->", new)

    # remove all unused entries
    removed = remove_unsused(refs, used)
    print(bold("Removed references:"), removed)

    # Apply replacement rules to all the cite/citet/citep in the current folder
    # This will override the tex files, so please save a copy before use.
    replace(".", replacements)

    # Replace all arxiv references to use the @unpublished kind, with a note
    replace_arxiv(refs)

    # Split back into multiple files based on whatever rule you hardcode.
    refs_optim = []
    refs_audio = []
    for ref in refs:
        # First path component of the source file, without its extension.
        source = ref.pop('_source').split('.')[0].split('/')[0]
        if source in ["ref_optim", "adam", "adabatch"]:
            refs_optim.append(ref)
        else:
            refs_audio.append(ref)
    with open("clean_audio.bib", "w") as handle:
        handle.write(dumps_bib(refs_audio))
    with open("clean_optim.bib", "w") as handle:
        handle.write(dumps_bib(refs_optim))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment