Skip to content

Instantly share code, notes, and snippets.

@mtreviso
Forked from TomWagg/removed_unused_bibs.py
Last active June 27, 2024 11:13
Show Gist options
  • Save mtreviso/25dc2c3987554795f76f1ed7efe34ff4 to your computer and use it in GitHub Desktop.
Save mtreviso/25dc2c3987554795f76f1ed7efe34ff4 to your computer and use it in GitHub Desktop.
bib reducer: Reduce and combine multiple .bib files to a single file with only *cited* entries. Works with ACL anthology.bib.
"""
>>> python remove_unused_bibs.py -h
usage: remove_unused_bibs.py [-h] [-b BIB_FILES] [-p PAPER_FILES] [-o OUT_FILE]
Remove unused bibtex entries
options:
-h, --help show this help message and exit
-b BIB_FILES, --bib_files BIB_FILES
Comma-separated list of bib files (default: paper.bib)
-p PAPER_FILES, --paper_files PAPER_FILES
Comma-separated list of paper files (default: paper.tex)
-o OUT_FILE, --out_file OUT_FILE
Output path for the minimal bib file (default: min.bib)
"""
import re
import argparse
def collect_all_bibs(bib_files):
# regex for anything that matches "@[some word]{[something],"
bibtex_key_regex = r"@\w*{.*(?=\,)"
# convert strings to lists in case only one file
if isinstance(bib_files, str):
bib_files = [bib_files]
# track the combined bib file as a string
all_bib_string = ""
all_keys = []
# read in each bib file
for file in bib_files:
with open(file, "r") as f:
bib_string = f.read()
all_bib_string += bib_string
# find all of the keys that match the regex
matches = re.findall(pattern=bibtex_key_regex, string=bib_string)
# trim off the preceding @whatever
all_keys += [m.split("{")[1] for m in matches]
# return unique set of keys and entire bib string
return sorted(list(set(all_keys))), all_bib_string
def check_bibs_used(paper_files, keys):
tex_regex = r"\\cite.*{.*tom.*}"
if isinstance(paper_files, str):
paper_files = [paper_files]
# track whether each key is ever used
ever_used = [False for _ in range(len(keys))]
for file in paper_files:
with open(file, "r") as f:
paper_string = f.read()
# check each key in each file
for i, key in enumerate(keys):
# for markdown (JOSS paper) the check is simple
if file.endswith(".md") and f"@{key}" in paper_string:
ever_used[i] = True
# for latex we need to be a little smarter about checking for \cite commands
elif file.endswith(".tex") and re.search(tex_regex.replace("tom", re.escape(key)), paper_string) is not None:
ever_used[i] = True
return ever_used
def isolate_bibtex_entry(s, start):
# isolate a bibtex entry based on closing curly braces
braces, cursor, not_opened = 0, start, True
while braces > 0 or not_opened:
if s[cursor] == "{":
braces += 1
not_opened = False
elif s[cursor] == "}":
braces -= 1
cursor += 1
return s[start: cursor]
def create_min_bib_file(bib_string, keys, keys_used, out_file):
min_bib_string = ""
# keep @strings (useful for ACL Anthology)
for line in bib_string.split('\n'):
x = line.strip()
if x.startswith('@string{'):
min_bib_string += x + '\n'
for k, used in zip(keys, keys_used):
if used:
key_regex = r"@\w*{" + re.escape(k) + r"(?=\,)"
match = re.search(key_regex, bib_string)
min_bib_string += isolate_bibtex_entry(bib_string, match.start()) + "\n\n"
with open(out_file, "w") as f:
f.write(min_bib_string)
def remove_unused_bibs(bib_files, paper_files, out_file):
keys, bib_string = collect_all_bibs(bib_files=bib_files)
keys_used = check_bibs_used(paper_files=paper_files, keys=keys)
create_min_bib_file(bib_string=bib_string, keys=keys, keys_used=keys_used, out_file=out_file)
def main():
parser = argparse.ArgumentParser(description='Remove unused bibtex entries',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-b', '--bib_files', default="paper.bib", type=str,
help="Comma-separated list of bib files")
parser.add_argument('-p', '--paper_files', default="paper.tex", type=str,
help="Comma-separated list of paper files")
parser.add_argument('-o', '--out_file', default="min.bib", type=str,
help="Output path for the minimal bib file")
args = parser.parse_args()
bib_files = args.bib_files.split(",")
paper_files = args.paper_files.split(",")
remove_unused_bibs(bib_files=bib_files, paper_files=paper_files, out_file=args.out_file)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment