adefossez/clean_bib.py

## clean_bib.py
#!/usr/bin/env python3
# Author: Alexandre Défossez, 2020
# This is free and unencumbered software released into the public domain.
# For more information, please refer to <http://unlicense.org/>
"""
Merge multiple bibfiles, remove duplicates and unused references, matching bibtex entries
based on the 'title' field. Rewrite all the .tex files in the current directory
to reflect the elimination of duplicates.
Finally, this will rewrite all the arXiv references to use the @unpublished category.

To use this, go to the main() function where everything is hardcoded and update it
to suit your case :D
IMPORTANT: Make a copy of your files before running this, as this script will
overwrite all the .tex files and I can't guarantee it is bug proof.
"""
import glob
import re
import sys


def colorize(text, color):
    """
    Surround `text` with the ANSI escape sequence selecting `color` and
    the sequence that resets all attributes afterwards.
    `color` is the numeric code given as a string (for instance "1"),
    see the following link for a list of codes:
    https://stackoverflow.com/questions/4842424/list-of-ansi-color-escape-sequences
    """
    prefix = "\033[{}m".format(color)
    reset = "\033[0m"
    return prefix + text + reset

def bold(text):
    """
    Return `text` wrapped so that a terminal displays it in bold.
    """
    # "1" is the ANSI code handed to `colorize` to select bold.
    bold_code = "1"
    return colorize(text, bold_code)


def fatal(in_bold, *args):
    """
    Report an error on stderr and terminate the program with status 1.
    `in_bold` is printed in bold, followed by `args` which are forwarded
    to `print` as is.
    """
    headline = bold(in_bold)
    print(headline, *args, file=sys.stderr)
    # Same effect as `sys.exit(1)`, which raises this very exception.
    raise SystemExit(1)


def consumer(buf):
    """
    Build a stateful `consume` function reading from the string `buf`.
    Each call to `consume(pattern)` tries to match the regex `pattern`
    at the very start of what is left of the text (with the MULTILINE
    and DOTALL flags). On success, the matched prefix is dropped from
    the remaining text and the match object is returned. On failure,
    `None` is returned and the remaining text is left untouched.
    """
    flags = re.MULTILINE | re.DOTALL
    remaining = buf

    def consume(pattern):
        nonlocal remaining
        found = re.match(pattern, remaining, flags)
        if found is not None:
            # Advance past whatever was just matched.
            remaining = remaining[found.end():]
        return found

    return consume


def consume_block(consume):
    """
    Read from a bibtex consumer up to the `}` closing the current block
    and return what was read, without that final `}`.
    Nested blocks such as `a {inner} b}` are kept verbatim, braces
    included. The opening `{` of the block is expected to have been
    consumed by the caller already. Calls `fatal` when the text runs out
    before the block is closed.
    """
    pieces = []
    # Number of inner `{` seen that are still waiting for their `}`.
    depth = 0
    while True:
        match = consume("([^{}]*)([{}])")
        if not match:
            fatal("Missing }", consume(r".{,50}").group(0))
        text, brace = match.group(1), match.group(2)
        pieces.append(text)
        if brace == "{":
            depth += 1
        elif depth == 0:
            # This `}` closes the block we were asked to read.
            return "".join(pieces)
        else:
            depth -= 1
        pieces.append(brace)


def parse_bib(bib):
    """
    Parse a bibtex from the string `bib`.
    Return a list of dict, with the following keys:
        - kind: lowercased of what is after the @.
        - alias: the name of the bibtex entry.
        - title, author, year, etc.: any field in the bibtex entry is also
          added, under its lowercased name. Values written as `{...}` keep
          their outer braces, other values must be a single word.

    Comments are only supported when at the beginning of the line and are
    added as a special entry of kind "comment" and with a key "content"
    containing the value of the comment without the initial '%'.
    Comments can also start with '#'.

    Whitespace left at the end of the input is ignored. Any other input that
    cannot be parsed is reported with `fatal`, which exits the program.
    """
    content = []
    consume = consumer(bib)
    # `consume` keeps its own cursor over the text, `bib` itself never
    # changes: the loop stops when neither a comment nor an entry matches.
    while True:
        match = consume(r"\s*^\s*[%#]([^\n]*)$")
        if match:
            content.append({"kind": "comment", "content": match.group(1)})
            continue
        match = consume(r"\s*@(\w+)\{\s*([^\s,]+)")
        if not match:
            break
        entry = {
            "alias": match.group(2),
            "kind": match.group(1).lower(),
        }
        content.append(entry)
        while True:
            # Either the `}` closing the entry (optionally after a trailing
            # comma), or the start of the next `, key = ` field.
            match = consume(r"\s*(?:,?\s*(})|,\s*(\w+)\s*=\s*({?))")
            if not match:
                fatal("Parsing error, next chars in buffer:",
                      consume(r".{,50}").group(0))
            if match.group(1) == "}":
                break
            key = match.group(2)
            if match.group(3) == "{":
                value = "{" + consume_block(consume) + "}"
            else:
                # The value is not surrounded by {},
                match = consume(r"\s*(\w+)")
                if not match:
                    fatal("Missing value for", entry['alias'], key)
                value = match.group(1)
            entry[key.lower()] = value
        consume(r"\s*")
    # Whitespace following a final comment (or a blank file) is not a leftover.
    consume(r"\s*")
    remaining = consume(r".{,50}").group(0)
    if remaining:
        fatal("Parsing error, unparsed leftovers:", remaining)
    return content


def find_used(root):
    """
    Search all the .tex files under `root` (sub directories included) for
    citations made with cite, citet and citep.
    Multiple comma separated citations are supported, whitespace around
    each key is ignored.
    Return the set of all the cited keys.
    """
    cite_pattern = re.compile(r'\\cite[tp]?{([^}]+)}')
    out = set()
    for tex in glob.glob(root + '/**/*.tex', recursive=True):
        # Close each file right after reading instead of leaking the handle.
        with open(tex) as f:
            buf = f.read()
        for match in cite_pattern.finditer(buf):
            out.update(key.strip() for key in match.group(1).split(","))
    return out


def replace(root, replacements):
    """
    Parse all the .tex files under the root folder, search for cite[tp]
    and apply replacement rules from `replacements`.
    `replacements` should be a dict `old_name: new_name`.
    Keys are matched ignoring the whitespace around them (as `find_used`
    does), and that whitespace is preserved in the output.
    .tex files are modified inplace, only the files whose content actually
    changes are written back.
    """
    cite_pattern = re.compile(r'\\(cite[tp]?){([^}]+)}')

    def rewrite(match):
        # Rebuild one \cite command with every replaced key swapped in.
        ncites = []
        for cite in match.group(2).split(","):
            key = cite.strip()
            if key in replacements:
                lead = len(cite) - len(cite.lstrip())
                cite = cite[:lead] + replacements[key] + cite[lead + len(key):]
            ncites.append(cite)
        return "\\" + match.group(1) + "{" + ",".join(ncites) + "}"

    for tex in glob.glob(root + '/**/*.tex', recursive=True):
        with open(tex) as f:
            original = f.read()
        updated = cite_pattern.sub(rewrite, original)
        if updated != original:
            # The context manager guarantees the new content is flushed.
            with open(tex, "w") as f:
                f.write(updated)


def remove_unsused(entries, used):
    """
    Drop from the list `entries`, in place, every entry that has a title
    and whose alias is not in `used`.
    Return the list of the aliases that were dropped, in their original
    order.
    """
    def is_unused(entry):
        # Entries without a title are left alone: they could be comments
        # or other weird bibtex.
        return 'title' in entry and entry['alias'] not in used

    removed = [entry['alias'] for entry in entries if is_unused(entry)]
    entries[:] = [entry for entry in entries if not is_unused(entry)]
    return removed


def normalize(title):
    """
    Return `title` without its outer braces and surrounding whitespace,
    in lower case, so that titles can be compared.
    """
    unbraced = title.strip("{}")
    trimmed = unbraced.strip()
    return trimmed.lower()

def is_same(a, b):
    """
    Tell whether the entries `a` and `b` have the same title once both
    titles went through `normalize`.
    """
    title_a = normalize(a['title'])
    title_b = normalize(b['title'])
    return title_a == title_b


def find_duplicates(entries, used):
    """
    Detect the used entries that duplicate an earlier entry, matching on
    the normalized 'title' field.
    For each title, the first entry of `entries` carrying it is the one
    kept. Any later entry with the same title whose alias is in `used` is
    removed from `entries` (in place), and the alias of the kept entry is
    added to `used`.
    Return a dict `removed_alias: kept_alias` describing the removals.
    """
    # First entry seen for each normalized title, i.e. the one to keep.
    first_by_title = {}
    for entry in entries:
        if 'title' in entry:
            first_by_title.setdefault(normalize(entry['title']), entry)

    replacements = {}
    kept = []
    for entry in entries:
        if 'title' in entry and entry['alias'] in used:
            candidate = first_by_title[normalize(entry['title'])]
            # Compare by identity: two distinct entries can be equal dicts.
            if candidate is not entry:
                replacements[entry['alias']] = candidate['alias']
                used.add(candidate['alias'])
                continue
        kept.append(entry)
    entries[:] = kept
    return replacements


def dumps_bib(entries):
    """
    Serialize the given entries as a single bibtex string.
    Entries of kind "comment" become a `%` line, every other entry becomes
    `@kind{alias,` followed by one tab indented `key=value,` line per
    remaining field. Each item is followed by a blank line.
    The entries themselves are not modified.
    """
    chunks = []
    for entry in entries:
        kind = entry['kind']
        if kind == 'comment':
            chunks.append('%' + entry['content'] + '\n\n')
        else:
            alias = entry['alias']
            header = f'@{kind}{{{alias},\n'
            fields = [
                f'\t{name}={value},\n'
                for name, value in entry.items()
                if name not in ('kind', 'alias')
            ]
            chunks.append(header + "".join(fields) + '}\n\n')
    return "".join(chunks)


def replace_arxiv(entries):
    """
    Rewrite, in place, the entries that refer to an arXiv preprint so that
    they use the kind 'unpublished' with a note `{Preprint on arXiv:<id>}`.
    Two forms are recognized, for entries with a title only:
        - 'techreport' with institution `{arXiv}`: the id is read from the
          'number' field and the 'institution' field is dropped. Such an
          entry without a 'number' field is left untouched.
        - 'article' whose journal looks like `arXiv preprint arXiv:<id>`:
          the 'journal' field is dropped.
    """
    journal_pattern = re.compile(r'{*(?:arXiv )?preprint arXiv:([\d.]+)}*$')
    for entry in entries:
        if 'title' not in entry:
            continue
        number = None
        kind = entry['kind']
        if kind == 'techreport' and entry.get('institution') == '{arXiv}':
            # Without the arXiv id no note can be built: keep the entry as is
            # rather than crashing on the missing field.
            if 'number' in entry:
                number = entry['number'].strip('{}')
                del entry['institution']
        elif kind == 'article' and 'journal' in entry:
            match = journal_pattern.match(entry['journal'])
            if match is not None:
                del entry['journal']
                number = match.group(1)

        if number is not None:
            entry['kind'] = 'unpublished'
            entry['note'] = '{Preprint on arXiv:' + number + '}'

def main():
    """
    Merge the hardcoded bib files, drop duplicated and unused references,
    update the citations of all the .tex files under the current folder
    and write the result to `clean_audio.bib` and `clean_optim.bib`.
    Everything is hardcoded here, update it to suit your case.
    """
    # Add all the bib files you ever used
    # Order is important, in case of duplicates, the first alias matching a title
    # will be kept as the final one.
    bibfiles = ["ref_optim.bib", "adam/references.bib", "adabatch/references.bib",
                "ref_audio.bib", "sing/references.bib", "demucs/references.bib"]
    refs = []
    for bibfile in bibfiles:
        # Close the bib file as soon as it is read.
        with open(bibfile) as f:
            entries = parse_bib(f.read())
        entries.insert(0, {"kind": "comment", "content": "%" * 79})
        entries.insert(0, {
            "kind": "comment",
            "content": " The following references were extracted from " + bibfile})
        entries.insert(0, {"kind": "comment", "content": "%" * 79})
        for entry in entries:
            # Remember where each entry comes from to split them back later.
            entry['_source'] = bibfile
        refs += entries

    # Find all used references in the current folder, and sub directories
    used = find_used(".")
    print(bold("Used references:"), list(used))
    # set up replacement rules for duplicates, also update used aliases
    replacements = find_duplicates(refs, used)
    print(bold("Duplicates found:"))
    for old, new in replacements.items():
        print(old, "->", new)
    # remove all unused entries
    removed = remove_unsused(refs, used)
    print(bold("Removed references:"), removed)

    # Apply replacement rules to all the cite/citet/citep in the current folder
    # This will override the tex files, so please save a copy before use.
    replace(".", replacements)
    # Replace all arxiv references to use the @unpublished kind, with a note
    replace_arxiv(refs)

    # Split back into multiple files based on whatever rule you hardcode.
    refs_optim = []
    refs_audio = []
    for ref in refs:
        # "adam/references.bib" -> "adam", "ref_optim.bib" -> "ref_optim"
        source = ref.pop('_source').split('.')[0].split('/')[0]
        if source in ["ref_optim", "adam", "adabatch"]:
            refs_optim.append(ref)
        else:
            refs_audio.append(ref)
    # Context managers make sure the output is flushed and closed.
    with open("clean_audio.bib", "w") as f:
        f.write(dumps_bib(refs_audio))
    with open("clean_optim.bib", "w") as f:
        f.write(dumps_bib(refs_optim))


if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	# Author: Alexandre Défossez, 2020
	# This is free and unencumbered software released into the public domain.
	# For more information, please refer to <http://unlicense.org/>
	"""
	Merge multiple bibfiles, remove duplicates and unused references, matching bibtex entries
	based on the 'title' field. Rewrite all the .tex files in the current directory
	to reflect the elimination of duplicates.
	Finally, this will rewrite all the arXiv references to use the @unpublished category.

	To use this, go to the main() function where everything is hardcoded and update it
	to suite your case :D
	IMPORTANT: Make a copy of your files before running this, as this script will
	overwrite all the .tex files and I can't garantee it is bug proof.
	"""
	import glob
	import re
	import sys


	def colorize(text, color):
	"""
	Return `text` wrapped in ANSI color code sequence, with the given color.
	`color` should be a string, see the following link for a reference:
	https://stackoverflow.com/questions/4842424/list-of-ansi-color-escape-sequences
	"""
	code = f"\033[{color}m"
	restore = f"\033[0m"
	return "".join([code, text, restore])

	def bold(text):
	"""
	Wrap text so as to display it in bold in the terminal.
	"""
	return colorize(text, "1")


	def fatal(in_bold, *args):
	print(bold(in_bold), *args, file=sys.stderr)
	sys.exit(1)


	def consumer(buf):
	"""
	Returns a `consume` function over the given string `buf`.
	The returned function `consume` can be repeatidly called with
	a single regex pattern as argument. It will match the pattern
	on the current value of `buf` and if it gets a match, will remove
	the matched prefix from `buf` and return the match. Otherwise, return
	`None` and does not change `buf`.
	"""
	def consume(pattern):
	nonlocal buf
	match = re.match(pattern, buf, re.MULTILINE \| re.DOTALL)
	if match is None:
	return None
	buf = buf[match.end():]
	return match
	return consume


	def consume_block(consume):
	"""
	Consume a block of the form `content}` from a bibtex consumer.
	Inner blocks like `a {inner} b}` are also matched and added
	verbatim to the output.
	Note that this function will not match the opening `{`.
	"""
	content = ""
	while True:
	match = consume("([^{}]*)([{}])")
	if not match:
	fatal("Missing }", consume(r".{,50}").group(0))
	content += match.group(1)
	if match.group(2) == "{":
	sub = consume_block(consume)
	content += "{" + sub + "}"
	else:
	return content


	def parse_bib(bib):
	"""
	Parse a bibtex from the string `bib`.
	Return a list of dict, with the following keys:
	- kind: lowercased of what is after the @.
	- alias: the name of the bibtex entry.
	- title, author, year, etc.: any entry in the bibtex is also added.

	Comments are only supported when at the beginning of the line and are
	added as a special entry of kind "comment" and with a key "content"
	containing the value of the comment without the initial '%'.
	Comments can also start with '#'
	"""
	content = []
	consume = consumer(bib)
	while bib:
	match = consume(r"\s^\s[%#]([^\n]*)$")
	if match:
	content.append({"kind": "comment", "content": match.group(1)})
	continue
	match = consume(r"\s@(\w+)\{\s([^\s,]+)")
	if not match:
	break
	entry = {
	"alias": match.group(2),
	"kind": match.group(1).lower(),
	}
	content.append(entry)
	while True:
	match = consume(r"\s(?:,?\s(})\|,\s(\w+)\s=\s*({?))")
	if not match:
	fatal("Parsing error, next chars in buffer:",
	consume(r".{,50}").group(0))
	sys.exit(1)
	if match.group(1) == "}":
	break
	key = match.group(2)
	if match.group(3) == "{":
	value = "{" + consume_block(consume) + "}"
	else:
	# The value is not surrounded by {},
	match = consume(r"\s*(\w+)")
	if not match:
	fatal("Missing value for", entry['alias'], key)
	value = match.group(1)
	entry[key.lower()] = value
	consume(r"\s*")
	remaining = consume(r".{,50}").group(0)
	if remaining:
	fatal("Parsing error, unparsed leftovers:", remaining)
	return content


	def find_used(root):
	"""
	Search all tex files for citation with cite, citet and citep.
	Multiple comma separated citations are supported.
	"""
	texs = glob.glob(root + '/*/.tex', recursive=True)
	out = set([])
	for tex in texs:
	buf = open(tex).read()
	for match in re.finditer(r'\\cite[tp]?{([^}]+)}', buf):
	cites = [i.strip() for i in match.group(1).split(",")]
	out \|= set(cites)
	return out


	def replace(root, replacements):
	"""
	Parse all the .tex files under the root folder, search for cite[tp]
	and apply replacement rules from `replacements`.
	`replacements` should be a dict `old_name: new_name`.
	.tex files are modified inplace.
	"""
	texs = glob.glob(root + '/*/.tex', recursive=True)
	for tex in texs:
	buf = open(tex).read()
	processed = []
	while buf:
	match = re.search(r'\\(cite[tp]?){([^}]+)}', buf)
	if not match:
	break
	processed.append(buf[:match.start()])
	buf = buf[match.end():]
	command = match.group(1)
	cites = match.group(2).split(",")
	ncites = []
	for cite in cites:
	if cite in replacements:
	cite = replacements[cite]
	ncites.append(cite)
	o = "\\" + command + "{" + ",".join(ncites) + "}"
	processed.append(o)
	processed.append(buf)
	open(tex, "w").write("".join(processed))


	def remove_unsused(entries, used):
	removed = []
	for entry in list(entries):
	# only remove things with titles, the rest could be comments
	# or other weird bibtex
	if 'title' in entry and entry['alias'] not in used:
	entries.remove(entry)
	removed.append(entry['alias'])
	return removed


	def normalize(title):
	return title.strip("{}").strip().lower()

	def is_same(a, b):
	return normalize(a['title']) == normalize(b['title'])


	def find_duplicates(entries, used):
	replacements = {}
	for entry in list(entries):
	if 'title' not in entry:
	continue
	if entry['alias'] not in used:
	continue
	found = False
	for candidate in entries:
	if 'title' not in candidate:
	continue
	if is_same(candidate, entry):
	break
	if candidate is not entry:
	replacements[entry['alias']] = candidate['alias']
	entries.remove(entry)
	used.add(candidate['alias'])
	return replacements


	def dumps_bib(entries):
	"""
	Format the given entries to bibtex.
	"""
	out = []
	for entry in entries:
	if entry['kind'] == 'comment':
	out.append('%' + entry['content'] + '\n\n')
	continue
	entry = dict(entry)
	kind = entry.pop('kind')
	alias = entry.pop('alias')
	o = f'@{kind}{{{alias},\n'
	for k, v in entry.items():
	o += f'\t{k}={v},\n'
	o += '}\n\n'
	out.append(o)
	return "".join(out)


	def replace_arxiv(entries):
	for entry in entries:
	if 'title' not in entry:
	continue
	number = None
	if entry['kind'] == 'techreport' and entry.get('institution') == '{arXiv}':
	number = entry['number'].strip('{}')
	del entry['institution']
	elif entry['kind'] == 'article' and 'journal' in entry:
	match = re.match(r'{(?:arXiv )?preprint arXiv:([\d.]+)}$', entry['journal'])
	if match is not None:
	del entry['journal']
	number = match.group(1)

	if number is not None:
	entry['kind'] = 'unpublished'
	entry['note'] = '{Preprint on arXiv:' + number + '}'

	def main():
	# Add all the bib files you ever used
	# Order is important, in case of duplicates, the first alias matching a title
	# will be kept as the final one.
	bibfiles = ["ref_optim.bib", "adam/references.bib", "adabatch/references.bib",
	"ref_audio.bib", "sing/references.bib", "demucs/references.bib"]
	refs = []
	for bibfile in bibfiles:
	entries = parse_bib(open(bibfile).read())
	entries.insert(0, {"kind": "comment", "content": "%" * 79})
	entries.insert(0, {
	"kind": "comment",
	"content": " The following references were extracted from " + bibfile})
	entries.insert(0, {"kind": "comment", "content": "%" * 79})
	for entry in entries:
	entry['_source'] = bibfile
	refs += entries

	# Find all used references in the current folder, and sub directories
	used = find_used(".")
	print(bold("Used references:"), list(used))
	# set up replacement rules for duplicates, also update used aliases
	replacements = find_duplicates(refs, used)
	print(bold("Duplicates found:"))
	for old, new in replacements.items():
	print(old, "->", new)
	# remove all unused entries
	removed = remove_unsused(refs, used)
	print(bold("Removed references:"), removed)

	# Apply replacement rules to all the cite/citet/citep in the current folder
	# This will override the tex files, so please save a copy before use.
	replace(".", replacements)
	# Replace all arxiv references to use the @unpublished kind, with a note
	replace_arxiv(refs)

	# Split back into multiple files based on whatever rule you hardcode.
	refs_optim = []
	refs_audio = []
	for ref in refs:
	source = ref.pop('_source').split('.')[0].split('/')[0]
	if source in ["ref_optim", "adam", "adabatch"]:
	refs_optim.append(ref)
	else:
	refs_audio.append(ref)
	open("clean_audio.bib", "w").write(dumps_bib(refs_audio))
	open("clean_optim.bib", "w").write(dumps_bib(refs_optim))

	if __name__ == "__main__":
	main()