Last active
December 13, 2022 01:44
-
-
Save agostini01/6dc35f978cd97b1a3f172e5740335788 to your computer and use it in GitHub Desktop.
Python script to parse a .bib file and print its @article and @inproceedings entries in a human-readable format
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# Author: Nicolas Bohm Agostini | |
# Date: 2022-Dec-12 | |
# Version: 1.1 | |
# License: MIT | |
# Changelog: | |
# 1.1 - added alternative way of printing | |
# @brief: This script parses a bib file and prints it in a specific format | |
# @usage: python3 parse-bib-file.py <bibfile> | |
# if no bibfile is provided, the script will look for a file named "works.bib" | |
# Dependencies: | |
# create a virtual environment | |
# python3 -m venv venv | |
# source venv/bin/activate | |
# pip install pybtex | |
import os | |
import re | |
from pybtex.database.input import bibtex | |
import argparse | |
# Function to load the data from a bib file into a dictionary | |
def load_bib(bibfile):
    """Parse a BibTeX file with pybtex and return the parsed database.

    Args:
        bibfile: Path of the ``.bib`` file to read.

    Returns:
        Whatever ``bibtex.Parser().parse_file`` returns. The rest of this
        script reads its ``entries`` mapping, whose values expose ``type``,
        ``fields`` and ``persons``.
    """
    # Shape of the parsed data, for reference (output of print(bib_data)):
    #   BibliographyData(
    #     entries=OrderedCaseInsensitiveDict([
    #       ('agostini2022sodaopt', Entry('inproceedings',
    #         fields=[
    #           ('booktitle', 'IEEE/ACM International Conference on Computer-Aided Design'),
    #           ('series', "ICCAD'22"),
    #           ('title', '{An MLIR-based Compiler Flow for System-Level Design and Hardware Acceleration}'),
    #           ('year', '2022'),
    #           ('volume', ''),
    #           ('number', ''),
    #           ('pages', ''),
    #           ('publisher', 'IEEE'),
    #           ('address', 'San Diego, CA'),
    #           ('doi', '10.1145/3508352.3549424')],
    #         persons=OrderedCaseInsensitiveDict([('author', [
    #           Person('Bohm Agostini, Nicolas'), Person('Curzel, Serena'),
    #           Person('Amatya, Vinay'), Person('Tan, Cheng'),
    #           Person('Minutoli, Marco'), Person('Castellana, Vito Giovanni'),
    #           Person('Manzano, Joseph'), Person('Kaeli, David'),
    #           Person('Tumeo, Antonino')])])))]),
    #     preamble=[])
    # print(bib_data.entries) shows only the inner OrderedCaseInsensitiveDict.
    return bibtex.Parser().parse_file(bibfile)
# Remove LaTeX grouping braces and backslashes from a string, at any position.
# Characters removed: \, {, } (apostrophes are left untouched).
def remove_braces(s):
    """Return ``s`` with every ``{``, ``}`` and backslash deleted.

    All other characters, apostrophes included, are kept as they are.
    """
    # A translation table with only a "delete" set drops the three characters.
    return s.translate(str.maketrans('', '', '{}\\'))
def remove_parenthesis(s):
    """Return ``s`` with every ``(`` and ``)`` character deleted.

    Only the parenthesis characters go; the text between them is kept.
    """
    without_open = s.replace('(', '')
    return without_open.replace(')', '')
# Remove a leading number, and the single whitespace character that follows it, from a string.
def remove_leading_number(s):
    """Drop a leading run of digits plus the one whitespace character after it.

    The string is returned unchanged when it does not start with digits
    immediately followed by whitespace.
    """
    prefix = re.match(r'\d+\s', s)
    if prefix is None:
        return s
    return s[prefix.end():]
# function to remove trailing expression in parenthesis from a string | |
# input "something like this (with parenthesis)" | |
# output "something like this" | |
def remove_trailing_parenthesis(s):
    """Strip a parenthesised expression together with the whitespace before it.

    Example: "something like this (with parenthesis)" -> "something like this".

    The match starts at a whitespace character followed by "(" and runs
    greedily to the last ")" on the same line, so text between two
    parenthesised groups is removed as well.
    """
    parenthesised = re.compile(r'\s\(.+\)')
    return parenthesised.sub('', s)
def process_title(s):
    """Clean a venue/title string for display.

    Applies, in this order: ``remove_braces``, ``remove_leading_number`` and
    ``remove_trailing_parenthesis``.
    """
    cleanup_steps = (
        remove_braces,
        remove_leading_number,
        remove_trailing_parenthesis,
    )
    for step in cleanup_steps:
        s = step(s)
    return s
# retrieve expression in parenthesis | |
# input: 2021 {IEEE}/{ACM} International Conference On Computer Aided Design ({ICCAD}) | |
# input: a string (ICCAD) | |
# output: (ICCAD) | |
def get_expr_in_parenthesis(s):
    """Return the parenthesised part of ``s``, parentheses included.

    Example: "a string (ICCAD)" -> "(ICCAD)".

    The match runs from the first "(" to the last ")" on the same line.
    When there is no non-empty parenthesised expression, ``s`` itself is
    returned.
    """
    found = re.search(r'\(.+\)', s)
    return found.group() if found else s
# Format the journal/venue name: keep only the parenthesised acronym, without braces or parentheses.
# input: 2021 {IEEE}/{ACM} International Conference On Computer Aided Design ({ICCAD})
# output: ICCAD
def format_journal_name(journal):
    """Reduce a venue name to its parenthesised acronym.

    Example: "2021 {IEEE}/{ACM} International Conference On Computer Aided
    Design ({ICCAD})" -> "ICCAD".

    The parenthesised expression is extracted with
    ``get_expr_in_parenthesis`` (which yields the whole string when there is
    none), then braces/backslashes and parentheses are stripped.
    """
    acronym = get_expr_in_parenthesis(journal)
    return remove_parenthesis(remove_braces(acronym))
# function to transform a number into 1st, 2nd, 3rd, 4th, etc. | |
def ordinal(n):
    """Return ``n`` followed by its English ordinal suffix: 1st, 2nd, 3rd, 4th...

    Numbers whose last two digits are 10-19 always take "th" (11th, 112th).
    """
    is_teen = 10 <= n % 100 < 20
    if is_teen:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, "th")
    return str(n) + suffix
# Join the author's first, middle and last names and normalise the case: the first letter of every word is capitalized.
def full_name(author):
    """Return "First Middle Last" for ``author`` in title case.

    ``author`` must expose ``first_names``, ``middle_names`` and
    ``last_names`` lists of strings. ``str.title`` capitalises the first
    letter of every word and lowercases the rest.
    """
    name_parts = [
        *author.first_names,
        *author.middle_names,
        *author.last_names,
    ]
    joined = ' '.join(name_parts)
    return joined.title()
def last_first_name(author):
    """Return ``author`` as "Last, F.M." (last names, then initials) in title case.

    ``author`` must expose ``first_names``, ``middle_names`` and
    ``last_names`` lists of non-empty strings.
    """
    initials = [name[0] + '. ' for name in author.first_names]
    initials += [name[0] + '.' for name in author.middle_names]
    text = ' '.join(author.last_names + [','] + initials)

    # Order matters: collapse whitespace first, then glue the initials
    # together, then pull the comma back onto the last name.
    cleanups = (
        (r'\s+', ' '),
        (r'\.\s', '.'),
        (r'\s+,', ','),
    )
    for pattern, replacement in cleanups:
        text = re.sub(pattern, replacement, text)
    return text.title()
# function to get the position of an author in the list of authors | |
def get_author_position(author, authors):
    """Return the 0-based index of ``author`` within ``authors``, or -1.

    ``author`` is a name string; it is compared for exact equality with
    ``full_name`` of each element of ``authors``. The first match wins.
    """
    for index, candidate in enumerate(authors):
        if full_name(candidate) == author:
            return index
    return -1
# function to print entries of a specific type | |
# must print the following fields for @articles: title, author, year, journal | |
# must print the following fields for @inproceedings: title, author, year, booktitle | |
def print_entries_of_type(bib, entry_type,
                          highlight_author='Nicolas Bohm Agostini'):
    """Print one summary line per entry of ``entry_type``.

    Each line reads:
        <Nth> author of <count> in: <title>. <authors>, [et al. ]<year>. <venue>.

    Authors are listed up to and including ``highlight_author``; "et al." is
    added only when further authors follow. When ``highlight_author`` is not
    among the authors, the "<Nth> author of <count> in:" prefix is omitted
    and every author is listed.

    Args:
        bib: Parsed bibliography; ``bib.entries`` maps citation keys to
            entries exposing ``type``, ``fields`` and ``persons``.
        entry_type: 'article' (venue read from the 'journal' field) or
            'inproceedings' (venue read from 'booktitle'). Any other type
            prints no venue.
        highlight_author: Full name, as produced by ``full_name`` with
            braces removed, of the author whose position is reported.

    Raises:
        KeyError: If a matching entry lacks an 'author' person list or one
            of the fields read here (subscript access is used on purpose so
            malformed entries are not silently skipped).
    """
    venue_field = {'article': 'journal',
                   'inproceedings': 'booktitle'}.get(entry_type)

    for entry in bib.entries.values():
        if entry.type != entry_type:
            continue

        # Clean the names once so the position lookup and the printed list
        # agree on what counts as a match.
        names = [remove_braces(full_name(person))
                 for person in entry.persons['author']]
        try:
            position = names.index(highlight_author)
        except ValueError:
            position = None

        if position is not None:
            print(ordinal(position + 1), 'author of', len(names),
                  'in: ', end='')
        print(remove_braces(entry.fields['title']), end='. ')

        shown = names if position is None else names[:position + 1]
        for name in shown:
            print(name, end=', ')
        # "et al." only makes sense when authors were actually left out.
        if len(shown) < len(names):
            print('et al.', end=' ')

        print(entry.fields['year'], end='. ')
        if venue_field is not None:
            print(process_title(entry.fields[venue_field]), end='.')
        print()
# function to print entries of a specific type | |
# must print the following fields for @articles: title, author, year, journal | |
# must print the following fields for @inproceedings: title, author, year, booktitle | |
# this generates entries with the format: | |
# Paper in ICPE2021: Gutierrez, J., Shi, D., Agostini, N.B., and Kaeli, D., Performance Evaluation and Improvement of Computer Vision Applications on Heterogeneous Edge Computing Devices | |
def print_entries_of_type2(bib, entry_type):
    """Print one line per entry of ``entry_type`` in the alternative layout.

    Each line reads:
        <Article in|Paper in> <venue acronym><year>. <Last, F.M.>, ..., <title>.

    At most four authors are printed; if there are more, "et al.," follows
    the fourth. The venue comes from 'journal' for 'article' entries and
    from 'booktitle' for 'inproceedings' entries; any other entry type gets
    no label or venue.
    """
    max_authors = 4
    layouts = {
        'article': ('Article in', 'journal'),
        'inproceedings': ('Paper in', 'booktitle'),
    }

    for entry in bib.entries.values():
        if entry.type != entry_type:
            continue

        if entry_type in layouts:
            label, venue_field = layouts[entry_type]
            print(label, end=' ')
            # No separator on purpose: the year is glued to the acronym.
            print(format_journal_name(entry.fields[venue_field]), end='')
        print(entry.fields['year'], end='. ')

        people = entry.persons['author']
        for person in people[:max_authors]:
            print(remove_braces(last_first_name(person)), end=', ')
        if len(people) > max_authors:
            print('et al.', end=', ')

        print(remove_braces(entry.fields['title']), end='.')
        print()
# function to count the number of entries of a given type | |
def count_entries(bib, entry_type):
    """Return how many entries in ``bib.entries`` have ``type == entry_type``."""
    return sum(
        1 for entry in bib.entries.values() if entry.type == entry_type
    )
# main function | |
def main():
    """Command-line entry point: summarise the articles and papers of a bib file.

    Usage: parse-bib-file.py [--alt-format] [bibfile]

    ``bibfile`` defaults to "works.bib". By default entries are printed with
    ``print_entries_of_type``; ``--alt-format`` switches to
    ``print_entries_of_type2``. A path that is not an existing file ends the
    program through ``ArgumentParser.error``.
    """
    parser = argparse.ArgumentParser(
        description='Print the @article and @inproceedings entries of a '
                    '.bib file in a human readable format.')
    parser.add_argument('bibfile', nargs='?', default='works.bib',
                        help='bib file to parse (default: works.bib)')
    parser.add_argument('--alt-format', action='store_true',
                        help='use the alternative "Paper in VENUEYEAR. '
                             'Last, F.M., ..., Title." layout')
    args = parser.parse_args()
    bibfile = args.bibfile

    # print full path to file
    bibfile_path = os.path.abspath(bibfile)
    if not os.path.isfile(bibfile_path):
        parser.error('bib file not found: {}'.format(bibfile_path))
    print('Processing file:', bibfile_path, '\n')

    bib = load_bib(bibfile)
    print_entries = (print_entries_of_type2 if args.alt_format
                     else print_entries_of_type)

    # print count of journal entries, then the entries themselves
    print('Journals: ', count_entries(bib, 'article'))
    print_entries(bib, 'article')
    print()
    print('Proceedings: ', count_entries(bib, 'inproceedings'))
    print_entries(bib, 'inproceedings')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment