Skip to content

Instantly share code, notes, and snippets.

@uchida
Created January 31, 2016 00:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save uchida/90a0fae7a70f815f0b58 to your computer and use it in GitHub Desktop.
Save uchida/90a0fae7a70f815f0b58 to your computer and use it in GitHub Desktop.
A simple BibTeX parser based on a finite state machine
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# CC0 dedicated to public domain by Akihiro Uchida
import sys
class BibParser(object):
    """Incremental BibTeX parser implemented as a finite state machine.

    Each ``parse_*`` method is one state: it consumes the character at
    ``text[i]`` and returns ``(next_state, next_index)``.  Because the
    current state is kept on the instance, ``feed`` may be called several
    times with consecutive chunks of the same input.

    Parsed entries are plain dicts holding ``'type'``, ``'key'`` and one
    item per field (field names are lower-cased).  The ``author`` field is
    converted with the module-level ``author_list`` function.
    """

    # Characters skipped between tokens outside of quoted/braced values.
    _WHITESPACE = (' ', '\t', '\n', '\r')

    def __init__(self):
        self.parse = self.parse_main  # current state
        self.buffer = ''              # characters collected for the current token
        self.key = ''                 # name of the field whose value is being read
        self.item = dict()            # entry currently being built
        self.items = []               # completed entries
        self.brace_count = 0          # nesting depth inside a braced value

    def feed(self, text):
        """Run the state machine over every character of ``text``."""
        i = 0
        while i < len(text):
            (self.parse, i) = self.parse(text, i)

    def parse_main(self, text, i):
        """Skip everything outside an entry until an ``@`` starts one."""
        if text[i] == '@':
            self.item = dict()
            return (self.parse_bibtype, i + 1)
        return (self.parse_main, i + 1)

    def parse_bibtype(self, text, i):
        """Collect the entry type up to the opening ``{``."""
        c = text[i]
        if c == '{':
            self.item['type'] = self.buffer
            self.buffer = ''
            return (self.parse_key, i + 1)
        if c not in self._WHITESPACE:
            self.buffer += c
        return (self.parse_bibtype, i + 1)

    def parse_key(self, text, i):
        """Collect the citation key up to the first ``,``."""
        c = text[i]
        if c == ',':
            self.item['key'] = self.buffer
            self.buffer = ''
            return (self.parse_lhs, i + 1)
        self.buffer += c
        return (self.parse_key, i + 1)

    def parse_lhs(self, text, i):
        """Collect a field name up to ``=`` (stored lower-cased)."""
        c = text[i]
        if c == '=':
            self.key = self.buffer.lower()
            self.buffer = ''
            return (self.parse_rhs, i + 1)
        if c == '}' and not self.buffer:
            # Closing brace right after a trailing comma ("..., }"): the
            # entry is complete.  Without this the brace and the header of
            # the next entry would be absorbed into the next field name.
            self.finish_item()
            return (self.parse_main, i + 1)
        if c not in self._WHITESPACE:
            self.buffer += c
        return (self.parse_lhs, i + 1)

    def parse_rhs(self, text, i):
        """Read a field value: bare text, ``{...}`` or ``"..."``."""
        c = text[i]
        if c == '}':
            # End of the whole entry.
            self._store_field()
            self.finish_item()
            return (self.parse_main, i + 1)
        if c == ',':
            # End of this field; another field name follows.
            self._store_field()
            self.buffer = ''
            return (self.parse_lhs, i + 1)
        if c == '{':
            return (self.parse_in_brace, i + 1)
        if c == '"':
            return (self.parse_in_dquote, i + 1)
        if c not in self._WHITESPACE:
            self.buffer += c
        return (self.parse_rhs, i + 1)

    def _store_field(self):
        """Save the buffered value under the current field name."""
        value = self.buffer.replace('\t', ' ')
        self.buffer = value
        if self.key == 'author':
            self.item['author'] = author_list(value)
        else:
            self.item[self.key] = value

    def parse_in_brace(self, text, i):
        """Read a ``{...}`` value.

        Nested brace characters are tracked for balance but are not copied
        into the value; only their content is kept.
        """
        c = text[i]
        if self.brace_count == 0 and c == '}':
            return (self.parse_rhs, i + 1)
        if c == '{':
            self.brace_count += 1
        elif c == '}':
            self.brace_count -= 1
        else:
            self.buffer += c
        return (self.parse_in_brace, i + 1)

    def parse_in_dquote(self, text, i):
        """Read a ``"..."`` value up to the closing double quote."""
        c = text[i]
        if c == '"':
            return (self.parse_rhs, i + 1)
        self.buffer += c
        if c == '\\':
            return (self.parse_bs, i + 1)
        return (self.parse_in_dquote, i + 1)

    def parse_bs(self, text, i):
        """Copy the character following a backslash verbatim.

        This keeps a backslash-escaped double quote from terminating the
        quoted value; parsing then continues inside the quotes.
        """
        self.buffer += text[i]
        return (self.parse_in_dquote, i + 1)

    def finish_item(self):
        """Append the current entry to ``items`` and reset per-entry state."""
        self.items.append(self.item)
        # Start from a fresh dict so finish() cannot append this entry again.
        self.item = dict()
        self.key = ''
        self.buffer = ''

    def finish(self):
        """Return the list of parsed entries.

        An entry that was still open when the input ended is flushed with
        whatever fields were completed; already completed entries are not
        appended a second time.
        """
        if self.item:
            self.finish_item()
            self.parse = self.parse_main
            self.brace_count = 0
        return self.items
def author_list(author_text):
    """Split a BibTeX author string into a list of display names.

    Names are separated by the word ``and``.  A name written as
    ``Last, First`` is reordered to ``First Last``; other names are kept
    as written.  Runs of whitespace (including line breaks) are collapsed
    to single spaces first, so an ``and`` at the end of a line still
    separates names.  Empty input yields an empty list.
    """
    authors = []
    normalized = ' '.join(author_text.split())
    for author in normalized.split(' and '):
        if not author:
            continue
        if ',' in author:
            last, _, first = author.partition(',')
            # Strip both halves: the text after the comma normally starts
            # with a space, which must not lead the reordered name.
            author = ('%s %s' % (first.strip(), last.strip())).strip()
        authors.append(author)
    return authors
def dump_rst(dic):
    """Print one bibliography entry as a reStructuredText bullet item.

    ``dic`` maps field names to strings, except ``'author'`` which is a
    list of names.  ``'type'``, ``'author'`` and ``'title'`` are always
    read.  Depending on the entry type (compared case-insensitively) these
    are also required: ``journal`` and ``year`` for ``article``;
    ``booktitle`` and ``year`` for ``inproceedings``; ``booktitle``,
    ``publisher`` and ``year`` for ``incollection``.  Other types print
    only the author/title line and the links.

    Raises KeyError when a required field is missing.
    """
    separator = 2 * '\n' + 2 * ' '
    # Author names are spliced into the %-template itself, so a literal
    # '%' in a name has to be doubled to survive the final formatting.
    authors = ', '.join(dic['author']).replace('%', '%%')
    # The ', ' matters: reST only recognises '**' as the start of bold text
    # when it follows whitespace or punctuation, not a letter.
    fmt_str = '* ' + authors + ', ' + '**%(title)s**'
    fmt_str += separator

    entry_type = dic['type'].lower()
    if entry_type == 'article':
        reference_info = ['*%(journal)s*']
        if 'volume' in dic:
            reference_info.append('**%(volume)s**')
        if 'number' in dic:
            reference_info.append('(%(number)s)')
        if 'pages' in dic:
            reference_info.append('pages. %(pages)s')
        if 'annote' in dic:
            reference_info.append('(%(annote)s)')
        fmt_str += ' '.join(reference_info) + ', ' + '*%(year)s*'
    elif entry_type == 'inproceedings':
        reference_info = []
        if 'editor' in dic:
            reference_info.append('%(editor)s editors')
        reference_info.append('*%(booktitle)s*')
        if 'pages' in dic:
            # No trailing ', ' here: the join below supplies the comma.
            reference_info.append('pages. %(pages)s')
        reference_info.append('%(year)s')
        if 'annote' in dic:
            reference_info.append('(%(annote)s)')
        fmt_str += 'In ' + ', '.join(reference_info)
    elif entry_type in ('incollection', 'incollections'):
        # 'incollection' is the standard BibTeX type; the plural spelling
        # is still accepted for backward compatibility.
        reference_info = []
        if 'editor' in dic:
            reference_info.append('Editors %(editor)s')
        reference_info.append('*%(booktitle)s*')
        if 'pages' in dic:
            reference_info.append('pages. %(pages)s')
        reference_info.append('%(publisher)s, %(year)s')
        if 'annote' in dic:
            reference_info.append('(%(annote)s)')
        fmt_str += 'In ' + ', '.join(reference_info)
    fmt_str += separator

    url_info = []
    if 'url' in dic:
        url_info.append('`URL <%(url)s>`_')
    if 'doi' in dic:
        url_info.append('`DOI: %(doi)s <http://dx.doi.org/%(doi)s>`_')
    if 'arxiv' in dic:
        url_info.append('`arXiv:%(arxiv)s <http://arxiv.org/abs/%(arxiv)s>`_')
    fmt_str += ', '.join(url_info)
    fmt_str += '\n'
    # Parenthesised form prints the same single string on Python 2 and 3.
    print(fmt_str % dic)
    return
if __name__ == '__main__':
    # Command-line use: print every entry of the given BibTeX file as
    # reStructuredText, most recent year first.
    if len(sys.argv) < 2:
        sys.exit('usage: %s FILE.bib' % sys.argv[0])
    # The context manager closes the file even if reading or parsing fails.
    with open(sys.argv[1], 'r') as f:
        parser = BibParser()
        parser.feed(f.read())
    items = parser.finish()
    # Entries without a year sort as '' (i.e. last) instead of raising KeyError.
    items.sort(key=lambda d: d.get('year', ''), reverse=True)
    for item in items:
        dump_rst(item)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment