Skip to content

Instantly share code, notes, and snippets.

@erantapaa
Last active October 14, 2021 20:44
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save erantapaa/5a2614adde0526d25c03 to your computer and use it in GitHub Desktop.
Save erantapaa/5a2614adde0526d25c03 to your computer and use it in GitHub Desktop.
BibTeX file parsing Python
#
# Simple BibTeX file parsing in Python.
#
# See `bibtest1` for an example of usage.
#
# This is a good overview of how to correctly parse a BibTeX file:
#
# http://maverick.inria.fr/~Xavier.Decoret/resources/xdkbibtex/bibtex_summary.html
import string
# Characters allowed in an identifier: ASCII letters (lowercase first, then
# uppercase) followed by the decimal digits.
wordLetters = string.ascii_letters + string.digits
class ParseError(Exception):
    """Error raised when the input does not match what the parser expects.

    The human-readable description is kept on the ``msg`` attribute and is
    prefixed with ``"parse error: "`` when the exception is converted to a
    string.
    """

    def __init__(self, msg):
        self.msg = msg

    def __str__(self):
        prefix = "parse error: "
        return prefix + self.msg
class Tokenizer:
    """Cursor-based scanner over an in-memory source string.

    Attributes:
        buf: the complete text being scanned.
        i:   index into ``buf`` of the next unread character.
        len: cached ``len(buf)``.
    """

    def __init__(self, buf):
        self.buf = buf
        self.i = 0
        self.len = len(buf)

    def peek(self):
        """Return the character under the cursor without consuming it.

        Raises:
            ParseError: if the cursor is at the end of the buffer.
        """
        if self.i >= self.len:
            raise ParseError("unexpected end of input")
        return self.buf[self.i]

    def skipwhite(self):
        """Skip white space"""
        i = self.i
        while i < self.len and self.buf[i] in string.whitespace:
            i += 1
        self.i = i

    def try_match_string(self, s):
        """Return True if the text at the cursor starts with ``s``.

        The cursor is not moved.
        """
        return self.buf[self.i:self.i+len(s)] == s

    def try_match_letter(self):
        """Return True if the character at the cursor is in ``wordLetters``.

        Returns False (rather than failing) at the end of the buffer.  The
        cursor is not moved.
        """
        return self.i < self.len and self.buf[self.i] in wordLetters

    def match_string(self, s):
        """Match a literal string and skip following white space

        Returns True on success.

        Raises:
            ParseError: if the text at the cursor does not start with ``s``.
        """
        if self.buf[self.i:self.i+len(s)] == s:
            self.i += len(s)
            self.skipwhite()
            return True
        else:
            raise ParseError("expecting " + s)

    def match_word(self):
        """Match an identifier and skip following white space

        An identifier is a non-empty run of ``wordLetters`` characters; the
        matched text is returned.

        Raises:
            ParseError: if no identifier character is at the cursor.
        """
        j = self.i
        while j < self.len and self.buf[j] in wordLetters:
            j += 1
        s = self.buf[self.i:j]
        if len(s) > 0:
            self.i = j
            self.skipwhite()
            return s
        else:
            raise ParseError("expecting identifier")

    def skiptoat(self):
        """Skip to an @ followed by a letter

        Returns True with the cursor left on the ``@`` when one is found.
        Returns None, leaving the cursor where it was, when there is none.
        An ``@`` is only accepted when at least two more characters follow it.
        """
        j = self.buf.find('@', self.i)
        # The bound mirrors the original scan: an '@' in the last two
        # positions of the buffer is never treated as an entry start.
        while j != -1 and j < self.len-2:
            if self.buf[j+1] in wordLetters:
                self.i = j
                return True
            j = self.buf.find('@', j+1)
        return None

    def skipToEOL(self):
        """Advance the cursor to the next newline, or to the end of the buffer.

        The newline itself is not consumed.
        """
        j = self.i
        # `!=` replaces the Python 2-only `<>` operator.
        while j < self.len and self.buf[j] != '\n':
            j += 1
        self.i = j

    def scanString(self):
        """Scan a double-quoted or brace-delimited string at the cursor.

        Returns the text between the delimiters and moves the cursor just
        past the closing delimiter.  Following white space is NOT skipped.

        Raises:
            ParseError: if the cursor is not on ``"`` or ``{`` (including at
                the end of the buffer), or if the string is unterminated.
        """
        j = self.i
        # Slicing yields '' at end of input instead of raising IndexError.
        ch = self.buf[j:j+1]
        if ch == '"':
            s, k = self.scanQuotedString(j+1)
        elif ch == '{':
            s, k = self.scanBraceString(j+1)
        else:
            raise ParseError("not at a string")
        self.i = k
        return s

    def scanQuotedString(self, j):
        """Scan the body of a double-quoted string starting at index ``j``.

        ``j`` is the index just after the opening quote.  A brace group
        inside the string is copied verbatim, braces included, so a double
        quote inside braces does not end the string.

        Returns a tuple ``(text, index)`` where ``index`` is the position of
        the character after the ending double-quote.  The cursor is not moved.

        Raises:
            ParseError: if the closing quote or a closing brace is missing.
        """
        parts = []
        while j < self.len:
            ch = self.buf[j]
            if ch == '"':
                return "".join(parts), j+1
            elif ch == '{':
                # scanBraceString already returns the index past the '}',
                # so j must not be incremented again on this branch.
                t, j = self.scanBraceString(j+1)
                parts.append('{' + t + '}')
            else:
                parts.append(ch)
                j += 1
        raise ParseError("unterminated double quote string")

    def scanBraceString(self, j):
        """Scan the body of a brace-delimited string starting at index ``j``.

        ``j`` is the index just after the opening brace.  Nested braces are
        balanced and kept in the returned text.

        Returns a tuple ``(text, index)`` where ``index`` is the position of
        the character following the ending brace.  The cursor is not moved.

        Raises:
            ParseError: if the matching closing brace is never found.
        """
        lvl = 1
        k = j
        while k < self.len:
            ch = self.buf[k]
            if ch == '}':
                lvl -= 1
                if lvl <= 0:
                    return self.buf[j:k], k+1
            elif ch == '{':
                lvl += 1
            k += 1
        raise ParseError("unterminated brace string")
def test1():
    """Tokenize a single blank-padded word and return ``(True, word)``."""
    tok = Tokenizer(" X ")
    tok.skipwhite()
    word = tok.match_word()
    assert word == "X"
    return (True, word)
def test2():
    """Tokenize two consecutive words and return ``(True, first, second)``."""
    tok = Tokenizer(" X yzzy ")
    tok.skipwhite()
    # match_word skips trailing blanks, so the calls can follow each other.
    first = tok.match_word()
    second = tok.match_word()
    assert first == "X" and second == "yzzy"
    return (True, first, second)
def test3():
    """Scan a quoted string that hides a double quote inside a brace group.

    Returns ``(True, string, word)``, where ``word`` is the identifier that
    immediately follows the closing quote.
    """
    tok = Tokenizer('" { " } "xyz')
    quoted = tok.scanString()
    trailing = tok.match_word()
    assert (quoted == ' { " } ' and trailing == "xyz"), (quoted, trailing)
    return (True, quoted, trailing)
# BibTeX parsing routines
def parse_entries(t):
    """Parse every entry found in the tokenizer's buffer.

    Text between entries is skipped by searching for the next ``@`` that is
    followed by a letter.  An ``@comment`` discards the rest of its line.

    Args:
        t: a Tokenizer positioned anywhere before the first entry.

    Returns:
        A list of ``(entry_type, ident, pairs)`` tuples, where ``pairs`` is
        the list produced by ``parse_kv_pairs``.

    Raises:
        ParseError: if an entry type is not followed by ``{`` or ``(``, or if
            the identifier or the comma after it is missing.
    """
    entries = []
    while t.skiptoat():
        t.match_string('@')
        w = t.match_word()
        if w.lower() == 'comment':
            t.skipToEOL()
            continue
        ch = t.peek()
        if ch not in ("{", "("):
            # The original built this exception but never raised it, so a
            # malformed header was accepted and one stray character consumed.
            raise ParseError("expecting either { or (")
        t.match_string(ch)  # always succeeds
        ident = t.match_word()
        t.match_string(',')
        pairs = parse_kv_pairs(t)
        entries.append((w, ident, pairs))
        # no need to check ending ) or } - skiptoat() will skip over it
    # reached EOF
    return entries
def parse_kv(t):
    """Parse one ``key = value`` field.

    The value is a sequence of pieces optionally joined with ``#``.  Each
    piece is either a quoted/braced string or a bare identifier (which, given
    ``wordLetters``, also covers plain numbers).

    Args:
        t: a Tokenizer positioned on the first character of the key.

    Returns:
        A tuple ``(key, vals)`` where ``vals`` is a list of
        ``("string", text)`` and ``("ident", word)`` tuples in source order.

    Raises:
        ParseError: if the key or the ``=`` is missing, or a string piece is
            unterminated.
    """
    key = t.match_word()
    t.match_string('=')
    vals = []
    while True:
        ch = t.peek()
        if ch == '"' or ch == '{':
            v = t.scanString()
            vals.append(("string", v))
            # scanString leaves the cursor right after the closing
            # delimiter.  Without this, white space before a following '#'
            # or ',' would end the value and the rest of the entry would be
            # silently dropped.
            t.skipwhite()
        elif ch == '#':
            t.match_string('#')
            continue
        elif ch in wordLetters:
            w = t.match_word()
            vals.append(("ident", w))
        else:
            break
    return (key, vals)
def parse_kv_pairs(t):
    """Parse a comma-separated run of ``key = value`` fields.

    A field is parsed whenever the cursor is on an identifier character.
    After that a comma keeps the loop going, and anything else ends the list.

    Returns the list of ``(key, vals)`` tuples produced by ``parse_kv``.
    """
    fields = []
    while True:
        if t.peek() in wordLetters:
            fields.append(parse_kv(t))
        if not t.try_match_string(","):
            return fields
        t.match_string(",")
# Sample bibliography used by bibtest1().  It contains a brace-delimited
# entry, an @comment line, a parenthesis-delimited entry, a bare numeric
# value, a '#' concatenation and '%' lines outside any entry.
bib1 = """
% a sample bibliography file
%
@article{small,
author = {Freely, I.P.},
title = {A small paper},
journal = {The journal of small papers},
year = 1997,
volume = {-1},
note = {to appear},
}
@comment this entire line is a comment @foo {
@article(big,
author = {Jass, Hugh},
title = {A big paper},
journal = {The journal of big papers},
year = 7991 # foo,
volume = {MCMXCVII},
)
% The authors mentioned here are almost, but not quite,
% entirely unrelated to Matt Groening.
"""
import pprint
def bibtest1():
    """Parse the sample bibliography ``bib1`` and pretty-print the result."""
    entries = parse_entries(Tokenizer(bib1))
    printer = pprint.PrettyPrinter(indent=4)
    printer.pprint(entries)
@pk1811
Copy link

pk1811 commented Jan 27, 2017

where is the output?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment