Created
April 9, 2010 21:08
-
-
Save mccutchen/361586 to your computer and use it in GitHub Desktop.
A rough first translation of Jesse's BNF grammar into pyparsing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from string import lowercase, uppercase

from pyparsing import (Word, Literal, ZeroOrMore, OneOrMore, Regex, Or,
                       CharsNotIn, Optional, CaselessLiteral, Combine,
                       Suppress, srange, alphas, alphanums, nums)
def Literals(s, splitchar=None):
    """Build an ``Or`` of ``Literal`` alternatives from a delimited string.

    ``s`` is split on ``splitchar`` (``None``, the default, splits on runs
    of whitespace). Each non-empty piece becomes a ``Literal`` and all of
    them are combined into a single ``Or`` expression, which is returned.

    Empty pieces -- produced by a leading, trailing or doubled explicit
    delimiter, e.g. ``"a/b/"`` split on ``"/"`` -- are dropped: an empty
    literal would add an alternative that matches without consuming input.
    """
    # Build a concrete list rather than passing a ``map`` result, so the
    # outcome does not depend on how ``Or`` handles lazy iterators.
    return Or([Literal(item) for item in s.split(splitchar) if item])
# ---------------------------------------------------------------------------
# Vocabulary: single tokens and fixed word lists used by the rules below.
# ---------------------------------------------------------------------------

# Plural suffix; 'es' is tried before 's'.
ess = Literal('es') | Literal('s')
alphanum = Word(alphanums + "',/.-")
alpha = Word(alphas + "'-")
num = Word(nums)
numword = Literals('one two three four five six seven eight nine ten')
fractional = Literals('half third fourth fifth sixth')
dot = Literal('.')
range_ = Literals('to -')
comma = Literals(', -')
connector = Literal('of')
paren = Literals('( )')

# Container names, optionally pluralised; Combine joins name and suffix
# into a single token (e.g. 'jar' + 's').
containers = Combine(Literals('package pk bag can container cn envelope jar')
                     + Optional(ess))
metric = Literals('kg gram kilogram ml liter cc')
# Split on '/' because some entries contain spaces.  The string must not
# end with the delimiter: a trailing '/' splits into an extra empty entry.
colloq = Literals("dozen/half-dozen/half dozen/doz/ea/each/x/"
                  "baker's dozen/bakers dozen", "/")
# Split on whitespace, so 'med/small' is one entry.
relative = Literals('medium-sized medium med/small med large small s lg')
genus = Literals('bunch bn head clove ear batch slice stick leg zest rind '
                 'sprig')
imperial = Literals('feet cup c quart qt pint teaspoon tspn tsp tablespoon '
                    'tbsp tb ts t pound lb ounce oz gallon')
mod_amount = Literals('scant generous several about')
qualitative = Literals('dash ds pinch pn splash shake')
# ---------------------------------------------------------------------------
# Amounts: numerals, fractions, spelled-out numbers and ranges of them.
# ---------------------------------------------------------------------------

# Spelled-out fraction: a number word followed by a (possibly plural)
# fractional word, e.g. 'two thirds'.
fraction_word = numword + Combine(fractional + Optional(ess))
# Digits '/' digits, returned as one token; adjacent=False lets whitespace
# appear between the parts.
fraction = Combine(num + '/' + num, adjacent=False)
numeral = num + Optional(dot + num)
# Numeral followed by a fraction, joined with a single space.
# NOTE(review): the original grammar quoted at the bottom of this file
# also allows fraction_word after the numeral -- confirm whether that
# alternative was dropped on purpose.
mixed = Combine(numeral + fraction, adjacent=False, joinString=' ')
word_mixed = numword + (fraction ^ fraction_word)
sub_amount = mixed ^ fraction ^ numeral ^ fraction_word ^ numword
# The parentheses matter: without them the method call binds tighter than
# '+', so the results name would label only the optional range tail and
# would be missing whenever no range is present.
amount = (sub_amount + Optional(range_ + sub_amount)).setResultsName('amount')
# ---------------------------------------------------------------------------
# Measurements: containers and units, with optional modifier and suffixes.
# ---------------------------------------------------------------------------

# Five alternatives combined with '^' (Or).
# NOTE(review): the original grammar's c1 rule (quoted at the bottom of
# this file) may have been meant as one sequence rather than a choice
# between these pieces -- confirm against the original parser.
c1 = ((Optional(paren) + amount) ^
      (comma + relative) ^
      metric ^
      (imperial + Optional(dot)) ^
      (comma + containers + Optional(paren)))
container = c1 ^ containers
# Optional modifier, one unit-like alternative, then an optional '.' and
# plural suffix.  Combine returns the pieces as a single token, and
# adjacent=False allows whitespace between them in the input.
measurement = Combine(
    Optional(mod_amount) +
    (container ^ relative ^ metric ^ imperial ^ colloq ^ genus ^ qualitative) +
    Optional(dot) + Optional(ess), adjacent=False).setResultsName('measurement')
# ---------------------------------------------------------------------------
# Free text, comments and the top-level ingredient rule.
# ---------------------------------------------------------------------------

# One or more word tokens.
words = OneOrMore(alpha)
words_plus = OneOrMore(alphanum)
paren_comment = "(" + words_plus + ")"
comment = paren_comment ^ words_plus
thing = (Optional(words) + paren_comment + words) ^ words
# Everything up to the end of the line, exposed as 'anything'.
anything = Regex('.*').setResultsName('anything')
list_ = Optional(Literal('*')) ^ Literal('the')
fork = Literals("plus or and")
suffix = CharsNotIn('\r\n')
about = CaselessLiteral('about')

# Three shapes of ingredient line.  The numbering differs from the original
# grammar quoted at the bottom of this file, where type1 is the
# measurement-only form; all three are tried via '^', so the numbering
# does not affect which one is chosen.
# A leading 'about' is matched but suppressed from the output.
type1 = Suppress(Optional(about)) + amount + measurement + anything
type2 = measurement + anything
type3 = Suppress(Optional(about)) + amount + anything
ingredient = Optional(list_) + (type1 ^ type2 ^ type3)

# Quick smoke test, run whenever the module is loaded.  The parenthesised
# form prints the same thing on Python 2 and is also valid on Python 3.
print(ingredient.parseString('three jars of dozen eggs'))
##############################################################################
# Original grammar by Jesse, from keycore/recipes/utils/parser.py
##############################################################################
"""
<nl> := [\n\r]+
<ts> := [ \t]+
ess := ('es'/'s')
alphanum := [A-Za-z',/0-9.-]+
alpha := [A-Za-z'-]+
numword := ('one'/'two'/'three'/'four'/'five'/'six'/'seven'/'eight'/'nine'/'ten')
dot := "."
range := ('to'/'-')
<comma> := [,-]+
connector := 'of'
paren := ('('/')')
containers := ('package'/'pk'/'bag'/'can'/'container'/'cn'/'envelope'/'jar'),ess?
metric := ('kg'/'gram'/'kilogram'/'ml'/'liter'/'cc')
colloq := ('dozen'/'half-dozen'/'half dozen'/'bakers dozen'/'doz'/'ea'/'each'/'x')
relative := ('medium-sized'/'medium'/'med/small'/'med'/'large'/'small'/'s'/'lg')
genus := ('bunch'/'bn'/'head'/'clove'/'ear'/'batch'/'slice'/'stick'/'leg'/'zest'/'rind'/'sprig')
imperial := ('feet'/'cup'/'c'/'quart'/'qt'/'pint'/'teaspoon'/'tspn'/'tsp'/'tablespoon'/'tbsp'/'tb'/'ts'/'t'/'pound'/'lb'/'ounce'/'oz'/'gallon' )
mod_amount := ('scant'/'generous'/'several'/'about')
qualitative := ('dash'/'ds'/'pinch'/'pn'/'splash'/'shake')
fraction_word := numword,ts,('half'/'third'/'forth'/'fifth'/'sixth'),ess?
fraction := [0-9]+,"/",[0-9]+
numeral := [0-9]+,(dot,[0-9]+)?
mixed := numeral,ts,(fraction/fraction_word)
word_mixed := numword,ts,(fraction/fraction_word)
sub_amount := (mixed/fraction/numeral/fraction_word/numword)
amount := (sub_amount,ts?,range,ts?,sub_amount)/sub_amount
c1 := (paren?,amount,ts/comma,relative/metric/imperial,dot?,ts/comma,containers,paren?)
#c2 := (containers,ts?,paren,amount,comma/ts,relative/metric/imperial,dot?,paren)
container := (c1/containers)
measurement := (mod_amount, ts)?,(container/relative/metric/imperial/colloq/genus/qualitative),dot?,ess?
words := alpha,(ts,alpha,?-('in'/'or'))*
words_plus := alphanum,(ts,alphanum)*
paren_comment := "(",ts?,words_plus,ts?,")"
comment := paren_comment/words_plus
thing := (words?,paren_comment,words)/words
any := [.]*
<list> := ([*]/'the')
fork := ('plus'/'or'/'and')
suffix := [^\n\r]+
ingredient := list?,ts?,(type2/type1/type3)
type1 := measurement,ts,any
type2 := ('about',ts)?,amount,ts?,measurement,ts,any
type3 := ('about',ts)?,amount,ts,any
"""
if __name__ == '__main__':
    # Script entry point.  Only the scaffolding exists so far: doctest is
    # imported and a table of cases is defined, but nothing in this block
    # runs them.
    import doctest
    # Presumably (input line, expected token list) pairs -- confirm once a
    # runner is added.
    tests = [
        ('1 onion', ['1', 'onion']),
    ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment