Skip to content

Instantly share code, notes, and snippets.

@hghwng
Created June 9, 2015 04:49
Show Gist options
  • Save hghwng/dd42653e04028861ebb7 to your computer and use it in GitHub Desktop.
Save hghwng/dd42653e04028861ebb7 to your computer and use it in GitHub Desktop.
FSA for Speech and Language Processing 2, 2-3
#!/bin/env python3
g_dbg = False
g_rule = {}
def print_state(state, text):
print('\t[', end='')
for i in state['pre']:
print('{} -> '.format(i), end='')
print('{}] '.format(state['tag']), end='')
for i in range(state['index'], len(text)):
print(text[i] + ' ', end='')
print('')
def search(text, rules):
global g_dbg
text = text.split(' ')
states = []
if g_dbg:
states.append({'index': 0, 'tag': 'start', 'pre': []})
else:
states.append({'index': 0, 'tag': 'start'})
while len(states) > 0:
state = states.pop()
# test whether the destination state is found
if state['tag'] == 'end':
if state['index'] == len(text):
# only succeeds on full match
return True
else:
continue
if state['index'] < 0 and state['index'] >= len(text):
continue
# output debug message on demand
if g_dbg:
print_state(state, text)
# get transitions of current state
if state['tag'] not in rules:
continue
else:
rule = rules[state['tag']]
to_match = text[state['index']]
for transition in rule:
new_state = {'index': -1, 'tag': transition[1]}
if g_dbg:
new_state['pre'] = list(state['pre'])
new_state['pre'].append(state['tag'])
for candedate in transition[0]:
if candedate == '':
# epsilon transition
new_state['index'] = state['index']
elif candedate == to_match:
new_state['index'] = state['index'] + 1
if new_state['index'] != -1 and new_state not in states:
states.insert(0, new_state)
return False
def build_rule():
one_plus = ['one', 'two', 'three', 'four', 'five',
'six', 'seven', 'eight', 'nine']
two_plus = ['two', 'three', 'four', 'five',
'six', 'seven', 'eight', 'nine']
ten_plus = ['eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
'sixteen', 'seventeen', 'eighteen', 'nineteen']
xx_ty = ['ten', 'twenty', 'thirty', 'forty', 'fifty',
'sixty', 'seventy', 'eighty', 'ninety']
no_suffix = []
no_suffix.extend(one_plus)
no_suffix.extend(ten_plus)
has_suffix = xx_ty
'''
abbr:
c : cent
d : dollar
sg : singular
sg? : singular or plural
pl : plural
hs : has suffix
ns : no suffix
stg : stage
number less than 100:
$no_suffix | ($has_suffix? $one_plus)
'''
rules = {}
rules['start'] = [
[[''], 'd stg0'], # one thousand ...
[[''], 'd stg1'], # five hundred ...
[[''], 'd stg2 sg?'], # fifty one / one
[[''], 'c'], # fifty one / one
]
#
# stage 0: number for thousand
#
rules['d stg0'] = [
[no_suffix, 'd stg0 ns'], # eleven thousand ...
[has_suffix, 'd stg0 hs'], # fifty thousand ...
]
rules['d stg0 hs'] = [
[[''], 'd stg0 ns'], # fifty thousand ...
[one_plus, 'd stg0 ns'], # fifty one thousand ...
]
rules['d stg0 ns'] = [
[['thousand'], 'd stg2 pl'], # one thousand eighty ...
[['thousand'], 'd stg1'], # one thousand five hundred ...
]
#
# stage 1: number for hundred
#
rules['d stg1'] = [
[one_plus, 'd stg1 ns'], # one hundred ...
]
rules['d stg1 ns'] = [
[['hundred'], 'd stg2 pl'], # one hundred ...
]
#
# stage 2 (plural): number less than 100 (for confirmed plural)
#
rules['d stg2 pl'] = [
[[''], 'd pl'], # one hundred
[no_suffix, 'd pl'], # one hundred eleven
[has_suffix, 'd stg2 hs'], # one hundred fifty ...
]
rules['d stg2 hs'] = [
[[''], 'd pl'], # one hundred fifty
[one_plus, 'd pl'], # one hundred fifty one
]
#
# stage 2 (plural or singural): number less than 100 (detect form)
#
rules['d stg2 sg?'] = [
[['one'], 'd sg'], # one
[two_plus, 'd pl'], # five
[ten_plus, 'd pl'], # eleven
[has_suffix, 'd stg2 hs'], # fifty ...
]
#
# accept 'doller'
#
rules['d pl'] = [
[['dollars'], 'end'],
[['dollars'], 'c'],
]
rules['d sg'] = [
[['dollar'], 'end'],
[['dollar'], 'c']
]
#
# recognize conts
#
rules['c'] = [
[[''], 'end'], # probably no cent included
[['one'], 'c sg'],
[two_plus, 'c pl'],
[ten_plus, 'c pl'],
[has_suffix, 'c hs'],
]
rules['c hs'] = [
[[''], 'c pl'],
[one_plus, 'c pl'],
]
rules['c pl'] = [
[['cents'], 'end'],
]
rules['c sg'] = [
[['cent'], 'end'],
]
return rules
def test_one(answer, text):
global g_rule
print('Testing "{}": '.format(text), end='')
if search(text, g_rule) == answer:
print('pass')
else:
print('fail. {} expected'.format(answer))
def test_all():
# 11, 12, ...
test_one(True, 'eleven dollars')
test_one(True, 'nineteen dollars')
# 20, 30, ...
test_one(True, 'fifty dollars')
test_one(True, 'seventy dollars')
test_one(False, 'fifty nineteen dollars')
test_one(False, 'nineteen fifty dollars')
test_one(True, 'fifty six dollars')
# 100 - 999
test_one(True, 'one hundred dollars')
test_one(True, 'two hundred six dollars')
test_one(True, 'nine hundred ninety six dollars')
# 1,000 - 100,000
test_one(True, 'twenty thousand sixty six dollars')
test_one(True, 'sixty one thousand five hundred forty six dollars')
test_one(False, 'thousand five hundred forty six dollars one cent')
# With 'cent'
test_one(True, 'one thousand five hundred forty six dollars '
'twenty one cents')
test_one(True, 'one thousand five hundred forty six dollars one cent')
test_one(False, 'one dollars one cent')
test_one(False, 'one dollar one cents')
test_one(False, 'one dollar one thousand cents')
# Plural vs singular
test_one(True, 'one dollar')
test_one(False, 'one dollars')
test_one(True, 'six cents')
test_one(False, 'six cent')
def main():
global g_rule
g_rule = build_rule()
global g_dbg
if False:
g_dbg = True
test_one(False, 'one dollar one cents')
else:
test_all()
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment