Simple text classifier
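A tiny rule-based text classifier: the input string is tokenised, a list of hand-written rules (a small matching DSL) assigns type classifications to token spans, and the result is printed with XML-style mark-up. The gist comprises five modules plus a demo script; the module names used as headers below are inferred from the import statements, and the demo script's own file name is not shown.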
# classification.py
# records a type classification
class classification:
    def __init__(self, start_token, end_token, type):
        self.start_token = start_token
        self.end_token = end_token
        self.type = type

    def __repr__(self):
        return '{' + 's:' + str(self.start_token) + ', e:' + str(self.end_token) + ', t:' + self.type + '}'
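As a quick illustration (not part of the gist), a classification is just an inclusive token span plus a type label, and __repr__ keeps printed lists readable:

from classification import classification

c = classification(0, 2, 'range')  # tokens 0..2 inclusive carry the type 'range'
print(c)    # -> {s:0, e:2, t:range}
print([c])  # -> [{s:0, e:2, t:range}]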
# classifier.py
from rule import *

class classifier:
    # convert knowledge string to structured format
    @staticmethod
    def parse(knowledge):
        rules = []
        for line in knowledge.splitlines():
            if len(line) > 0:
                rules.append(rule(line))
        return rules

    # classify tokens using supplied knowledge
    @staticmethod
    def classify(tokens, rules):
        classifications = []
        for r in rules:
            r.match(tokens, classifications)
        return classifications
# demo script (file name not shown on the gist page)
from utils import *
from classifier import *

in_str = '6 – 7 cups of Three different types of vegetables*, chopped into bite-sized pieces'

# raw string so the regex escapes (\d, \w) survive intact
knowledge = r'''
/\d+/ is number
number,/-|–/,number is range
/tbsp/ is unit
/cups?/ is unit
range|number,unit,/of/? is amount
amount,/\w+/+ is ,ingredient
'''
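# A reading of the rule DSL above (inferred from rule.py):
#  - a rule is a comma-separated list of matchers, the keyword 'is',
#    then either one type for the whole rule or one type per matcher
#  - /regex/ matches a single token; a trailing +, * or ? after the
#    closing slash quantifies how many tokens it may consume (one or
#    more, zero or more, zero or one); no quantifier means exactly one
#  - a bare name matches a token already classified with that type,
#    with alternatives separated by '|'; it always matches one or more
#  - an empty type, as in 'is ,ingredient' above, means the tokens
#    matched by that matcher get no classification of their own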
print('input:\n' + in_str)
tokens = tokenise(in_str)
print('\ntokens:\n' + str(tokens))
rules = classifier.parse(knowledge)
classifications = classifier.classify(tokens, rules)
print('\nclassifications:\n' + str(classifications))
out_str = mark_up(tokens, classifications)
print('\noutput:\n' + out_str)
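Hand-tracing the rules over this input (untested; exact spacing in the marked-up string may differ slightly) suggests classifications like:

[{s:0, e:0, t:number}, {s:2, e:2, t:number}, {s:0, e:2, t:range}, {s:3, e:3, t:unit}, {s:0, e:4, t:amount}, {s:5, e:9, t:ingredient}]

and output roughly along these lines:

<amount><range><number>6</number>–<number>7</number></range><unit>cups</unit>of</amount><ingredient>Three different types of vegetables</ingredient>* , chopped into bite-sized pieces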
# matchers.py
import re

# base class for things that match parts of the input string
class matcher:
    def __init__(self, is_type):
        self.is_type = is_type

# matches a single token with a regex pattern, e.g. /cups?/
class pattern_matcher(matcher):
    # raw string; same regex as the original escaped form
    fmt = r'/(?:\\\\|\\/|[^/])+/[+*?]?'

    def __init__(self, pattern, is_type):
        super(pattern_matcher, self).__init__(is_type)
        # strip the surrounding slashes and any trailing quantifier
        self.pattern = pattern[1:-1] if pattern[-1] == '/' else pattern[1:-2]
        # '/' doubles as the 'exactly one token' default quantifier
        self.quantifier = pattern[-1]

    def match(self, token, idx, types):
        return re.match(self.pattern, token) is not None

    def __repr__(self):
        return '/' + self.pattern + '/'

# matches a single token against previously assigned types, e.g. range|number
class type_matcher(matcher):
    fmt = r'[\w|-]+'

    def __init__(self, types, is_type):
        super(type_matcher, self).__init__(is_type)
        self.types = types.split('|')
        self.quantifier = '+'

    def match(self, token, idx, types):
        for t in types:
            if t.start_token <= idx and t.end_token >= idx and t.type in self.types:
                return True
        return False

    def __repr__(self):
        return '|'.join(self.types)
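A quick sketch of how the two matchers behave in isolation (assumed usage, not part of the gist):

from matchers import pattern_matcher, type_matcher
from classification import classification

# '/cups?/' -> pattern 'cups?', no explicit quantifier (stored as '/')
pm = pattern_matcher('/cups?/', 'unit')
print(pm.pattern, pm.quantifier)   # -> cups? /
print(pm.match('cups', 0, []))     # -> True

# '/of/?' -> pattern 'of', optional quantifier '?'
pm2 = pattern_matcher('/of/?', 'amount')
print(pm2.pattern, pm2.quantifier) # -> of ?

# a type matcher tests a token index against existing classifications
tm = type_matcher('range|number', 'amount')
print(tm.match('7', 2, [classification(0, 2, 'range')]))  # -> True: index 2 lies in span 0..2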
# rule.py
import re
from matchers import *
from classification import *

# defines a rule for matching text in the input string
class rule:
    def __init__(self, rule_str):
        # store the rule string for debugging
        self.rule_str = rule_str
        # regex of allowed matcher formats
        fmts = '(?:' + pattern_matcher.fmt + '|' + type_matcher.fmt + ')'
        # split around the 'is' keyword
        m = re.match('(' + fmts + '(?:,' + fmts + r')*)\s+is\s+(.*)', rule_str)
        # check rule was valid
        if m is None:
            raise Exception('unknown rule: ' + rule_str)
        # extract the list of match expressions
        match_exprs = re.findall(fmts, m.group(1))
        # extract the list of type classifications
        is_types = m.group(2).split(',')
        # set the rule is-type, if applicable
        self.is_type = is_types[0] if len(is_types) == 1 else None
        # check match expressions and types line up
        if len(is_types) > 1 and len(match_exprs) != len(is_types):
            raise Exception('match, type mismatch')
        # for each match expression
        self.matchers = []
        for i in range(len(match_exprs)):
            # determine matcher is-type
            is_type = self.is_type or is_types[i]
            if match_exprs[i].startswith('/'):
                # if expression is regex, build pattern matcher
                self.matchers.append(pattern_matcher(match_exprs[i], is_type))
            else:
                # otherwise, build type matcher
                self.matchers.append(type_matcher(match_exprs[i], is_type))

    def match(self, tokens, types):
        # for each token
        i = 0
        while i < len(tokens):
            new_types = []
            # reset number of matched tokens for rule
            n = 0
            # reset match count for matcher
            k = 0
            # for each matcher
            j = 0
            while j < len(self.matchers):
                matcher = self.matchers[j]
                # calculate token index, t
                t = i + n + k
                # stop if we have run off the end of the token list
                if t == len(tokens):
                    # an unmatched required matcher means the rule fails
                    if k == 0 and matcher.quantifier in ['+', '/']:
                        new_types = []
                    n += k
                    break
                # match token using current matcher
                if matcher.match(tokens[t], t, types):
                    # if is-type is given
                    if len(matcher.is_type) > 0:
                        # if is-type per matcher
                        if self.is_type is None:
                            # if first match for matcher
                            if k == 0:
                                # create classification
                                new_types.append(classification(t, t, matcher.is_type))
                            else:
                                # update classification
                                new_types[-1].end_token = t
                        else:
                            # if first match for rule
                            if j == 0 and k == 0:
                                # create classification
                                new_types.append(classification(t, t, self.is_type))
                            else:
                                # update classification
                                new_types[-1].end_token = t
                    # if can only match one token
                    if matcher.quantifier in ['/', '?']:
                        # next matcher
                        j += 1
                        # increment match count for rule
                        n += 1
                    else:
                        # next match with current matcher
                        k += 1
                        # if end then break
                        if i + n + k == len(tokens):
                            # add matcher match count to rule match count
                            n += k
                            break
                else:
                    # if first match attempt and need match
                    if k == 0 and matcher.quantifier in ['+', '/']:
                        # reset matched types and break
                        new_types = []
                        break
                    else:
                        # next matcher
                        j += 1
                        # add matcher match count to rule match count
                        n += k
                        # reset matcher match count
                        k = 0
            # if rule matched
            if len(new_types) > 0:
                # add types from this rule to the type store
                for new_type in new_types:
                    types.append(new_type)
                # add rule match count to start index
                i += n
            else:
                # next start point
                i += 1
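A minimal end-to-end check for a single rule (assumed usage; this is how classifier.classify drives it):

from rule import rule
from classification import classification

r = rule('number,/-|–/,number is range')
types = [classification(0, 0, 'number'), classification(2, 2, 'number')]
r.match(['6', '–', '7'], types)
print(types[-1])   # -> {s:0, e:2, t:range}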
# utils.py
import re

# convert input string to token list
def tokenise(in_str):
    return [t for t in re.split(r'([a-zA-Z][a-zA-Z\-]*|\d+|[^\w ])', in_str) if t.strip() != '']

# mark up tokens with classifications
def mark_up(tokens, classifications):
    out = ''
    types = []
    before_close_len = 0
    for i in range(len(tokens)):
        before_open_len = len(types)
        # write an opening tag for each classification starting on this
        # token index, longest span first so the tags nest correctly
        def by_end(c):
            return c.end_token
        sorted_cls = sorted(classifications, key=by_end, reverse=True)
        for c in sorted_cls:
            if c.start_token == i:
                out += '<' + c.type + '>'
                types.append(c.type)
        # if no tags were opened or closed here, separate with a space
        if len(types) == before_close_len and len(types) == before_open_len:
            out += ' '
        # write the token content
        out += tokens[i]
        # try to close each open tag (most recent first), stopping at the
        # first one that isn't ended at this token
        before_close_len = len(types)
        while len(types) > 0:
            found = False
            for c in classifications:
                if c.end_token == i and c.type == types[-1]:
                    out += '</' + types.pop() + '>'
                    found = True
                if len(types) == 0:
                    break
            if not found:
                break
    return out
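And a small sanity check of the two helpers (assumed usage, not in the gist):

from utils import tokenise, mark_up
from classification import classification

tokens = tokenise('6 cups')
print(tokens)   # -> ['6', 'cups']
print(mark_up(tokens, [classification(0, 0, 'number'),
                       classification(1, 1, 'unit')]))
# -> <number>6</number><unit>cups</unit>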