Last active
August 15, 2019 05:15
-
-
Save jeremyorme/4fb29a16d1ff534f5f75f65071e74cc5 to your computer and use it in GitHub Desktop.
Simple text classifier
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# records a type classification
class classification:
    """An inclusive span of token indices labelled with a type name."""

    def __init__(self, start_token, end_token, type):
        self.start_token = start_token  # index of first token in the span
        self.end_token = end_token      # index of last token (inclusive)
        self.type = type                # type name, e.g. 'number', 'unit'

    def __repr__(self):
        return f'{{s:{self.start_token}, e:{self.end_token}, t:{self.type}}}'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from rule import * | |
class classifier:
    """Parses a knowledge string into rules and applies them to tokens."""

    # convert knowledge string to structured format
    @staticmethod
    def parse(knowledge):
        """Return a list of rule objects, one per non-empty line of *knowledge*."""
        return [rule(line) for line in knowledge.splitlines() if len(line) > 0]

    # classify tokens using supplied knowledge
    @staticmethod
    def classify(tokens, rules):
        """Run every rule over *tokens* and return the classifications.

        Rules share one classification list, so later rules can match on
        types produced by earlier ones.
        """
        classifications = []
        # 'r', not 'rule': avoid shadowing the imported rule class
        for r in rules:
            r.match(tokens, classifications)
        return classifications
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from utils import * | |
from classifier import * | |
# demo: tokenise an ingredient line, classify it with a small knowledge
# base, and print the marked-up result
source = '6 – 7 cups of Three different types of vegetables*, chopped into bite-sized pieces'
rule_text = '''
/\d+/ is number
number,/-|–/,number is range
/tbsp/ is unit
/cups?/ is unit
range|number,unit,/of/? is amount
amount,/\w+/+ is ,ingredient
'''
print('input:\n' + source)
token_list = tokenise(source)
print('\ntokens:\n' + str(token_list))
rule_set = classifier.parse(rule_text)
found = classifier.classify(token_list, rule_set)
print('\nclassifications:\n' + str(found))
marked = mark_up(token_list, found)
print('\noutput:\n' + marked)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# matches parts of the input string
class matcher:
    """Base class for token matchers."""

    def __init__(self, is_type):
        # type name a successful match produces ('' means no type)
        self.is_type = is_type
# matches single token with a regex pattern
class pattern_matcher(matcher):
    """Matches one token against a /regex/ expression.

    An optional trailing quantifier (+, * or ?) follows the closing
    slash; a bare /regex/ records '/' as its quantifier, which the rule
    engine treats as 'exactly one token'.
    """

    # lexical format of a pattern expression: /.../ plus optional +, * or ?
    fmt = '\\/(?:\\\\\\\\|\\\\\\/|[^\\/])+\\/[\+\*\?]?'

    def __init__(self, pattern, is_type):
        super(pattern_matcher, self).__init__(is_type)
        # strip the surrounding slashes (and quantifier, if present)
        self.pattern = pattern[1:-1] if pattern[-1] == '/' else pattern[1:-2]
        # '/' doubles as the 'single token' quantifier
        self.quantifier = pattern[-1]
        # compile once here instead of re-compiling on every match() call
        self._regex = re.compile(self.pattern)

    def match(self, token, idx, types):
        """Return True when *token* matches the pattern (idx/types unused)."""
        return self._regex.match(token) is not None

    def __repr__(self):
        return '/' + self.pattern + '/'
# matches a single token with a type
class type_matcher(matcher):
    """Matches a token already classified with one of a set of type
    names (written 'a|b|c' in a rule)."""

    # lexical format of a type expression: word chars, '|' and '-'
    fmt = '[\w\|-]+'

    def __init__(self, types, is_type):
        super(type_matcher, self).__init__(is_type)
        self.types = types.split('|')
        # type matchers always greedily absorb a run of tokens
        self.quantifier = '+'

    def match(self, token, idx, types):
        """Return True when token index *idx* lies inside any existing
        classification whose type is one of ours (token text unused)."""
        return any(c.start_token <= idx <= c.end_token and c.type in self.types
                   for c in types)

    def __repr__(self):
        return '|'.join(self.types)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from matchers import * | |
from classification import * | |
# defines a rule for matching text in the input string
class rule:
    """One line of the knowledge language.

    Left of the 'is' keyword is a comma-separated list of match
    expressions (regex patterns or previously-assigned type names);
    right of it is either a single type name for the whole matched span
    or a comma-separated list of per-expression type names (an empty
    entry leaves that expression untyped).
    """

    def __init__(self, rule_str):
        # store the rule string for debugging
        self.rule_str = rule_str
        # regex of allowed matcher formats
        fmts = '(?:' + pattern_matcher.fmt + '|' + type_matcher.fmt + ')'
        # split around the 'is' keyword
        m = re.match('(' + fmts + '(?:,' + fmts + ')*)\s+is\s+(.*)', rule_str)
        # check rule was valid
        if m is None:
            raise Exception('unknown rule: ' + rule_str)
        # extract the list of match expressions
        match_exprs = re.findall(fmts, m.group(1))
        # extract the list of type classifications
        is_types = m.group(2).split(',')
        # a single name labels the whole matched span; a list labels
        # each expression separately
        self.is_type = is_types[0] if len(is_types) == 1 else None
        # check match expressions and types line up
        if len(is_types) > 1 and len(match_exprs) != len(is_types):
            raise Exception('match, type mismatch')
        # build one matcher per match expression
        self.matchers = []
        for i in range(len(match_exprs)):
            # determine matcher is-type
            is_type = self.is_type or is_types[i]
            if match_exprs[i].startswith('/'):
                # if expression is regex, build pattern matcher
                self.matchers.append(pattern_matcher(match_exprs[i], is_type))
            else:
                # otherwise, build type matcher
                self.matchers.append(type_matcher(match_exprs[i], is_type))

    def match(self, tokens, types):
        """Scan *tokens* for spans matching this rule, appending the
        resulting classifications to *types* (mutated in place)."""
        # try each start position in turn
        i = 0
        while i < len(tokens):
            new_types = []
            # n: tokens consumed by completed matchers at this start
            n = 0
            # k: tokens consumed so far by the current matcher
            k = 0
            # step through the matchers
            j = 0
            while j < len(self.matchers):
                # 'mt', not 'matcher': avoid shadowing the imported class
                mt = self.matchers[j]
                # index of the token to test next
                t = i + n + k
                # BUGFIX: guard t against the end of the token list — a
                # matcher that still wants a token when none remain is a
                # failed match, not an IndexError (previously tokens[t]
                # could be read with t == len(tokens))
                if t < len(tokens) and mt.match(tokens[t], t, types):
                    # if is-type is given
                    if len(mt.is_type) > 0:
                        # if is-type per matcher
                        if self.is_type is None:
                            # open a classification on the matcher's
                            # first token, then extend it
                            if k == 0:
                                new_types.append(classification(t, t, mt.is_type))
                            else:
                                new_types[-1].end_token = t
                        else:
                            # whole-rule typing: open on the rule's first
                            # matched token, then extend it
                            if j == 0 and k == 0:
                                new_types.append(classification(t, t, self.is_type))
                            else:
                                new_types[-1].end_token = t
                    # '/' (bare pattern) and '?' match at most one token
                    if mt.quantifier in ['/', '?']:
                        # next matcher
                        j += 1
                        # increment match count for rule
                        n += 1
                    else:
                        # greedy matcher: try the same matcher again
                        k += 1
                        # if end of input then stop matching
                        if i + n + k == len(tokens):
                            # add matcher match count to rule match count
                            n += k
                            break
                else:
                    # a mandatory matcher ('+' or '/') with no matches
                    # yet kills the whole attempt
                    if k == 0 and mt.quantifier in ['+', '/']:
                        # reset matched types and break
                        new_types = []
                        break
                    else:
                        # optional or already satisfied: next matcher
                        j += 1
                        # add matcher match count to rule match count
                        n += k
                        # reset matcher match count
                        k = 0
            # if rule matched
            if len(new_types) > 0:
                # publish the classifications from this attempt
                for new_type in new_types:
                    types.append(new_type)
                # resume scanning after the matched span
                i += n
            else:
                # next start point
                i += 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# convert input string to token list
def tokenise(in_str):
    """Split *in_str* into word, number and punctuation tokens.

    Words may contain internal hyphens ('bite-sized' is one token);
    whitespace is discarded.
    """
    splitter = '([a-zA-Z][a-zA-Z\\-]*|\\d+|[^\\w ])'
    return [piece for piece in re.split(splitter, in_str) if piece.strip()]
# mark-up tokens with classifications
def mark_up(tokens, classifications):
    """Render *tokens* as a string with <type>...</type> tags around
    each classified span.

    Tags opening on the same token are nested outermost-first by span
    end, so longer classifications enclose shorter ones.  Unclassified
    neighbouring tokens are separated by a space.
    """
    out = ''  # renamed from 'str': don't shadow the builtin
    open_tags = []
    before_close_len = 0
    for i in range(len(tokens)):
        before_open_len = len(open_tags)
        # open a tag for each classification starting on this token,
        # longest span (latest end_token) first so nesting is well-formed
        for c in sorted(classifications, key=lambda c: c.end_token, reverse=True):
            if c.start_token == i:
                out += '<' + c.type + '>'
                open_tags.append(c.type)
        # if no new opening tags, separate with space
        if len(open_tags) == before_close_len and len(open_tags) == before_open_len:
            out += ' '
        # write the token content
        out += tokens[i]
        # close each open tag (most recent first), stopping at the first
        # one whose classification doesn't end on this token
        before_close_len = len(open_tags)
        while open_tags:
            found = False
            for c in classifications:
                if c.end_token == i and c.type == open_tags[-1]:
                    out += '</' + open_tags.pop() + '>'
                    found = True
                    if not open_tags:
                        break
            if not found:
                break
    return out
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment