Last active
August 15, 2019 05:15
-
-
Save jeremyorme/4fb29a16d1ff534f5f75f65071e74cc5 to your computer and use it in GitHub Desktop.
Simple text classifier
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# records a type classification
class classification:
    """An inclusive span of token indices labelled with a type name."""

    def __init__(self, start_token, end_token, type):
        self.start_token = start_token  # index of first token in the span
        self.end_token = end_token      # index of last token (inclusive)
        self.type = type                # type name, e.g. 'number', 'unit'

    def __repr__(self):
        return f'{{s:{self.start_token}, e:{self.end_token}, t:{self.type}}}'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from rule import * | |
class classifier:
    """Parses a knowledge string into rules and applies them to tokens."""

    # convert knowledge string to structured format
    @staticmethod
    def parse(knowledge):
        """Return a list of rule objects, one per non-empty line of *knowledge*."""
        return [rule(line) for line in knowledge.splitlines() if len(line) > 0]

    # classify tokens using supplied knowledge
    @staticmethod
    def classify(tokens, rules):
        """Run every rule over *tokens* and return the classifications.

        Rules share one classification list, so later rules can match on
        types produced by earlier ones.
        """
        classifications = []
        # 'r', not 'rule': avoid shadowing the imported rule class
        for r in rules:
            r.match(tokens, classifications)
        return classifications
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from utils import * | |
from classifier import * | |
# demo: tokenise an ingredient line, classify it with a small knowledge
# base, and print the marked-up result
source = '6 – 7 cups of Three different types of vegetables*, chopped into bite-sized pieces'
rule_text = '''
/\d+/ is number
number,/-|–/,number is range
/tbsp/ is unit
/cups?/ is unit
range|number,unit,/of/? is amount
amount,/\w+/+ is ,ingredient
'''
print('input:\n' + source)
token_list = tokenise(source)
print('\ntokens:\n' + str(token_list))
rule_set = classifier.parse(rule_text)
found = classifier.classify(token_list, rule_set)
print('\nclassifications:\n' + str(found))
marked = mark_up(token_list, found)
print('\noutput:\n' + marked)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# matches parts of the input string
class matcher:
    """Base class for token matchers."""

    def __init__(self, is_type):
        # type name a successful match produces ('' means no type)
        self.is_type = is_type
# matches single token with a regex pattern
class pattern_matcher(matcher):
    """Matches one token against a /regex/ expression.

    An optional trailing quantifier (+, * or ?) follows the closing
    slash; a bare /regex/ records '/' as its quantifier, which the rule
    engine treats as 'exactly one token'.
    """

    # lexical format of a pattern expression: /.../ plus optional +, * or ?
    fmt = '\\/(?:\\\\\\\\|\\\\\\/|[^\\/])+\\/[\+\*\?]?'

    def __init__(self, pattern, is_type):
        super(pattern_matcher, self).__init__(is_type)
        # strip the surrounding slashes (and quantifier, if present)
        self.pattern = pattern[1:-1] if pattern[-1] == '/' else pattern[1:-2]
        # '/' doubles as the 'single token' quantifier
        self.quantifier = pattern[-1]
        # compile once here instead of re-compiling on every match() call
        self._regex = re.compile(self.pattern)

    def match(self, token, idx, types):
        """Return True when *token* matches the pattern (idx/types unused)."""
        return self._regex.match(token) is not None

    def __repr__(self):
        return '/' + self.pattern + '/'
# matches a single token with a type
class type_matcher(matcher):
    """Matches a token already classified with one of a set of type
    names (written 'a|b|c' in a rule)."""

    # lexical format of a type expression: word chars, '|' and '-'
    fmt = '[\w\|-]+'

    def __init__(self, types, is_type):
        super(type_matcher, self).__init__(is_type)
        self.types = types.split('|')
        # type matchers always greedily absorb a run of tokens
        self.quantifier = '+'

    def match(self, token, idx, types):
        """Return True when token index *idx* lies inside any existing
        classification whose type is one of ours (token text unused)."""
        return any(c.start_token <= idx <= c.end_token and c.type in self.types
                   for c in types)

    def __repr__(self):
        return '|'.join(self.types)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from matchers import * | |
from classification import * | |
# defines a rule for matching text in the input string
class rule:
    """One line of the knowledge language.

    Left of the 'is' keyword is a comma-separated list of match
    expressions (regex patterns or previously-assigned type names);
    right of it is either a single type name for the whole matched span
    or a comma-separated list of per-expression type names (an empty
    entry leaves that expression untyped).
    """

    def __init__(self, rule_str):
        # store the rule string for debugging
        self.rule_str = rule_str
        # regex of allowed matcher formats
        fmts = '(?:' + pattern_matcher.fmt + '|' + type_matcher.fmt + ')'
        # split around the 'is' keyword
        m = re.match('(' + fmts + '(?:,' + fmts + ')*)\s+is\s+(.*)', rule_str)
        # check rule was valid
        if m is None:
            raise Exception('unknown rule: ' + rule_str)
        # extract the list of match expressions
        match_exprs = re.findall(fmts, m.group(1))
        # extract the list of type classifications
        is_types = m.group(2).split(',')
        # a single name labels the whole matched span; a list labels
        # each expression separately
        self.is_type = is_types[0] if len(is_types) == 1 else None
        # check match expressions and types line up
        if len(is_types) > 1 and len(match_exprs) != len(is_types):
            raise Exception('match, type mismatch')
        # build one matcher per match expression
        self.matchers = []
        for i in range(len(match_exprs)):
            # determine matcher is-type
            is_type = self.is_type or is_types[i]
            if match_exprs[i].startswith('/'):
                # if expression is regex, build pattern matcher
                self.matchers.append(pattern_matcher(match_exprs[i], is_type))
            else:
                # otherwise, build type matcher
                self.matchers.append(type_matcher(match_exprs[i], is_type))

    def match(self, tokens, types):
        """Scan *tokens* for spans matching this rule, appending the
        resulting classifications to *types* (mutated in place)."""
        # try each start position in turn
        i = 0
        while i < len(tokens):
            new_types = []
            # n: tokens consumed by completed matchers at this start
            n = 0
            # k: tokens consumed so far by the current matcher
            k = 0
            # step through the matchers
            j = 0
            while j < len(self.matchers):
                # 'mt', not 'matcher': avoid shadowing the imported class
                mt = self.matchers[j]
                # index of the token to test next
                t = i + n + k
                # BUGFIX: guard t against the end of the token list — a
                # matcher that still wants a token when none remain is a
                # failed match, not an IndexError (previously tokens[t]
                # could be read with t == len(tokens))
                if t < len(tokens) and mt.match(tokens[t], t, types):
                    # if is-type is given
                    if len(mt.is_type) > 0:
                        # if is-type per matcher
                        if self.is_type is None:
                            # open a classification on the matcher's
                            # first token, then extend it
                            if k == 0:
                                new_types.append(classification(t, t, mt.is_type))
                            else:
                                new_types[-1].end_token = t
                        else:
                            # whole-rule typing: open on the rule's first
                            # matched token, then extend it
                            if j == 0 and k == 0:
                                new_types.append(classification(t, t, self.is_type))
                            else:
                                new_types[-1].end_token = t
                    # '/' (bare pattern) and '?' match at most one token
                    if mt.quantifier in ['/', '?']:
                        # next matcher
                        j += 1
                        # increment match count for rule
                        n += 1
                    else:
                        # greedy matcher: try the same matcher again
                        k += 1
                        # if end of input then stop matching
                        if i + n + k == len(tokens):
                            # add matcher match count to rule match count
                            n += k
                            break
                else:
                    # a mandatory matcher ('+' or '/') with no matches
                    # yet kills the whole attempt
                    if k == 0 and mt.quantifier in ['+', '/']:
                        # reset matched types and break
                        new_types = []
                        break
                    else:
                        # optional or already satisfied: next matcher
                        j += 1
                        # add matcher match count to rule match count
                        n += k
                        # reset matcher match count
                        k = 0
            # if rule matched
            if len(new_types) > 0:
                # publish the classifications from this attempt
                for new_type in new_types:
                    types.append(new_type)
                # resume scanning after the matched span
                i += n
            else:
                # next start point
                i += 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# convert input string to token list
def tokenise(in_str):
    """Split *in_str* into word, number and punctuation tokens.

    Words may contain internal hyphens ('bite-sized' is one token);
    whitespace is discarded.
    """
    splitter = '([a-zA-Z][a-zA-Z\\-]*|\\d+|[^\\w ])'
    return [piece for piece in re.split(splitter, in_str) if piece.strip()]
# mark-up tokens with classifications
def mark_up(tokens, classifications):
    """Render *tokens* as a string with <type>...</type> tags around
    each classified span.

    Tags opening on the same token are nested outermost-first by span
    end, so longer classifications enclose shorter ones.  Unclassified
    neighbouring tokens are separated by a space.
    """
    out = ''  # renamed from 'str': don't shadow the builtin
    open_tags = []
    before_close_len = 0
    for i in range(len(tokens)):
        before_open_len = len(open_tags)
        # open a tag for each classification starting on this token,
        # longest span (latest end_token) first so nesting is well-formed
        for c in sorted(classifications, key=lambda c: c.end_token, reverse=True):
            if c.start_token == i:
                out += '<' + c.type + '>'
                open_tags.append(c.type)
        # if no new opening tags, separate with space
        if len(open_tags) == before_close_len and len(open_tags) == before_open_len:
            out += ' '
        # write the token content
        out += tokens[i]
        # close each open tag (most recent first), stopping at the first
        # one whose classification doesn't end on this token
        before_close_len = len(open_tags)
        while open_tags:
            found = False
            for c in classifications:
                if c.end_token == i and c.type == open_tags[-1]:
                    out += '</' + open_tags.pop() + '>'
                    found = True
                    if not open_tags:
                        break
            if not found:
                break
    return out
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment