# PtrMan/Classifier.py

## ai1.py
# TODO< tests for the classifier algorithm >

from MiniClasses import *
from Classifier import *


class AiEngine:
	"""Keeps the vocabulary, the indexed sentences and the animal name table.

	Words is the list of known words; the position of a word is its index.
	Sentences holds every sentence stored by addSentence().
	AnimalData holds one dict per dataset of the animal file with the keys
	"animal", "young", "female", "male" and "group"; each value is a list of
	phrases and each phrase is a list of word indices.
	"""

	def __init__(self):
		"""Seed the vocabulary and load the animal name table from disk."""
		# patternMatch0 relies on these positions: this=0, that=1, is=2, not=3
		self.Words = ["this", "that", "is", "not"]

		self.Sentences = []

		# load and parse animaldata
		# NOTE(review): CsvReader is not defined in the visible sources,
		# presumably it comes in through one of the star imports - confirm
		AnimalCsv = CsvReader()
		AnimalCsv.readFile("C:\\users\\rob\\animalNamesRaw-txt.tsv")

		self.AnimalData = []

		def indiceText(Raw):
			"""Turn "a b, c" into [[index of a, index of b], [index of c]]."""
			Return = []

			for Phrase in Raw.split(","):
				# split() without argument drops the empty tokens which the
				# blank behind a comma used to add to the vocabulary, lower()
				# fulfils the lower case requirement of getIndexForWord
				Return.append([self.getIndexForWord(Token.lower()) for Token in Phrase.split()])

			return Return

		for Dataset in AnimalCsv.getLines():
			# fields 0 to 4 of a dataset are stored under these keys
			self.AnimalData.append({
				"animal": indiceText(Dataset[0]),
				"young": indiceText(Dataset[1]),
				"female": indiceText(Dataset[2]),
				"male": indiceText(Dataset[3]),
				"group": indiceText(Dataset[4]),
			})

		print(self.Words[:30])

	def getIndexForWord(self, Word):
		"""Return the index of Word in self.Words, appending it when unknown.

		Word must be lower case already, it is stored and compared as given.
		"""
		if Word not in self.Words:
			self.Words.append(Word)
			return len(self.Words) - 1

		return self.Words.index(Word)

	def addSentence(self, Sentence):
		"""Translate a parsed sentence into word indices and store it.

		Word objects (type 0) become WordIndex objects of their lower-cased
		text, Comma objects (type 5) are replaced by a new Comma.  Elements of
		any other type are skipped silently.
		"""
		TempSentence = []

		for Object in Sentence:
			Type = Object.getType()

			if Type == 0: # a word
				LowerWord = Object.getText().lower()
				TempSentence.append(WordIndex(self.getIndexForWord(LowerWord)))
			elif Type == 5: # a comma
				TempSentence.append(Comma())
			else:
				# error

				# TODO< throw something >
				pass

		self.Sentences.append(TempSentence)

	def patternMatch0(self, Sentence):
		"""Search for the pattern  This|That (***) is (not) ###.

		Sentence has to consist of WordIndex objects and punctuation only.
		The matcher is unfinished: every path returns None so far.
		"""
		SecoundArgumentThere = False # is the (***) part there?

		if len(Sentence) < 2:
			# TODO
			return

		if Sentence[0].getType() != 6: # first is no word
			# TODO
			return

		if Sentence[0].getIndex() not in (0, 1): # doesn't start with 'this' or 'that'
			# TODO
			return

		# TODO save if it is this or that

		# search for 'is' on index 1 or 2
		if Sentence[1].getType() != 6: # second is no word
			# TODO
			return

		if Sentence[1].getIndex() != 2:
			# not This|That is ..., so try This|That *** is ...
			if len(Sentence) < 3:
				# there is no third element which could be the 'is'
				# TODO
				return

			if Sentence[2].getType() != 6: # third is no word
				# TODO
				return

			if Sentence[2].getIndex() != 2:
				# we didn't match anything
				# TODO
				return

			SecoundArgumentThere = True

		# position of the first element behind 'is'
		ActualIndex = 3 if SecoundArgumentThere else 2

		if len(Sentence) < ActualIndex+1: # is this right?
			# error, sentence is too short

			# TODO
			pass

		# now we search for 'not'

		# TODO
#are
#can

class Wordprocessor:
	"""Static helpers which turn raw text into lists of Bucket objects."""

	def __init__(self):
		pass

	# TODO< allow return of None? >

	@staticmethod
	def splitByWord(Text, PlainWord):
		"""Split Text at the first Word whose text equals PlainWord.

		Text is a list of Bucket objects.  Returns a Tee with the elements
		before the hit in its Left Container, the elements behind it in its
		Right Container and the hit itself as Node.  Returns None when no
		element matches.
		"""
		for Position, Element in enumerate(Text):
			# the type is tested first because only Word objects have getText()
			if Element.getType() == 0 and Element.getText() == PlainWord:
				Left = Container()
				Left.Contains = list(Text[:Position])

				Right = Container()
				Right.Contains = list(Text[Position + 1:])

				return Tee(Left, Right, Element)

		return None

	# parse a Text
	@staticmethod
	def parseText(Text):
		"""Split a string into Word, Point and Comma objects.

		Blanks, '.' and ',' end the current word; '.' and ',' additionally
		yield a Point or a Comma.  Separators which follow each other do not
		produce empty Word objects.
		"""
		Separators = (" ", ".", ",")

		TempWord = ""

		Return = []

		for Sign in Text:
			if Sign not in Separators:
				TempWord += Sign
				continue

			# flush the pending word, but never emit an empty one
			if TempWord != "":
				Return.append(Word(TempWord))
				TempWord = ""

			if Sign == ".":
				Return.append(Point())
			elif Sign == ",":
				Return.append(Comma())

		if TempWord != "":
			Return.append(Word(TempWord))

		return Return

	# split a List with Buckets into sentences
	@staticmethod
	def splitIntoSentences(List):
		"""Cut a list of Bucket objects into sentences at every Point (type 4).

		The Point objects themselves are dropped.  Elements behind the last
		Point form a final sentence when there are any.
		"""
		Return = []

		ActualSentence = []

		for Element in List:
			if Element.getType() == 4: # if it is a point
				Return.append(ActualSentence)
				ActualSentence = []
			else:
				ActualSentence.append(Element)

		if len(ActualSentence) != 0:
			Return.append(ActualSentence)

		return Return


class AiContext:
	"""Receiver for the pieces of a parsed document, echoes them to stdout."""

	def __init__(self):
		pass

	def _report(self, Label, Text):
		# Label and Text are concatenated as they are and printed on one line
		print(Label + Text)

	def addNormalText(self, Text):
		"""Print Text behind the label for normal text."""
		self._report("Normal Text:", Text)

	def addTopicQuote(self, Text):
		"""Print Text behind the label for topic quotes."""
		self._report("Topic quote:", Text)

	def addLink(self, Text):
		"""Print Text behind the label for links."""
		self._report("Link:", Text)

	def addHeading(self, Text):
		"""Print Text behind the label for headings."""
		self._report("Heading:", Text)


class TestClassifier(Classifier):
	"""Classifier for experiments: only the word with index 0 starts a match."""

	def matchFirstWord(self, WordIndex):
		"""Return (True, [[5]]) for word index 0 and (False, [[]]) otherwise.

		The received index and the positive decision are printed.
		"""
		print("word index"+ str(WordIndex))

		if WordIndex != 0:
			return (False, [[]])

		print("return ...")

		return (True, [[5]])

# try the classifier: TestClassifier wants word 0 to be followed by word 5
TC = TestClassifier()
Result = TC.classifySentence([WordIndex(1), WordIndex(0), WordIndex(6)])

print("result" + str(Result))


# NOTE(review): this endless loop halts the script here, everything below is
# unreachable as long as it stays - confirm whether it is still wanted
while True:
	pass


# TODO 4 tuple db


Wp = Wordprocessor()

# wolves can reduce the flow of blood near their skin to conserve body heat.

A = [Word("wolves"), Word("can"), Word("hi")]

#Wp.Text = A

#B = Wordprocessor.splitByWord(Wp.Text, "can")
#print(B.debug())

# split an example sentence at the word "can" and show the resulting Tee
TextContainer = Wordprocessor.parseText("wolves can reduce the flow of blood near their skin to conserve body heat")
B = Wordprocessor.splitByWord(TextContainer, "can")

print(B.debug())

# the with block closes the file again once its content has been read
with open("C:\\users\\rob\\terrorism.txt", "r", encoding='latin-1') as f:
	Content = f.read()

AiContextObject = AiContext()
##WikiParserO = WikiParser(AiContextObject)
#WikiParserO.parse(Content)

### ---

Text = "A Hourse is an animal which has four legs. A rabbit is a animal, too."

# parse the text, cut it into sentences and store the first one in the engine
Parsed = Wordprocessor.parseText(Text)
Sentences = Wordprocessor.splitIntoSentences(Parsed)

OAiEngine = AiEngine()
OAiEngine.addSentence(Sentences[0])

#Csv = CsvReader()
#Csv.readFile("C:\\users\\rob\\animalNamesRaw-txt.tsv")

## Classifier.py
from MiniClasses import *


class Classifier:
	"""Searches sentences for word sequences announced by matchFirstWord().

	Subclasses provide the sequences by overriding matchFirstWord().
	"""

	def __init__(self):
		pass

	def classifySentence(self, Sentence):
		"""Return True when a known word sequence occurs in Sentence.

		Sentence is a list of Bucket objects.  Every WordIndex element
		(type 6) is handed to matchFirstWord().  On a hit the elements behind
		it have to spell out at least one of the returned continuation
		patterns completely.  Returns False when no position succeeds.
		"""
		for Start, Element in enumerate(Sentence):
			if Element.getType() != 6:
				continue

			MatchResult = self.matchFirstWord(Element.getIndex())

			if MatchResult[0] and self._continuationMatches(Sentence, Start + 1, MatchResult[1]):
				return True

		return False

	def _continuationMatches(self, Sentence, Position, Patterns):
		"""Tell whether one of Patterns is spelled out from Position onwards."""
		# nothing to compare: no continuation was requested at all or one of
		# the continuations is empty and therefore complete already
		if len(Patterns) == 0 or any(len(Pattern) == 0 for Pattern in Patterns):
			return True

		RemainingPatterns = Patterns

		while Position < len(Sentence):
			if Sentence[Position].getType() != 6:
				# anything but a word interrupts the sequence
				return False

			ActualWordIndex = Sentence[Position].getIndex()

			# keep the patterns which go on with this word, without that word;
			# every pattern in here is non empty, so Pattern[0] is safe
			RemainingPatterns = [Pattern[1:] for Pattern in RemainingPatterns if Pattern[0] == ActualWordIndex]

			if len(RemainingPatterns) == 0:
				# no remaining pattern has matched, so the search was unsuccessful
				return False

			if any(len(Pattern) == 0 for Pattern in RemainingPatterns):
				# at least one pattern has matched completely
				return True

			Position += 1

		# the sentence is not long enough to complete a pattern
		return False

	# needs to be overwritten by subclass

	# returns (Boolean: has it matched?, [[Indices of words of the closest matches]])
	def matchFirstWord(self, WordIndex):
		"""Decide whether the word with index WordIndex starts a sequence.

		Has to return a tuple (matched, patterns) where patterns is a list of
		lists with the word indices which may follow.  The base class has no
		patterns and raises NotImplementedError.
		"""
		raise NotImplementedError("matchFirstWord has to be overwritten by the subclass")


## MiniClasses.py
class Bucket:
	"""Base class of the text elements, it only carries a type tag."""

	def __init__(self, Type):
		# tags passed by the classes of this file: 0 Word, 1 Container,
		# 2 Tee, 3 GeneralTerm, 4 Point, 5 Comma, 6 WordIndex
		self.Type = Type

	def getType(self):
		"""Return the type tag which was handed to the constructor."""
		return self.Type

class Word(Bucket):
	"""A word of a text together with its spelling (type tag 0)."""

	def __init__(self, Text):
		super().__init__(0)
		self.Text = Text

	def getText(self):
		"""Return the spelling of the word."""
		return self.Text

	def debug(self):
		"""Return the spelling of the word as its debug representation."""
		return self.Text

class Container(Bucket):
	"""An ordered collection of other elements (type tag 1)."""

	def __init__(self):
		super().__init__(1)
		# the contained elements, filled by the user of the object
		self.Contains = []

	def debug(self):
		"""Return "[a, b, ]": the debug text of every element plus ", "."""
		ContentText = "".join(Content.debug() + ", " for Content in self.Contains)

		return "[" + ContentText + "]"

class Tee(Bucket):
	"""A node with a left and a right neighbour (type tag 2)."""

	def __init__(self, Left, Right, Node):
		super().__init__(2)
		self.Left = Left
		self.Right = Right
		self.Node = Node

	def debug(self):
		"""Return "<left|node|right>" built from the debug texts of the parts."""
		Parts = [self.Left.debug(), self.Node.debug(), self.Right.debug()]

		return "<" + "|".join(Parts) + ">"

class GeneralTerm(Bucket):
	"""An element which wraps other content (type tag 3).

	Derives from Bucket like every other element, so getType() is available.
	"""

	def __init__(self, Contains):
		Bucket.__init__(self, 3)
		self.Contains = Contains

	def getContains(self):
		"""Return the content handed to the constructor."""
		return self.Contains

class Point(Bucket):
	"""The full stop which ends a sentence (type tag 4)."""

	def __init__(self):
		super().__init__(4)

	def debug(self):
		"""Return the fixed debug text of a full stop."""
		return "POINT"

class Comma(Bucket):
	"""A comma inside a sentence (type tag 5)."""

	def __init__(self):
		super().__init__(5)

	def debug(self):
		"""Return the fixed debug text of a comma."""
		return "COMMA"

class WordIndex(Bucket):
	"""A word which is represented by its index only (type tag 6)."""

	def __init__(self, Index):
		super().__init__(6)
		self.Index = Index

	def debug(self):
		"""Return "Wordindex " followed by the index."""
		IndexText = str(self.Index)

		return "Wordindex " + IndexText

	def getIndex(self):
		"""Return the index handed to the constructor."""
		return self.Index
	# TODO< tests for the classifier algorithm >

	from MiniClasses import *
	from Classifier import *




	class AiEngine:
		"""Keeps the vocabulary, the indexed sentences and the animal name table.

		Words is the list of known words; the position of a word is its index.
		Sentences holds every sentence stored by addSentence().
		AnimalData holds one dict per dataset of the animal file with the keys
		"animal", "young", "female", "male" and "group"; each value is a list of
		phrases and each phrase is a list of word indices.
		"""

		def __init__(self):
			"""Seed the vocabulary and load the animal name table from disk."""
			# patternMatch0 relies on these positions: this=0, that=1, is=2, not=3
			self.Words = ["this", "that", "is", "not"]

			self.Sentences = []

			# load and parse animaldata
			# NOTE(review): CsvReader is not defined in the visible sources,
			# presumably it comes in through one of the star imports - confirm
			AnimalCsv = CsvReader()
			AnimalCsv.readFile("C:\\users\\rob\\animalNamesRaw-txt.tsv")

			self.AnimalData = []

			def indiceText(Raw):
				"""Turn "a b, c" into [[index of a, index of b], [index of c]]."""
				Return = []

				for Phrase in Raw.split(","):
					# split() without argument drops the empty tokens which the
					# blank behind a comma used to add to the vocabulary, lower()
					# fulfils the lower case requirement of getIndexForWord
					Return.append([self.getIndexForWord(Token.lower()) for Token in Phrase.split()])

				return Return

			for Dataset in AnimalCsv.getLines():
				# fields 0 to 4 of a dataset are stored under these keys
				self.AnimalData.append({
					"animal": indiceText(Dataset[0]),
					"young": indiceText(Dataset[1]),
					"female": indiceText(Dataset[2]),
					"male": indiceText(Dataset[3]),
					"group": indiceText(Dataset[4]),
				})

			print(self.Words[:30])

		def getIndexForWord(self, Word):
			"""Return the index of Word in self.Words, appending it when unknown.

			Word must be lower case already, it is stored and compared as given.
			"""
			if Word not in self.Words:
				self.Words.append(Word)
				return len(self.Words) - 1

			return self.Words.index(Word)

		def addSentence(self, Sentence):
			"""Translate a parsed sentence into word indices and store it.

			Word objects (type 0) become WordIndex objects of their lower-cased
			text, Comma objects (type 5) are replaced by a new Comma.  Elements of
			any other type are skipped silently.
			"""
			TempSentence = []

			for Object in Sentence:
				Type = Object.getType()

				if Type == 0: # a word
					LowerWord = Object.getText().lower()
					TempSentence.append(WordIndex(self.getIndexForWord(LowerWord)))
				elif Type == 5: # a comma
					TempSentence.append(Comma())
				else:
					# error

					# TODO< throw something >
					pass

			self.Sentences.append(TempSentence)

		def patternMatch0(self, Sentence):
			"""Search for the pattern  This|That (***) is (not) ###.

			Sentence has to consist of WordIndex objects and punctuation only.
			The matcher is unfinished: every path returns None so far.
			"""
			SecoundArgumentThere = False # is the (***) part there?

			if len(Sentence) < 2:
				# TODO
				return

			if Sentence[0].getType() != 6: # first is no word
				# TODO
				return

			if Sentence[0].getIndex() not in (0, 1): # doesn't start with 'this' or 'that'
				# TODO
				return

			# TODO save if it is this or that

			# search for 'is' on index 1 or 2
			if Sentence[1].getType() != 6: # second is no word
				# TODO
				return

			if Sentence[1].getIndex() != 2:
				# not This|That is ..., so try This|That *** is ...
				if len(Sentence) < 3:
					# there is no third element which could be the 'is'
					# TODO
					return

				if Sentence[2].getType() != 6: # third is no word
					# TODO
					return

				if Sentence[2].getIndex() != 2:
					# we didn't match anything
					# TODO
					return

				SecoundArgumentThere = True

			# position of the first element behind 'is'
			ActualIndex = 3 if SecoundArgumentThere else 2

			if len(Sentence) < ActualIndex+1: # is this right?
				# error, sentence is too short

				# TODO
				pass

			# now we search for 'not'

			# TODO
	#are
	#can

	class Wordprocessor:
		"""Static helpers which turn raw text into lists of Bucket objects."""

		def __init__(self):
			pass

		# TODO< allow return of None? >

		@staticmethod
		def splitByWord(Text, PlainWord):
			"""Split Text at the first Word whose text equals PlainWord.

			Text is a list of Bucket objects.  Returns a Tee with the elements
			before the hit in its Left Container, the elements behind it in its
			Right Container and the hit itself as Node.  Returns None when no
			element matches.
			"""
			for Position, Element in enumerate(Text):
				# the type is tested first because only Word objects have getText()
				if Element.getType() == 0 and Element.getText() == PlainWord:
					Left = Container()
					Left.Contains = list(Text[:Position])

					Right = Container()
					Right.Contains = list(Text[Position + 1:])

					return Tee(Left, Right, Element)

			return None

		# parse a Text
		@staticmethod
		def parseText(Text):
			"""Split a string into Word, Point and Comma objects.

			Blanks, '.' and ',' end the current word; '.' and ',' additionally
			yield a Point or a Comma.  Separators which follow each other do not
			produce empty Word objects.
			"""
			Separators = (" ", ".", ",")

			TempWord = ""

			Return = []

			for Sign in Text:
				if Sign not in Separators:
					TempWord += Sign
					continue

				# flush the pending word, but never emit an empty one
				if TempWord != "":
					Return.append(Word(TempWord))
					TempWord = ""

				if Sign == ".":
					Return.append(Point())
				elif Sign == ",":
					Return.append(Comma())

			if TempWord != "":
				Return.append(Word(TempWord))

			return Return

		# split a List with Buckets into sentences
		@staticmethod
		def splitIntoSentences(List):
			"""Cut a list of Bucket objects into sentences at every Point (type 4).

			The Point objects themselves are dropped.  Elements behind the last
			Point form a final sentence when there are any.
			"""
			Return = []

			ActualSentence = []

			for Element in List:
				if Element.getType() == 4: # if it is a point
					Return.append(ActualSentence)
					ActualSentence = []
				else:
					ActualSentence.append(Element)

			if len(ActualSentence) != 0:
				Return.append(ActualSentence)

			return Return




	class AiContext:
		"""Receiver for the pieces of a parsed document, echoes them to stdout."""

		def __init__(self):
			pass

		def addNormalText(self, Text):
			"""Print Text behind the label for normal text."""
			print("Normal Text:" + Text)

		def addTopicQuote(self, Text):
			"""Print Text behind the label for topic quotes."""
			print("Topic quote:" + Text)

		def addLink(self, Text):
			"""Print Text behind the label for links."""
			print("Link:" + Text)

		def addHeading(self, Text):
			"""Print Text behind the label for headings."""
			print("Heading:" + Text)


	class TestClassifier(Classifier):
		"""Classifier for experiments: only the word with index 0 starts a match."""

		def matchFirstWord(self, WordIndex):
			"""Return (True, [[5]]) for word index 0 and (False, [[]]) otherwise.

			The received index and the positive decision are printed.
			"""
			print("word index"+ str(WordIndex))

			if WordIndex == 0:
				print("return ...")

				return (True, [[5]])
			else:
				return (False, [[]])

	# try the classifier: TestClassifier wants word 0 to be followed by word 5
	TC = TestClassifier()
	Result = TC.classifySentence([WordIndex(1), WordIndex(0), WordIndex(6)])

	print("result" + str(Result))


	# NOTE(review): this endless loop halts the script here, everything below is
	# unreachable as long as it stays - confirm whether it is still wanted
	while True:
		pass


	# TODO 4 tuple db


	Wp = Wordprocessor()

	# wolves can reduce the flow of blood near their skin to conserve body heat.

	A = [Word("wolves"), Word("can"), Word("hi")]

	#Wp.Text = A

	#B = Wordprocessor.splitByWord(Wp.Text, "can")
	#print(B.debug())

	# split an example sentence at the word "can" and show the resulting Tee
	TextContainer = Wordprocessor.parseText("wolves can reduce the flow of blood near their skin to conserve body heat")
	B = Wordprocessor.splitByWord(TextContainer, "can")

	print(B.debug())

	# the with block closes the file again once its content has been read
	with open("C:\\users\\rob\\terrorism.txt", "r", encoding='latin-1') as f:
		Content = f.read()

	AiContextObject = AiContext()
	##WikiParserO = WikiParser(AiContextObject)
	#WikiParserO.parse(Content)

	### ---

	Text = "A Hourse is an animal which has four legs. A rabbit is a animal, too."

	# parse the text, cut it into sentences and store the first one in the engine
	Parsed = Wordprocessor.parseText(Text)
	Sentences = Wordprocessor.splitIntoSentences(Parsed)

	OAiEngine = AiEngine()
	OAiEngine.addSentence(Sentences[0])

	#Csv = CsvReader()
	#Csv.readFile("C:\\users\\rob\\animalNamesRaw-txt.tsv")
	from MiniClasses import *


	class Classifier:
		"""Searches sentences for word sequences announced by matchFirstWord().

		Subclasses provide the sequences by overriding matchFirstWord().
		"""

		def __init__(self):
			pass

		def classifySentence(self, Sentence):
			"""Return True when a known word sequence occurs in Sentence.

			Sentence is a list of Bucket objects.  Every WordIndex element
			(type 6) is handed to matchFirstWord().  On a hit the elements behind
			it have to spell out at least one of the returned continuation
			patterns completely.  Returns False when no position succeeds.
			"""
			for Start, Element in enumerate(Sentence):
				if Element.getType() != 6:
					continue

				MatchResult = self.matchFirstWord(Element.getIndex())

				if MatchResult[0] and self._continuationMatches(Sentence, Start + 1, MatchResult[1]):
					return True

			return False

		def _continuationMatches(self, Sentence, Position, Patterns):
			"""Tell whether one of Patterns is spelled out from Position onwards."""
			# nothing to compare: no continuation was requested at all or one of
			# the continuations is empty and therefore complete already
			if len(Patterns) == 0 or any(len(Pattern) == 0 for Pattern in Patterns):
				return True

			RemainingPatterns = Patterns

			while Position < len(Sentence):
				if Sentence[Position].getType() != 6:
					# anything but a word interrupts the sequence
					return False

				ActualWordIndex = Sentence[Position].getIndex()

				# keep the patterns which go on with this word, without that word;
				# every pattern in here is non empty, so Pattern[0] is safe
				RemainingPatterns = [Pattern[1:] for Pattern in RemainingPatterns if Pattern[0] == ActualWordIndex]

				if len(RemainingPatterns) == 0:
					# no remaining pattern has matched, so the search was unsuccessful
					return False

				if any(len(Pattern) == 0 for Pattern in RemainingPatterns):
					# at least one pattern has matched completely
					return True

				Position += 1

			# the sentence is not long enough to complete a pattern
			return False

		# needs to be overwritten by subclass

		# returns (Boolean: has it matched?, [[Indices of words of the closest matches]])
		def matchFirstWord(self, WordIndex):
			"""Decide whether the word with index WordIndex starts a sequence.

			Has to return a tuple (matched, patterns) where patterns is a list of
			lists with the word indices which may follow.  The base class has no
			patterns and raises NotImplementedError.
			"""
			raise NotImplementedError("matchFirstWord has to be overwritten by the subclass")
	class Bucket:
		"""Base class of the text elements, it only carries a type tag."""

		def __init__(self, Type):
			# tags passed by the classes of this file: 0 Word, 1 Container,
			# 2 Tee, 3 GeneralTerm, 4 Point, 5 Comma, 6 WordIndex
			self.Type = Type

		def getType(self):
			"""Return the type tag which was handed to the constructor."""
			return self.Type

	class Word(Bucket):
		"""A word of a text together with its spelling (type tag 0)."""

		def __init__(self, Text):
			Bucket.__init__(self, 0)
			self.Text = Text

		def getText(self):
			"""Return the spelling of the word."""
			return self.Text

		def debug(self):
			"""Return the spelling of the word as its debug representation."""
			return self.Text

	class Container(Bucket):
		"""An ordered collection of other elements (type tag 1)."""

		def __init__(self):
			Bucket.__init__(self, 1)
			# the contained elements, filled by the user of the object
			self.Contains = []

		def debug(self):
			"""Return "[a, b, ]": the debug text of every element plus ", "."""
			ContentText = ""

			for Content in self.Contains:
				ContentText = ContentText + Content.debug() + ", "

			return "[" + ContentText + "]"

	class Tee(Bucket):
		"""A node with a left and a right neighbour (type tag 2)."""

		def __init__(self, Left, Right, Node):
			Bucket.__init__(self, 2)
			self.Left = Left
			self.Right = Right
			self.Node = Node

		def debug(self):
			"""Return "<left|node|right>" built from the debug texts of the parts."""
			# the separator is a plain "|", without a backslash in front of it
			return "<" + self.Left.debug() + "|" + self.Node.debug() + "|" + self.Right.debug() + ">"

	class GeneralTerm(Bucket):
		"""An element which wraps other content (type tag 3).

		Derives from Bucket like every other element, so getType() is available.
		"""

		def __init__(self, Contains):
			Bucket.__init__(self, 3)
			self.Contains = Contains

		def getContains(self):
			"""Return the content handed to the constructor."""
			return self.Contains

	class Point(Bucket):
		"""The full stop which ends a sentence (type tag 4)."""

		def __init__(self):
			Bucket.__init__(self, 4)

		def debug(self):
			"""Return the fixed debug text of a full stop."""
			return "POINT"

	class Comma(Bucket):
		"""A comma inside a sentence (type tag 5)."""

		def __init__(self):
			Bucket.__init__(self, 5)

		def debug(self):
			"""Return the fixed debug text of a comma."""
			return "COMMA"

	class WordIndex(Bucket):
		"""A word which is represented by its index only (type tag 6)."""

		def __init__(self, Index):
			Bucket.__init__(self, 6)

			self.Index = Index

		def debug(self):
			"""Return "Wordindex " followed by the index."""
			return "Wordindex " + str(self.Index)

		def getIndex(self):
			"""Return the index handed to the constructor."""
			return self.Index