Skip to content

Instantly share code, notes, and snippets.

@PtrMan
Last active April 15, 2021 12:14
Show Gist options
  • Save PtrMan/9312c68f6b32ca1376da0b5519aa1c93 to your computer and use it in GitHub Desktop.
Save PtrMan/9312c68f6b32ca1376da0b5519aa1c93 to your computer and use it in GitHub Desktop.
NLP AI experiment which I did in 2012
# TODO< tests for the classifier algorithm >
from MiniClasses import *
from Classifier import *
class AiEngine:
    """Word-index vocabulary plus an unfinished "This|That ... is ..." matcher.

    Attributes:
        Words: Known words; a word's list position is its index. Positions
            0-3 are fixed ("this", "that", "is", "not") and patternMatch0()
            refers to them by number.
        Sentences: Sentences stored by addSentence(), each a list of
            WordIndex and Comma buckets.
        AnimalData: One dict per line of the animal TSV file with the keys
            "animal", "young", "female", "male" and "group"; every value is
            a list of phrases, each phrase being a list of word indices.
    """

    # Bucket type tags, as handed to Bucket.__init__ by the bucket classes.
    _TYPE_WORD = 0
    _TYPE_COMMA = 5
    _TYPE_WORD_INDEX = 6

    # Fixed positions of the seed words inside self.Words.
    _INDEX_THIS = 0
    _INDEX_THAT = 1
    _INDEX_IS = 2

    def __init__(self):
        """Seed the vocabulary and load the animal name table.

        NOTE(review): the TSV path is hard-coded, and CsvReader is not
        defined in this file - presumably getLines() yields rows with at
        least five columns; confirm against its implementation.
        """
        # Order matters: patternMatch0() relies on these four indices.
        self.Words = ["this", "that", "is", "not"]
        self.Sentences = []

        # load and parse animaldata
        AnimalCsv = CsvReader()
        AnimalCsv.readFile("C:\\users\\rob\\animalNamesRaw-txt.tsv")
        self.AnimalData = []

        def indiceText(Raw):
            # "a b,c" -> [[index(a), index(b)], [index(c)]]
            # Phrases are split on single spaces only, so a blank after a
            # comma produces an empty-string word, exactly as before.
            return [
                [self.getIndexForWord(Word) for Word in Phrase.split(" ")]
                for Phrase in Raw.split(",")
            ]

        for Dataset in AnimalCsv.getLines():
            self.AnimalData.append({
                "animal": indiceText(Dataset[0]),
                "young": indiceText(Dataset[1]),
                "female": indiceText(Dataset[2]),
                "male": indiceText(Dataset[3]),
                "group": indiceText(Dataset[4]),
            })

    # word must be lower case
    def getIndexForWord(self, Word):
        """Return the index of Word, registering it first if it is unknown.

        Args:
            Word: The word to look up; callers are expected to pass it
                already lower-cased.

        Returns:
            The position of Word in self.Words.
        """
        try:
            return self.Words.index(Word)
        except ValueError:
            # Unknown word: it becomes the new last entry.
            self.Words.append(Word)
            return len(self.Words) - 1

    def addSentence(self, Sentence):
        """Convert a parsed sentence to index form and store it.

        Word buckets are lower-cased and replaced by WordIndex buckets,
        commas are kept as fresh Comma buckets, and every other bucket
        type is dropped.

        Args:
            Sentence: Iterable of buckets offering getType() (and getText()
                for words).
        """
        TempSentence = []
        for Object in Sentence:
            ObjectType = Object.getType()
            if ObjectType == self._TYPE_WORD:
                LowerWord = Object.getText().lower()
                TempSentence.append(WordIndex(self.getIndexForWord(LowerWord)))
            elif ObjectType == self._TYPE_COMMA:
                TempSentence.append(Comma())
            else:
                # Unsupported bucket: skipped on purpose for now.
                # TODO< throw something >
                pass
        self.Sentences.append(TempSentence)

    # searches for the pattern
    #  This|That (***) is (not) ###.
    # the sentence has to consist only of WordIndex objects and some punctuation
    def patternMatch0(self, Sentence):
        """Check the prefix of Sentence against "This|That (***) is ...".

        The matcher is unfinished: it validates the prefix and then stops,
        so every path currently returns None.

        Args:
            Sentence: Sequence of buckets offering getType(), and
                getIndex() for WordIndex buckets.

        Returns:
            None.
        """
        if len(Sentence) < 2:
            # TODO
            return
        First = Sentence[0]
        Second = Sentence[1]

        if First.getType() != self._TYPE_WORD_INDEX:  # first is no word
            # TODO
            return
        if First.getIndex() not in (self._INDEX_THIS, self._INDEX_THAT):
            # it doesn't start with 'this' or 'that'
            # TODO
            return
        # TODO save if it is this or that

        if Second.getType() != self._TYPE_WORD_INDEX:  # second is no word
            # TODO
            return

        # 'is' has to sit at position 1 (This|That is ...) or at
        # position 2 (This|That *** is ...).
        if Second.getIndex() == self._INDEX_IS:
            SecoundArgumentThere = False
        else:
            if len(Sentence) < 3:
                # too short to contain 'is' at position 2
                # TODO
                return
            Third = Sentence[2]
            if Third.getType() != self._TYPE_WORD_INDEX:  # third is no word
                # TODO
                return
            if Third.getIndex() != self._INDEX_IS:
                # we didn't match anything
                # TODO
                return
            SecoundArgumentThere = True

        # Position of the first bucket after 'is'.
        ActualIndex = 3 if SecoundArgumentThere else 2
        if len(Sentence) < ActualIndex + 1:
            # error, sentence is too short
            # TODO
            return
        # now we search for 'not'
        # TODO
        return
#are
#can
class Wordprocessor:
def __init__(self):
pass
# TODO< allow return of None? >
@staticmethod
def splitByWord(Text, PlainWord):
ContentBefore = []
i = 0
while i < len(Text):
print(Text[i].getText())
if Text[i].getType() == 0 and Text[i].getText() == PlainWord: # if it is a word and this word
print("here")
ContentBehind = []
j = i+1
while j < len(Text):
ContentBehind.append(Text[j])
j += 1
Left = Container()
Left.Contains = ContentBefore
Right = Container()
Right.Contains = ContentBehind
Text = Tee(Left, Right, Text[i])
return Text
ContentBefore.append(Text[i])
i += 1
# parse a Text
@staticmethod
def parseText(Text):
Letters = "abcdefghijklmnopqrstuvwxyz"
BigLetters = Letters.upper()
TempWord = ""
Return = []
for Sign in Text:
if Sign == " ":
Return.append(Word(TempWord))
TempWord = ""
elif Sign == ".":
Return.append(Word(TempWord))
TempWord = ""
Return.append(Point())
elif Sign == ",":
Return.append(Word(TempWord))
TempWord = ""
Return.append(Comma())
else:
TempWord += Sign
if TempWord != "":
Return.append(Word(TempWord))
return Return
# split a List with Buckets into setences
@staticmethod
def splitIntoSentences(List):
Return = []
ActualSentence = []
for Element in List:
if Element.getType() == 4: # if it is a point
Return.append(ActualSentence)
ActualSentence = []
else:
ActualSentence.append(Element)
if len(ActualSentence) != 0:
Return.append(ActualSentence)
return Return
class AiContext:
    """Receiver for document fragments; each one is echoed to stdout."""

    def __init__(self):
        pass

    @staticmethod
    def _emit(Label, Text):
        # Every fragment is written as "<label><text>" on its own line.
        print(Label + Text)

    def addNormalText(self, Text):
        """Report a run of ordinary text."""
        self._emit("Normal Text:", Text)

    def addTopicQuote(self, Text):
        """Report a topic quote."""
        self._emit("Topic quote:", Text)

    def addLink(self, Text):
        """Report a link."""
        self._emit("Link:", Text)

    def addHeading(self, Text):
        """Report a heading."""
        self._emit("Heading:", Text)
class TestClassifier(Classifier):
    """Toy classifier: only word index 0 starts a pattern, followed by 5."""

    def matchFirstWord(self, WordIndex):
        """Return (matched, continuation patterns) for the given word index."""
        print("word index"+ str(WordIndex))
        if WordIndex != 0:
            return (False, [[]])
        print("return ...")
        return (True, [[5]])
# --- ad-hoc experiments; everything below runs at import time ---
# Classifier smoke test: TestClassifier only starts a match on word index 0 and
# then requires index 5. Here 0 is followed by 6, so the result is False.
TC = TestClassifier()
Result = TC.classifySentence([WordIndex(1), WordIndex(0), WordIndex(6)])
print("result" + str(Result))
# NOTE(review): this busy-wait never exits, so none of the statements below are
# ever reached - presumably a deliberate "stop here" while debugging; confirm.
while True:
pass
# TODO 4 tuple db
Wp = Wordprocessor()
# wolves can reduce the flow of blood near their skin to conserve body heat.
# A is only referenced by the commented-out lines that follow.
A = [Word("wolves"), Word("can"), Word("hi")]
#Wp.Text = A
#B = Wordprocessor.splitByWord(Wp.Text, "can")
#print(B.debug())
# Tokenize the example sentence and split it at the word "can".
TextContainer = Wordprocessor.parseText("wolves can reduce the flow of blood near their skin to conserve body heat")
B = Wordprocessor.splitByWord(TextContainer, "can")
print(B.debug())
# NOTE(review): hard-coded path; the handle is never closed, and Content is only
# used by the commented-out WikiParser calls.
f = open("C:\\users\\rob\\terrorism.txt", "r", encoding='latin-1')
Content = f.read()
AiContextObject = AiContext()
##WikiParserO = WikiParser(AiContextObject)
#WikiParserO.parse(Content)
### ---
# Parse two example sentences; only the first one is handed to the engine.
Text = "A Hourse is an animal which has four legs. A rabbit is a animal, too."
Parsed = Wordprocessor.parseText(Text)
Sentences = Wordprocessor.splitIntoSentences(Parsed)
# Constructing AiEngine reads the animal TSV file from its hard-coded path.
OAiEngine = AiEngine()
OAiEngine.addSentence(Sentences[0])
#Csv = CsvReader()
#Csv.readFile("C:\\users\\rob\\animalNamesRaw-txt.tsv")
from MiniClasses import *
class Classifier:
    """Base class for matching word-index patterns against a sentence.

    Subclasses supply matchFirstWord(); classifySentence() tries every
    position of the sentence as the start of a pattern.
    """

    def __init__(self):
        pass

    def classifySentence(self, Sentence):
        """Return True if any known pattern occurs in Sentence.

        A pattern is its first word (recognized by matchFirstWord()) plus a
        list of word indices that must follow directly. The sentence
        matches as soon as one pattern has been consumed completely.

        Args:
            Sentence: Sequence of buckets offering getType(); buckets with
                type tag 6 (word indices) must also offer getIndex().

        Returns:
            True on the first complete match, otherwise False.
        """
        for Start, Element in enumerate(Sentence):
            if Element.getType() != 6:
                continue
            MatchResult = self.matchFirstWord(Element.getIndex())
            if not MatchResult[0]:
                continue
            if self._matchesContinuation(Sentence, Start + 1, MatchResult[1]):
                return True
        return False

    @staticmethod
    def _matchesContinuation(Sentence, Position, Patterns):
        # True if one of Patterns is matched by the words from Position on.
        RemainingPatterns = Patterns
        while True:
            # Nothing (more) is required once there are no patterns at all
            # or one of them is used up. Checking this first also keeps
            # Pattern[0] below from ever being evaluated on an empty pattern.
            if not RemainingPatterns:
                return True
            if any(len(Pattern) == 0 for Pattern in RemainingPatterns):
                return True
            if Position == len(Sentence):
                # sentence is not long enough
                return False
            if Sentence[Position].getType() != 6:
                # only an uninterrupted run of words can match
                return False
            ActualWordIndex = Sentence[Position].getIndex()
            RemainingPatterns = [
                Pattern[1:]
                for Pattern in RemainingPatterns
                if Pattern[0] == ActualWordIndex
            ]
            if not RemainingPatterns:
                # no remaining pattern has matched, so the search failed
                return False
            Position += 1

    # needs to be overridden by the subclass
    def matchFirstWord(self, WordIndex):
        """Decide whether WordIndex can start a pattern.

        Args:
            WordIndex: Index of the candidate first word.

        Returns:
            A tuple (matched, patterns): matched is a bool, patterns is a
            list of word-index lists that may follow the first word.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("matchFirstWord must be overridden by a subclass")
class Bucket:
    """Common base of all token objects; it only stores a type tag.

    Tags passed by the classes in this file: 0 Word, 1 Container, 2 Tee,
    3 GeneralTerm, 4 Point, 5 Comma, 6 WordIndex.
    """

    def __init__(self, Type):
        self.Type = Type

    def getType(self):
        """Return the type tag given at construction."""
        return self.Type
class Word(Bucket):
    """A plain word as it appeared in the text (type tag 0)."""

    def __init__(self, Text):
        super().__init__(0)
        self.Text = Text

    def getText(self):
        """Return the word's text."""
        return self.Text

    def debug(self):
        """Return the debug representation, which is the text itself."""
        return self.Text
class Container(Bucket):
    """An ordered group of buckets (type tag 1)."""

    def __init__(self):
        super().__init__(1)
        self.Contains = []

    def debug(self):
        """Return "[a, b, ]": each member's debug text followed by ", "."""
        Parts = "".join(Content.debug() + ", " for Content in self.Contains)
        return "[" + Parts + "]"
class Tee(Bucket):
    """A split point: a node with the buckets left and right of it (type tag 2)."""

    def __init__(self, Left, Right, Node):
        super().__init__(2)
        self.Left = Left
        self.Right = Right
        self.Node = Node

    def debug(self):
        """Return "<left|node|right>" built from the parts' debug texts."""
        LeftText = self.Left.debug()
        NodeText = self.Node.debug()
        RightText = self.Right.debug()
        return "<" + LeftText + "|" + NodeText + "|" + RightText + ">"
class GeneralTerm(Bucket):
    """A bucket wrapping arbitrary contents (type tag 3).

    Derives from Bucket so that getType() is available, like on every
    other bucket class.
    """

    def __init__(self, Contains):
        Bucket.__init__(self, 3)
        self.Contains = Contains

    def getContains(self):
        """Return the wrapped contents unchanged."""
        return self.Contains
class Point(Bucket):
    """A full stop; used as the sentence delimiter (type tag 4)."""

    def __init__(self):
        super().__init__(4)

    def debug(self):
        """Return the fixed debug text."""
        return "POINT"
class Comma(Bucket):
    """A comma (type tag 5)."""

    def __init__(self):
        super().__init__(5)

    def debug(self):
        """Return the fixed debug text."""
        return "COMMA"
class WordIndex(Bucket):
    """A word stored as an index instead of as text (type tag 6)."""

    def __init__(self, Index):
        super().__init__(6)
        self.Index = Index

    def getIndex(self):
        """Return the stored index."""
        return self.Index

    def debug(self):
        """Return "Wordindex <index>"."""
        return "Wordindex %s" % (self.Index,)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment