Last active
April 15, 2021 12:14
-
-
Save PtrMan/9312c68f6b32ca1376da0b5519aa1c93 to your computer and use it in GitHub Desktop.
NLP AI experiment which I did in 2012
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# TODO< tests for the classifier algorithm >
from MiniClasses import * | |
from Classifier import * | |
class AiEngine:
    """Vocabulary store and sentence pre-processor of the NLP experiment.

    Words live in ``self.Words``; the position of a word in that list is its
    word index.  The list is seeded so that "this", "that", "is" and "not"
    always own the indices 0-3, which ``patternMatch0`` relies on.
    """

    # Bucket type codes this class tests ``getType()`` against.
    TypeWord = 0
    TypeComma = 5
    TypeWordIndex = 6

    # Fixed indices of the seed words in self.Words.
    IndexThis = 0
    IndexThat = 1
    IndexIs = 2
    IndexNot = 3

    # File the animal-name table is read from when no path is given.
    DefaultAnimalCsvPath = "C:\\users\\rob\\animalNamesRaw-txt.tsv"

    def __init__(self, AnimalCsvPath=None):
        """Seed the vocabulary and load the animal-name table.

        AnimalCsvPath -- file handed to ``CsvReader.readFile``; when omitted,
                         ``DefaultAnimalCsvPath`` is used.

        Every row returned by ``CsvReader.getLines()`` becomes one dict in
        ``self.AnimalData`` with the keys "animal", "young", "female", "male"
        and "group" (taken from columns 0-4 of the row).  Each value has the
        form ``[ [word indices] ... ]``: one inner list per comma-separated
        name found in the column.
        """
        self.Words = ["this", "that", "is", "not"]
        self.Sentences = []
        self.AnimalData = []

        if AnimalCsvPath is None:
            AnimalCsvPath = self.DefaultAnimalCsvPath

        # load and parse animaldata
        AnimalCsv = CsvReader()
        AnimalCsv.readFile(AnimalCsvPath)

        Columns = ("animal", "young", "female", "male", "group")
        for Dataset in AnimalCsv.getLines():
            NewAnimalData = {}
            for Position, Key in enumerate(Columns):
                NewAnimalData[Key] = self._indiceText(Dataset[Position])
            self.AnimalData.append(NewAnimalData)

    def _indiceText(self, Raw):
        """Turn "name one, name two" into one list of word indices per name."""
        Return = []
        for Phrase in Raw.split(","):
            # getIndexForWord expects lower-case input, and split() without
            # an argument ignores leading/repeated blanks, so "a, b" does not
            # register the empty string as a word.
            WordIndices = [self.getIndexForWord(Token)
                           for Token in Phrase.lower().split()]
            if WordIndices:
                Return.append(WordIndices)
        return Return

    def getIndexForWord(self, Word):
        """Return the index of Word in ``self.Words``, adding it if unknown.

        Word must already be lower-case; no case folding happens here.
        """
        try:
            return self.Words.index(Word)
        except ValueError:
            self.Words.append(Word)
            return len(self.Words) - 1

    def addSentence(self, Sentence):
        """Convert a parsed sentence to word indices and store it.

        Word buckets (type 0) are lower-cased and replaced by ``WordIndex``
        buckets, commas (type 5) are replaced by a fresh ``Comma``.  The
        converted sentence is appended to ``self.Sentences``.
        """
        TempSentence = []
        for Object in Sentence:
            ObjectType = Object.getType()
            if ObjectType == self.TypeWord:
                LowerWord = Object.getText().lower()
                TempSentence.append(WordIndex(self.getIndexForWord(LowerWord)))
            elif ObjectType == self.TypeComma:
                TempSentence.append(Comma())
            # TODO< throw something > -- every other bucket type is skipped
        self.Sentences.append(TempSentence)

    def patternMatch0(self, Sentence):
        """Search Sentence for the pattern ``This|That (***) is (not) ###.``

        The sentence has to consist only of WordIndex objects and some
        punctuation.  The matcher is unfinished: it walks the pattern up to
        the optional 'not' but does not report a result yet, so it returns
        None on every path.
        """
        if len(Sentence) < 2:
            # TODO
            return None

        First = Sentence[0]
        if First.getType() != self.TypeWordIndex:  # first is no word
            # TODO
            return None
        if First.getIndex() not in (self.IndexThis, self.IndexThat):
            # it doesn't start with 'this' or 'that'
            # TODO
            return None
        # TODO save if it is this or that

        # search for 'is' on index 1 or 2
        Second = Sentence[1]
        if Second.getType() != self.TypeWordIndex:  # second is no word
            # TODO
            return None

        if Second.getIndex() == self.IndexIs:
            # matched: This|That is ...
            SecoundArgumentThere = False  # is the (***) part there?
        else:
            # try to match against: This|That *** is ...
            if len(Sentence) < 3:
                # a two-element sentence has no slot left for 'is'
                # TODO
                return None
            Third = Sentence[2]
            if Third.getType() != self.TypeWordIndex:  # third is no word
                # TODO
                return None
            if Third.getIndex() != self.IndexIs:
                # we didn't match anything
                # TODO
                return None
            SecoundArgumentThere = True

        # index of the first element behind 'is'
        ActualIndex = 3 if SecoundArgumentThere else 2

        if len(Sentence) < ActualIndex + 1:  # is this right?
            # error, sentence is too short
            # TODO
            pass

        # now we search for 'not'
        # TODO
        return None
#are | |
#can | |
class Wordprocessor:
    """Stateless helpers that tokenise text into buckets and regroup them."""

    def __init__(self):
        pass

    # TODO< allow return of None? >
    @staticmethod
    def splitByWord(Text, PlainWord):
        """Split a bucket list at the first word bucket whose text is PlainWord.

        Returns ``Tee(Left, Right, Node)`` where Node is the matching bucket
        and Left/Right are Containers holding the buckets before/behind it.
        Returns None when no bucket matches.
        """
        for Position, Element in enumerate(Text):
            # Test the type first: only word buckets (type 0) are asked for
            # their text, other bucket kinds are compared by type alone.
            if Element.getType() == 0 and Element.getText() == PlainWord:
                Left = Container()
                Left.Contains = list(Text[:Position])
                Right = Container()
                Right.Contains = list(Text[Position + 1:])
                return Tee(Left, Right, Element)
        return None

    # parse a Text
    @staticmethod
    def parseText(Text):
        """Tokenise Text into Word, Point and Comma buckets.

        A blank, "." or "," ends the current word; "." additionally emits a
        Point and "," a Comma.  Separators that follow each other directly
        (such as ". ") do not produce empty Word buckets.
        """
        Return = []
        TempWord = ""
        for Sign in Text:
            if Sign in (" ", ".", ","):
                if TempWord != "":
                    Return.append(Word(TempWord))
                    TempWord = ""
                if Sign == ".":
                    Return.append(Point())
                elif Sign == ",":
                    Return.append(Comma())
            else:
                TempWord += Sign
        if TempWord != "":
            Return.append(Word(TempWord))
        return Return

    # split a List with Buckets into sentences
    @staticmethod
    def splitIntoSentences(List):
        """Group buckets into sentences, using Point buckets as terminators.

        The Point buckets (type 4) themselves are dropped.  Buckets behind
        the last Point form a final sentence if there are any.
        """
        Return = []
        ActualSentence = []
        for Element in List:
            if Element.getType() == 4:  # if it is a point
                Return.append(ActualSentence)
                ActualSentence = []
            else:
                ActualSentence.append(Element)
        if ActualSentence:
            Return.append(ActualSentence)
        return Return
class AiContext:
    """Receiver for parsed document parts; each part is echoed to stdout."""

    def __init__(self):
        pass

    @staticmethod
    def _announce(Label, Text):
        # Shared output routine: label directly followed by the text.
        print(Label + Text)

    def addNormalText(self, Text):
        """Print Text prefixed with "Normal Text:"."""
        self._announce("Normal Text:", Text)

    def addTopicQuote(self, Text):
        """Print Text prefixed with "Topic quote:"."""
        self._announce("Topic quote:", Text)

    def addLink(self, Text):
        """Print Text prefixed with "Link:"."""
        self._announce("Link:", Text)

    def addHeading(self, Text):
        """Print Text prefixed with "Heading:"."""
        self._announce("Heading:", Text)
class TestClassifier(Classifier):
    """Classifier used for manual testing: only word index 0 starts a match."""

    def matchFirstWord(self, WordIndex):
        """Return (True, [[5]]) for word index 0 and (False, [[]]) otherwise.

        The argument and the taken branch are traced on stdout.
        """
        print("word index"+ str(WordIndex))
        if WordIndex != 0:
            return (False, [[]])
        print("return ...")
        return (True, [[5]])
# --- manual experiment script -------------------------------------------
# Classify a three-element sentence given as word indices and print the
# outcome.
TC = TestClassifier()
Result = TC.classifySentence([WordIndex(1), WordIndex(0), WordIndex(6)])
print("result" + str(Result))
# NOTE(review): this loop never terminates, so none of the statements below
# are ever executed -- presumably a deliberate halt while experimenting with
# the classifier; confirm before removing it.
while True:
    pass
# TODO 4 tuple db
Wp = Wordprocessor()
# wolves can reduce the flow of blood near their skin to conserve body heat.
A = [Word("wolves"), Word("can"), Word("hi")]
#Wp.Text = A
#B = Wordprocessor.splitByWord(Wp.Text, "can")
#print(B.debug())
# Tokenise the example sentence, split it at the word "can" and print the
# debug form of the result.
TextContainer = Wordprocessor.parseText("wolves can reduce the flow of blood near their skin to conserve body heat")
B = Wordprocessor.splitByWord(TextContainer, "can")
print(B.debug())
# NOTE(review): the file handle is never closed and Content is only used by
# the commented-out WikiParser lines below.
f = open("C:\\users\\rob\\terrorism.txt", "r", encoding='latin-1')
Content = f.read()
AiContextObject = AiContext()
##WikiParserO = WikiParser(AiContextObject)
#WikiParserO.parse(Content)
### ---
# Parse a two-sentence text and feed its first sentence to the AI engine.
Text = "A Hourse is an animal which has four legs. A rabbit is a animal, too."
Parsed = Wordprocessor.parseText(Text)
Sentences = Wordprocessor.splitIntoSentences(Parsed)
OAiEngine = AiEngine()
OAiEngine.addSentence(Sentences[0])
#Csv = CsvReader()
#Csv.readFile("C:\\users\\rob\\animalNamesRaw-txt.tsv")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from MiniClasses import * | |
class Classifier:
    """Finds word-index patterns inside a sentence.

    A subclass supplies the patterns through ``matchFirstWord``;
    ``classifySentence`` then checks whether the words following a matching
    first word complete at least one of the returned patterns.
    """

    def __init__(self):
        pass

    def classifySentence(self, Sentence):
        """Return True if a pattern matches somewhere in Sentence, else False.

        Sentence is a list of buckets.  Every WordIndex bucket (type 6) is
        offered to ``matchFirstWord`` as a possible pattern start; the
        buckets directly behind it must then spell out one of the returned
        continuation patterns.  A bucket that is not a WordIndex ends the
        attempt for that start position.
        """
        for Start, Element in enumerate(Sentence):
            if Element.getType() != 6:
                continue
            MatchResult = self.matchFirstWord(Element.getIndex())
            if not MatchResult[0]:
                continue
            if self._matchesTail(Sentence, Start + 1, MatchResult[1]):
                return True
        return False

    @staticmethod
    def _matchesTail(Sentence, Position, Patterns):
        """Check whether Sentence[Position:] completes one of Patterns."""
        # no continuation required at all: the first word alone is a match
        if len(Patterns) == 0:
            return True

        Remaining = Patterns
        while True:
            # One fully consumed pattern is enough for a match.  Checking
            # this before indexing also guarantees Pattern[0] exists below.
            if any(len(Pattern) == 0 for Pattern in Remaining):
                return True
            if Position >= len(Sentence):
                # sentence is not long enough
                return False
            Element = Sentence[Position]
            if Element.getType() != 6:
                return False
            ActualWordIndex = Element.getIndex()
            # keep the patterns that continue with this word, minus that word
            Remaining = [Pattern[1:] for Pattern in Remaining
                         if Pattern[0] == ActualWordIndex]
            if not Remaining:
                # no remaining pattern has matched, so the search was unsuccessful
                return False
            Position += 1

    # needs to be overwritten by subclass
    def matchFirstWord(self, WordIndex):
        """Decide whether WordIndex can start a pattern.

        Returns (Boolean: has it matched?,
                 [[indices of the words that have to follow] ...]).
        Raises NotImplementedError unless a subclass overrides it.
        """
        raise NotImplementedError("matchFirstWord must be overwritten by a subclass")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Bucket:
    """Base of all parse-tree elements; carries a numeric type code."""

    def __init__(self, Type):
        # numeric code identifying the concrete bucket kind
        self.Type = Type

    def getType(self):
        """Return the type code given at construction."""
        return self.Type
class Word(Bucket):
    """Bucket of type 0 holding the text of a single word."""

    def __init__(self, Text):
        super().__init__(0)
        self.Text = Text

    def getText(self):
        """Return the word text."""
        return self.Text

    def debug(self):
        """Return the word text as debug representation."""
        return self.Text
class Container(Bucket):
    """Bucket of type 1 holding a list of other buckets in ``Contains``."""

    def __init__(self):
        super().__init__(1)
        self.Contains = []

    def debug(self):
        """Return "[a, b, ]": every element's debug text followed by ", "."""
        Parts = [Content.debug() + ", " for Content in self.Contains]
        return "[" + "".join(Parts) + "]"
class Tee(Bucket):
    """Bucket of type 2: a Node bucket with a Left and a Right branch."""

    def __init__(self, Left, Right, Node):
        super().__init__(2)
        self.Left, self.Right, self.Node = Left, Right, Node

    def debug(self):
        """Return "<left|node|right>" built from the parts' debug texts."""
        Pieces = ["<", self.Left.debug(), "|", self.Node.debug(), "|", self.Right.debug(), ">"]
        return "".join(Pieces)
class GeneralTerm(Bucket):
    """Bucket of type 3 wrapping an arbitrary ``Contains`` payload.

    Deriving from Bucket gives the class ``getType()``, which the other
    bucket classes have and which bucket lists are filtered by.
    """

    def __init__(self, Contains):
        Bucket.__init__(self, 3)
        self.Contains = Contains

    def getContains(self):
        """Return the payload given at construction."""
        return self.Contains
class Point(Bucket):
    """Bucket of type 4 standing for a full stop."""

    def __init__(self):
        super().__init__(4)

    def debug(self):
        """Return the fixed debug text "POINT"."""
        return "POINT"
class Comma(Bucket):
    """Bucket of type 5 standing for a comma."""

    def __init__(self):
        super().__init__(5)

    def debug(self):
        """Return the fixed debug text "COMMA"."""
        return "COMMA"
class WordIndex(Bucket):
    """Bucket of type 6 that stores an index instead of the word text."""

    def __init__(self, Index):
        super().__init__(6)
        self.Index = Index

    def getIndex(self):
        """Return the stored index."""
        return self.Index

    def debug(self):
        """Return "Wordindex " followed by the stored index."""
        return "".join(("Wordindex ", str(self.Index)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment