Last active: May 3, 2024 01:17
Revisions
- VTSTech revised this gist on May 3, 2024: 1 changed file with 1 addition and 1 deletion.
@@ -30,7 +30,7 @@ def simple(target,outstr):
        return outstr
def banner():
    print("NLTK Script v0.43 Written by VTSTech (veritas@vts-tech.org) [03-13-2020]")
-   print("GitHub: https://gist.github.com/Veritas83/620f80a878acccf9eb64b34193221a67\n")
+   print("GitHub: https://gist.github.com/VTSTech/620f80a878acccf9eb64b34193221a67\n")
    print("Usage:", script_fn,"-v mode \"word or sentence\"\n")
def spc_tok(target):
    outstr=SpaceTokenizer().tokenize(target)
- VTSTech revised this gist on Mar 14, 2020: 1 changed file with 20 additions and 30 deletions.
@@ -1,4 +1,4 @@
-#NLTK Script v0.42 2020-03-13 10:08:47 PM
+#NLTK Script v0.43 2020-03-14 12:16:31 AM
#Written by VTSTech (veritas@vts-tech.org)
#Various functions inspired by code from sentdex/pythonprogramming.net
#https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/

@@ -20,34 +20,27 @@
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
posts = nltk.corpus.nps_chat.xml_posts()[:10000]
def simple(target,outstr):
    if (verbose==1):
        print("[+] Target:", target)
        print("[+] Output:",end='')
        print(outstr)
        print("[+] Complete.")
    else:
        return outstr
def banner():
    print("NLTK Script v0.43 Written by VTSTech (veritas@vts-tech.org) [03-13-2020]")
    print("GitHub: https://gist.github.com/Veritas83/620f80a878acccf9eb64b34193221a67\n")
    print("Usage:", script_fn,"-v mode \"word or sentence\"\n")
def spc_tok(target):
    outstr=SpaceTokenizer().tokenize(target)
    return simple(target,outstr)
def sent_tok(target):
    outstr=sent_tokenize(target)
    return simple(target,outstr)
def word_tok(target):
    outstr=word_tokenize(target)
    return simple(target,outstr)
def syn_ant(target):
    #print("[+] wordnet imported.")
    for syn in wordnet.synsets(target):

@@ -160,13 +153,8 @@ def lemma(target,pos):
    else:
        return(lemmatizer.lemmatize(target))
def postag(target):
    outstr=pos_tag(word_tok(target))
    return simple(target,outstr)
def nltk_download():
    print("[+] NLTK Downloader launching...\n")
    nltk.download()

@@ -220,7 +208,9 @@ def main(mode,target,pos):
        nltk_download()
    elif (totalargs <= 2) and ("VTSTech-NLTK.py" in script_fn):
        banner()
        print("-v verbose output (shows banner, target, definitions, etc)\n\nModes:\n\nspc_tok (SpaceTokenizer) \nsent_tok (sent_tokenize)")
        print("word_tok (word_tokenize) \nsyn_ant (wordnet.synsets)\nhyperon (hyper/hyponyms)\nstem (PorterStemmer)\nsnow (SnowballStemmer)")
        print("lemma (WordNetLemmatizer)\npos_tag (Part-of-Speech)\nner (Named Entity Recognition)\ndiag_act (Dialogue Action)\nnltk_download")
    else:
        if (verbose==1): banner()
        if ("spc_tok" in mode): # expects "a complete sentence."
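For context, the v0.43 refactor shown above routes every mode through the new simple() helper, which either prints the verbose report or hands the raw value back to the caller. A minimal standalone sketch of that pattern, not the full script: verbose is a parameter here rather than a module-level flag, and NLTK's punkt tokenizer data is assumed installed (nltk.download('punkt')).

from nltk.tokenize import word_tokenize

def simple(target, outstr, verbose=0):
    # Either print a verbose report or return the raw result.
    if verbose == 1:
        print("[+] Target:", target)
        print("[+] Output:", end='')
        print(outstr)
        print("[+] Complete.")
    else:
        return outstr

def word_tok(target, verbose=0):
    # Under this pattern each mode wrapper collapses to a one-liner.
    return simple(target, word_tokenize(target), verbose)

print(word_tok("The quick brown fox jumps."))
# ['The', 'quick', 'brown', 'fox', 'jumps', '.']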
- VTSTech revised this gist on Mar 14, 2020: 1 changed file with 1 addition and 1 deletion.
@@ -291,7 +291,7 @@ def main(mode,target,pos):
        banner()
        print("Too many arguments! Check command line. Use \" to wrap series of words.")
        quit()
-   elif (sys.argv[x] == "-v") or (totalargs==1):
+   elif (sys.argv[x] == "-v") or (totalargs==1 and "VTSTech-NLTK" in script_fn):
        verbose=1
    elif (sys.argv[x] == "spc_tok"):
        mode=sys.argv[x]
- VTSTech revised this gist on Mar 14, 2020: 1 changed file with 248 additions and 94 deletions.
@@ -1,7 +1,9 @@
-#NLTK Script v0.41 2020-03-13 1:17:23 AM
+#NLTK Script v0.42 2020-03-13 10:08:47 PM
#Written by VTSTech (veritas@vts-tech.org)
#Various functions inspired by code from sentdex/pythonprogramming.net
#https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/
+#Various functions inspired by code from Natural Language Processing with Python
+#by Steven Bird, Ewan Klein and Edward Loper - http://www.nltk.org/book/ch01.html
import sys, nltk, os, string, random
from nltk import pos_tag

@@ -12,101 +14,188 @@
synonyms = []
antonyms = []
hypernyms = []
hyponyms = []
train_text = state_union.raw("1999-Clinton.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
posts = nltk.corpus.nps_chat.xml_posts()[:10000]
def banner():
    print("NLTK Script v0.42 03-13-2020\nWritten by VTSTech (veritas@vts-tech.org)\nGitHub: https://gist.github.com/Veritas83/620f80a878acccf9eb64b34193221a67\n")
    print("Usage:", script_fn,"-v mode \"word or sentence\"\n")
def spc_tok(target):
    #print("[+] SpaceTokenizer imported.")
    if (verbose==1):
        print("[+] Target:", target)
        print("[+] Output:",end='')
        print(SpaceTokenizer().tokenize(target))
        print("[+] Complete.")
    else:
        return SpaceTokenizer().tokenize(target)
def sent_tok(target):
    if (verbose==1):
        print("[+] Target:", target)
        print("[+] Output:",end='')
        print(sent_tokenize(target))
        print("[+] Complete.")
    else:
        return sent_tokenize(target)
def word_tok(target):
    if (verbose==1):
        print("[+] Target:", target)
        print("[+] Output:",end='')
        print(word_tokenize(target))
        print("[+] Complete.")
    else:
        return word_tokenize(target)
def syn_ant(target):
    #print("[+] wordnet imported.")
    for syn in wordnet.synsets(target):
        for l in syn.lemmas():
            synonyms.append(l.name())
            if l.antonyms():
                antonyms.append(l.antonyms()[0].name())
    if (verbose==1): print("[+] Target:", target)
    temp = wordnet.synsets(target)
    if (verbose==1):
        if (len(temp) >= 1):
            print("[+] Defined:", temp[0].definition())
        print("[+] Output:")
        if (len(set(synonyms))>0):
            print("Syn:", set(synonyms))
        else:
            print("Syn: Failed to find synonym!")
        if (len(set(antonyms))>0):
            print("Ant:", set(antonyms))
        else:
            print("Ant: Failed to find antonym!")
        print("[+] Complete.")
    else:
        if (len(set(synonyms))>0):
            outstr="Syn:", set(synonyms)
        else:
            outstr="Syn: Failed!"
        if (len(set(antonyms))>0):
            outstr2="Ant:", set(antonyms)
        else:
            outstr2="Ant: Failed!"
        return outstr,outstr2
def hyperon(target):
    #print("[+] wordnet imported.")
    for syn in wordnet.synsets(target):
        for l in syn.hypernyms():
            hypernyms.append(l.name())
        for l in syn.hyponyms():
            hyponyms.append(l.name())
    if (verbose==1):
        print("[+] Target:", target)
        temp = wordnet.synsets(target)
        if (len(temp) >= 1):
            print("[+] Defined:", temp[0].definition())
        print("[+] Output:")
        if (len(set(hypernyms))>0):
            print("Hypernyms:", set(hypernyms))
        else:
            print("Hypernyms: Failed to find hypernym!")
        if (len(set(hyponyms))>0):
            print("Hyponyms:", set(hyponyms))
        else:
            print("Hyponyms: Failed to find hyponyms!")
        print("[+] Complete.")
    else:
        if (len(set(hypernyms))>0):
            outstr=("Hyper:", set(hypernyms))
        else:
            outstr=("Hyper: Failed!")
        if (len(set(hyponyms))>0):
            outstr=outstr+("Hypon:", set(hyponyms))
        else:
            outstr=outstr+("Hypon: Failed!")
        return outstr
def stem(target):
    ps = PorterStemmer()
    if (verbose==1):
        print("[+] Target:",target)
        print("[+] Output: ", end='')
        words = SpaceTokenizer().tokenize(target)
        for w in words:
            print(ps.stem(w))
        print("[+] Complete.")
    else:
        words = SpaceTokenizer().tokenize(target)
        outstr=""
        for w in words:
            outstr=outstr+' '+(ps.stem(w))
        return outstr.lstrip()
def snowball(target):
    stemmer = SnowballStemmer("english")
    if (verbose==1):
        print("[+] Target:",target)
        words = SpaceTokenizer().tokenize(target)
        #print("DEBUG:", len(words))
        print("[+] Output: ", end='')
        for w in words:
            print(stemmer.stem(w))
        print("[+] Complete.")
    else:
        words = SpaceTokenizer().tokenize(target)
        outstr=""
        for w in words:
            outstr=outstr+' '+(stemmer.stem(w))
        return outstr.lstrip()
def lemma(target,pos):
    lemmatizer = WordNetLemmatizer()
    #print("[+] WordNetLemmatizer imported.")
    #pos=a,s,v,r
    if (verbose==1):
        print("[+] Target:",target)
        print("[+] Output: ",end='')
        if (totalargs == 5):
            print(lemmatizer.lemmatize(target, pos=pos))
        else:
            print(lemmatizer.lemmatize(target))
        print("[+] Complete.")
    else:
        if (totalargs == 5):
            return(lemmatizer.lemmatize(target, pos=pos))
        else:
            return(lemmatizer.lemmatize(target))
def postag(target):
    if (verbose==1):
        print("[+] Target:", target)
        print("[+] Output:", end='')
        print(pos_tag(SpaceTokenizer().tokenize(target)))
        print("[+] Complete.")
    else:
        return(pos_tag(SpaceTokenizer().tokenize(target)))
def nltk_download():
    print("[+] NLTK Downloader launching...\n")
    nltk.download()
    print("[+] Complete.")
def ner(target):
    tokenized = custom_sent_tokenizer.tokenize(target)
    outstr=""
    if (verbose==1):
        try:
            for i in tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                namedEnt = nltk.ne_chunk(tagged, binary=True)
                print("[+] Target:", target)
                print("[+] Output:", end='')
                print(namedEnt)
        except Exception as e:
            print(str(e))
        print("[+] Complete.")
    else:
        try:
            for i in tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                namedEnt = nltk.ne_chunk(tagged, binary=True)
                outstr=namedEnt
        except Exception as e:
            print(str(e))
        return outstr
def diag_act(target):
    def dialogue_act_features(post):
        features = {}

@@ -118,60 +207,125 @@ def dialogue_act_features(post):
    train_set, test_set = featuresets[size:], featuresets[:size]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    #print("DEBUG:", nltk.classify.accuracy(classifier, test_set))
    if (verbose==1):
        print("[+] Target:", target)
        print("[+] Output:", classifier.classify(dialogue_act_features(target)))
        print("[+] Complete.")
    else:
        return classifier.classify(dialogue_act_features(target))
###
def main(mode,target,pos):
    if (totalargs == 2) and ("nltk_download" in mode):
        if (verbose==1): banner()
        nltk_download()
    elif (totalargs <= 2) and ("VTSTech-NLTK.py" in script_fn):
        banner()
        print("-v verbose output (shows banner, target, definitions, etc)\n\nModes:\n\nspc_tok (SpaceTokenizer) \nsent_tok (sent_tokenize) \nword_tok (word_tokenize) \nsyn_ant (wordnet.synsets)\nhyperon (hyper/hyponyms)\nstem (PorterStemmer)\nsnow (SnowballStemmer)\nlemma (WordNetLemmatizer)\npos_tag (Part-of-Speech)\nner (Named Entity Recognition)\ndiag_act (Dialogue Action)\nnltk_download")
    else:
        if (verbose==1): banner()
        if ("spc_tok" in mode): # expects "a complete sentence."
            if (verbose==1): spc_tok(target)
            else: print(spc_tok(target))
        if ("sent_tok" in mode): # expects "a complete sentence. Or two. Or three."
            if (verbose==1): sent_tok(target)
            else: print(sent_tok(target))
        if ("word_tok" in mode): # expects "a complete sentence."
            if (verbose==1): word_tok(target)
            else: print(word_tok(target))
        if ("syn_ant" in mode): # expects word
            if (verbose==1): syn_ant(target)
            else: print(syn_ant(target))
        if ("stem" in mode): # expects word
            if (verbose==1): stem(target)
            else: print(stem(target))
        if ("lemma" in mode): # expects word (tag)
            if (verbose==1): lemma(target,pos)
            else: print(lemma(target,pos))
        if ("pos_tag" in mode): # expects "a complete sentence."
            if (verbose==1): postag(target)
            else: print(postag(target))
        if ("ner" in mode): # expects "a complete sentence."
            if (verbose==1): ner(target)
            else: print(ner(target))
        if ("diag_act" in mode): # expects "a complete sentence."
            if (verbose==1): diag_act(target)
            else: print(diag_act(target))
        if ("snow" in mode): # expects "a complete sentence."
            if (verbose==1): snowball(target)
            else: print(snowball(target))
        if ("hyperon" in mode): # expects "a complete sentence."
            if (verbose==1): hyperon(target)
            else: print(hyperon(target))
###
totalargs = len(sys.argv)
script_fn = sys.argv[0]
mode=""
target=""
pos=""
verbose=0
for x in range(0,totalargs,1):
    #print("DEBUG:",sys.argv[x])
    if (totalargs >= 6):
        banner()
        print("Too many arguments! Check command line. Use \" to wrap series of words.")
        quit()
    elif (sys.argv[x] == "-v") or (totalargs==1):
        verbose=1
    elif (sys.argv[x] == "spc_tok"):
        mode=sys.argv[x]
        target=sys.argv[x+1]
    elif (sys.argv[x] == "sent_tok"):
        mode=sys.argv[x]
        target=sys.argv[x+1]
    elif (sys.argv[x] == "word_tok"):
        mode=sys.argv[x]
        target=sys.argv[x+1]
    elif (sys.argv[x] == "syn_ant"):
        mode=sys.argv[x]
        target=sys.argv[x+1]
    elif (sys.argv[x] == "stem"):
        mode=sys.argv[x]
        target=sys.argv[x+1]
    elif (sys.argv[x] == "lemma"):
        mode=sys.argv[x]
        target=sys.argv[x+1]
        if (totalargs==5): pos=sys.argv[x+2]
    elif (sys.argv[x] == "pos_tag"):
        mode=sys.argv[x]
        target=sys.argv[x+1]
    elif (sys.argv[x] == "ner"):
        mode=sys.argv[x]
        target=sys.argv[x+1]
    elif (sys.argv[x] == "diag_act"):
        mode=sys.argv[x]
        target=sys.argv[x+1]
    elif (sys.argv[x] == "snow"):
        mode=sys.argv[x]
        target=sys.argv[x+1]
    elif (sys.argv[x] == "hyperon"):
        mode=sys.argv[x]
        target=sys.argv[x+1]
main(mode,target,pos)
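The hyperon mode introduced in this revision is, at its core, a WordNet hypernym/hyponym lookup. A minimal standalone sketch, assuming the WordNet corpus is installed (nltk.download('wordnet')) and using "dog" as an arbitrary example word:

from nltk.corpus import wordnet

hypernyms, hyponyms = [], []
for syn in wordnet.synsets("dog"):
    for h in syn.hypernyms():   # more general synsets
        hypernyms.append(h.name())
    for h in syn.hyponyms():    # more specific synsets
        hyponyms.append(h.name())

print("Hyper:", set(hypernyms))  # e.g. includes 'canine.n.02'
print("Hypon:", set(hyponyms))   # e.g. includes 'puppy.n.01'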
- VTSTech revised this gist on Mar 13, 2020: 1 changed file with 88 additions and 53 deletions.
@@ -1,4 +1,4 @@
-#NLTK Script v0.4 2020-03-12 3:12:58 PM
+#NLTK Script v0.41 2020-03-13 1:17:23 AM
#Written by VTSTech (veritas@vts-tech.org)
#Various functions inspired by code from sentdex/pythonprogramming.net
#https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/

@@ -17,24 +17,34 @@ posts = nltk.corpus.nps_chat.xml_posts()[:10000]
def banner():
    print("NLTK Script v0.41 03-13-2020\nWritten by VTSTech (veritas@vts-tech.org)\nGitHub: https://gist.github.com/Veritas83/620f80a878acccf9eb64b34193221a67\n")
    print("Usage:", script_fn,"mode \"word or sentence\"\n")
def spc_tok(target):
    #print("[+] SpaceTokenizer imported.")
    print("[+] Target:", target)
    print("[+] Output:",end='')
    print(SpaceTokenizer().tokenize(target))
    print("[+] Complete.")
def sent_tok(target):
    print("[+] Target:", target)
    print("[+] Output:",end='')
    print(sent_tokenize(target))
    print("[+] Complete.")
def word_tok(target):
    print("[+] Target:", target)
    print("[+] Output:",end='')
    print(word_tokenize(target))
    print("[+] Complete.")
def syn_ant(target):
    #print("[+] wordnet imported.")
    for syn in wordnet.synsets(target):
        for l in syn.lemmas():
            synonyms.append(l.name())
            if l.antonyms():
                antonyms.append(l.antonyms()[0].name())
    print("[+] Target:", target)
    temp = wordnet.synsets(target)
    if (len(temp) >= 1):
        print("[+] Defined:", temp[0].definition())
    print("[+] Output:")
    if (len(set(synonyms))>0):
        print("Syn:", set(synonyms))

@@ -45,60 +55,59 @@ def syn_ant():
    else:
        print("Ant: Failed to find antonym!")
    print("[+] Complete.")
def stem(target):
    ps = PorterStemmer()
    #print("[+] PorterStemmer imported.")
    print("[+] Target:",target)
    print("[+] Output: ", end='')
    words = SpaceTokenizer().tokenize(target)
    for w in words:
        print(ps.stem(w))
    print("[+] Complete.")
def snowball(target):
    stemmer = SnowballStemmer("english")
    #print("[+] PorterStemmer imported.")
    print("[+] Target:",target)
    print("[+] Output: ", end='')
    words = SpaceTokenizer().tokenize(target)
    for w in words:
        print(stemmer.stem(w))
    print("[+] Complete.")
def lemma(target,pos):
    lemmatizer = WordNetLemmatizer()
    #print("[+] WordNetLemmatizer imported.")
    #pos=a,s,v,r
    print("[+] Target:",target)
    print("[+] Output: ",end='')
    if (totalargs == 4):
        print(lemmatizer.lemmatize(target, pos=pos))
    else:
        print(lemmatizer.lemmatize(target))
    print("[+] Complete.")
def postag(target):
    #print("[+] pos_tag, SpaceTokenizer imported.")
    print("[+] Target:", target)
    print("[+] Output:", end='')
    print(pos_tag(SpaceTokenizer().tokenize(target)))
    print("[+] Complete.")
def nltk_download():
    print("[+] NLTK Downloader launching...\n")
    nltk.download()
    print("[+] Complete.")
def ner(target):
    tokenized = custom_sent_tokenizer.tokenize(target)
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged, binary=True)
            print("[+] Target:", target)
            print("[+] Output:", end='')
            print(namedEnt)
    except Exception as e:
        print(str(e))
    print("[+] Complete.")
def diag_act(target):
    def dialogue_act_features(post):
        features = {}
        for word in nltk.word_tokenize(post):

@@ -109,34 +118,60 @@ def dialogue_act_features(post):
    train_set, test_set = featuresets[size:], featuresets[:size]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    #print("DEBUG:", nltk.classify.accuracy(classifier, test_set))
    print("[+] Target:", target)
    print("[+] Output:", classifier.classify(dialogue_act_features(target)))
    print("[+] Complete.")
###
def main(mode,target,pos):
    if (totalargs == 2) and ("nltk_download" in mode):
        banner()
        nltk_download()
    elif (totalargs <= 2) and ("VTSTech-NLTK.py" in script_fn):
        banner()
        print("Modes:\n\nspc_tok (SpaceTokenizer) \nsent_tok (sent_tokenize) \nword_tok (word_tokenize) \nsyn_ant (wordnet.synsets)\nstem (PorterStemmer)\nsnow (SnowballStemmer)\nlemma (WordNetLemmatizer)\npos_tag (Part-of-Speech)\nner (Named Entity Recognition)\ndiag_act (Dialogue Action)\nnltk_download")
    else:
        banner()
        if ("spc_tok" in mode): # expects "a complete sentence."
            spc_tok(target)
        if ("sent_tok" in mode): # expects "a complete sentence. Or two. Or three."
            sent_tok(target)
        if ("word_tok" in mode): # expects "a complete sentence."
            word_tok(target)
        if ("syn_ant" in mode): # expects word
            syn_ant(target)
        if ("stem" in mode): # expects word
            stem(target)
        if ("lemma" in mode): # expects word (tag)
            lemma(target,pos)
        if ("pos_tag" in mode): # expects "a complete sentence."
            postag(target)
        if ("ner" in mode): # expects "a complete sentence."
            ner(target)
        if ("diag_act" in mode): # expects "a complete sentence."
            diag_act(target)
        if ("snow" in mode): # expects "a complete sentence."
            snowball(target)
###
totalargs = len(sys.argv)
script_fn = sys.argv[0]
mode=""
target=""
pos=""
if (totalargs >= 4):
    mode = sys.argv[1]
    target = sys.argv[2]
    pos = sys.argv[3]
    main(mode,target,pos)
elif (totalargs == 3):
    mode = sys.argv[1]
    target = sys.argv[2]
    main(mode,target,pos)
elif (totalargs == 2):
    mode = sys.argv[1]
    main(mode,target,pos)
elif (totalargs == 1):
    main(mode,target,pos)
else:
    banner()
    print("Too many arguments! Check command line. Use \" to wrap sentences.")
    quit()
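The ner() mode shown in this diff is the standard three-step NLTK chunking pipeline; binary=True collapses PERSON, ORGANIZATION and the other entity types into a single NE label. A self-contained sketch, assuming the punkt, averaged_perceptron_tagger, maxent_ne_chunker and words data packages are installed:

import nltk

sentence = "President Clinton spoke to Congress in Washington."
words = nltk.word_tokenize(sentence)            # 1. split into tokens
tagged = nltk.pos_tag(words)                    # 2. part-of-speech tag
named_ent = nltk.ne_chunk(tagged, binary=True)  # 3. chunk named entities
print(named_ent)  # a Tree; NE subtrees mark the detected entities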
- VTSTech revised this gist on Mar 12, 2020: No changes.
- VTSTech revised this gist on Mar 12, 2020: 1 changed file with 16 additions and 4 deletions.
@@ -1,4 +1,4 @@
-#NLTK Script v0.3 2020-03-12 12:32:43 AM
+#NLTK Script v0.4 2020-03-12 3:12:58 PM
#Written by VTSTech (veritas@vts-tech.org)
#Various functions inspired by code from sentdex/pythonprogramming.net
#https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/

@@ -7,6 +7,7 @@
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize, SpaceTokenizer, PunktSentenceTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
+from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import wordnet, state_union, brown
synonyms = []

@@ -16,7 +17,7 @@ posts = nltk.corpus.nps_chat.xml_posts()[:10000]
def banner():
-   print("NLTK Script v0.3 03-12-2020\nWritten by VTSTech (veritas@vts-tech.org)\n")
+   print("NLTK Script v0.4 03-12-2020\nWritten by VTSTech (veritas@vts-tech.org)\n")
    print("Usage:", os.path.basename(__file__),"mode \"word or sentence\"\n")
def spc_tok():
    #print("[+] SpaceTokenizer imported.")

@@ -53,6 +54,15 @@ def stem():
    for w in words:
        print(ps.stem(w))
    print("[+] Complete.")
def snowball():
    stemmer = SnowballStemmer("english")
    #print("[+] PorterStemmer imported.")
    print("[+] Target:",sys.argv[2])
    print("[+] Output: ", end='')
    words = SpaceTokenizer().tokenize(sys.argv[2])
    for w in words:
        print(stemmer.stem(w))
    print("[+] Complete.")
def lemma():
    lemmatizer = WordNetLemmatizer()
    #print("[+] WordNetLemmatizer imported.")

@@ -109,7 +119,7 @@ def main():
        nltk_download()
    elif (len(sys.argv) <= 2) and ("VTSTech-NLTK.py" in sys.argv[0]):
        banner()
        print("Modes:\n\nspc_tok (SpaceTokenizer) \nsyn_ant (wordnet.synsets)\nstem (PorterStemmer)\nsnow (SnowballStemmer)\nlemma (WordNetLemmatizer)\npos_tag (Part-of-Speech)\nner (Named Entity Recognition)\ndiag_act (Dialogue Action)\nnltk_download")
    else:
        banner()
        if ("spc_tok" in sys.argv[1]): # expects "a complete sentence."

@@ -118,13 +128,15 @@ def main():
            syn_ant()
        if ("stem" in sys.argv[1]): # expects word
            stem()
        if ("lemma" in sys.argv[1]): # expects word (tag)
            lemma()
        if ("pos_tag" in sys.argv[1]): # expects "a complete sentence."
            postag()
        if ("ner" in sys.argv[1]): # expects "a complete sentence."
            ner()
        if ("diag_act" in sys.argv[1]): # expects "a complete sentence."
            diag_act()
        if ("snow" in sys.argv[1]): # expects "a complete sentence."
            snowball()
###
main()
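This revision adds the Snowball ("Porter2") stemmer alongside the original Porter stemmer. A quick side-by-side sketch; the stems in the comments are illustrative examples:

from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

porter = PorterStemmer()
snow = SnowballStemmer("english")
for w in ["generously", "fairly", "running"]:
    print(w, "->", porter.stem(w), "/", snow.stem(w))
# e.g. fairly -> fairli / fair  (Snowball is generally considered the refined successor to Porter)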
- VTSTech revised this gist on Mar 12, 2020: 1 changed file with 22 additions and 5 deletions.
@@ -1,21 +1,22 @@
#NLTK Script v0.3 2020-03-12 12:32:43 AM
#Written by VTSTech (veritas@vts-tech.org)
#Various functions inspired by code from sentdex/pythonprogramming.net
#https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/
import sys, nltk, os, string, random
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize, SpaceTokenizer, PunktSentenceTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet, state_union, brown
synonyms = []
antonyms = []
train_text = state_union.raw("1999-Clinton.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
posts = nltk.corpus.nps_chat.xml_posts()[:10000]
def banner():
    print("NLTK Script v0.3 03-12-2020\nWritten by VTSTech (veritas@vts-tech.org)\n")
    print("Usage:", os.path.basename(__file__),"mode \"word or sentence\"\n")
def spc_tok():
    #print("[+] SpaceTokenizer imported.")

@@ -87,14 +88,28 @@ def ner():
    except Exception as e:
        print(str(e))
    print("[+] Complete.")
def diag_act():
    def dialogue_act_features(post):
        features = {}
        for word in nltk.word_tokenize(post):
            features['contains({})'.format(word.lower())] = True
        return features
    featuresets = [(dialogue_act_features(post.text), post.get('class')) for post in posts]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    #print("DEBUG:", nltk.classify.accuracy(classifier, test_set))
    print("[+] Target:", sys.argv[2])
    print("[+] Output:", classifier.classify(dialogue_act_features(sys.argv[2])))
    print("[+] Complete.")
###
def main():
    if (len(sys.argv) == 2) and ("nltk_download" in sys.argv[1]):
        banner()
        nltk_download()
    elif (len(sys.argv) <= 2) and ("VTSTech-NLTK.py" in sys.argv[0]):
        banner()
        print("Modes:\n\nspc_tok\nsyn_ant\nstem\nlemma\npos_tag\nner\ndiag_act\nnltk_download")
    else:
        banner()
        if ("spc_tok" in sys.argv[1]): # expects "a complete sentence."

@@ -109,5 +124,7 @@ def main():
            postag()
        if ("ner" in sys.argv[1]): # expects "a complete sentence."
            ner()
        if ("diag_act" in sys.argv[1]): # expects "a complete sentence."
            diag_act()
###
main()
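The new diag_act mode trains a bag-of-words Naive Bayes classifier over the NPS Chat corpus to label a sentence with a dialogue-act class (Statement, ynQuestion, Greet, and so on). Its core, extracted into a standalone sketch; the nps_chat and punkt data packages are assumed installed, and the sample sentence with its predicted label is illustrative:

import nltk

posts = nltk.corpus.nps_chat.xml_posts()[:10000]

def dialogue_act_features(post):
    # One boolean feature per lowercased token in the post.
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains({})'.format(word.lower())] = True
    return features

featuresets = [(dialogue_act_features(p.text), p.get('class')) for p in posts]
size = int(len(featuresets) * 0.1)  # hold out 10% for testing
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(classifier.classify(dialogue_act_features("are you ok?")))  # e.g. ynQuestion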
- VTSTech revised this gist on Mar 12, 2020: No changes.
- VTSTech revised this gist on Mar 12, 2020: 1 changed file with 1 addition and 1 deletion.
@@ -110,4 +110,4 @@ def main():
        if ("ner" in sys.argv[1]): # expects "a complete sentence."
            ner()
###
main()
- VTSTech revised this gist on Mar 12, 2020: 1 changed file with 1 addition and 1 deletion.
@@ -15,7 +15,7 @@ custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
def banner():
-   print("NLTK Script v0.2 03-11-2020\nWritten by veritas@vts-tech.org\n")
+   print("NLTK Script v0.2 03-11-2020\nWritten by VTSTech (veritas@vts-tech.org)\n")
    print("Usage:", os.path.basename(__file__),"mode \"word or sentence\"\n")
def spc_tok():
    #print("[+] SpaceTokenizer imported.")
- VTSTech revised this gist on Mar 12, 2020: 1 changed file with 12 additions and 9 deletions.
@@ -1,5 +1,8 @@
#NLTK Script v0.2 2020-03-11 9:25:03 PM
#Written by VTSTech (veritas@vts-tech.org)
#Various functions inspired by code from sentdex/pythonprogramming.net
#https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/
import sys, nltk, os, string
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize, SpaceTokenizer, PunktSentenceTokenizer

@@ -12,7 +15,7 @@ custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
def banner():
    print("NLTK Script v0.2 03-11-2020\nWritten by veritas@vts-tech.org\n")
    print("Usage:", os.path.basename(__file__),"mode \"word or sentence\"\n")
def spc_tok():
    #print("[+] SpaceTokenizer imported.")

@@ -94,17 +97,17 @@ def main():
        print("Modes:\n\nspc_tok\nsyn_ant\nstem\nlemma\npos_tag\nner\nnltk_download")
    else:
        banner()
        if ("spc_tok" in sys.argv[1]): # expects "a complete sentence."
            spc_tok()
        if ("syn_ant" in sys.argv[1]): # expects word
            syn_ant()
        if ("stem" in sys.argv[1]): # expects word
            stem()
        if ("lemma" in sys.argv[1]): # expects word
            lemma()
        if ("pos_tag" in sys.argv[1]): # expects "a complete sentence."
            postag()
        if ("ner" in sys.argv[1]): # expects "a complete sentence."
            ner()
###
main()
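A note on the lemma mode annotated here ("# expects word", with an optional WordNet pos tag per the script's #pos=a,s,v,r comment): WordNet lemmatization is part-of-speech sensitive and defaults to treating the word as a noun. A short sketch, assuming the wordnet data is installed:

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("better"))            # better (noun by default)
print(lemmatizer.lemmatize("better", pos="a"))   # good
print(lemmatizer.lemmatize("running", pos="v"))  # run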
- VTSTech created this gist on Mar 12, 2020.
@@ -0,0 +1,110 @@
#v0.1 2020-03-11 9:12:13 PM
#VTSTech (veritas@vts-tech.org)
import sys, nltk, os, string
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize, SpaceTokenizer, PunktSentenceTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet, state_union
synonyms = []
antonyms = []
train_text = state_union.raw("1999-Clinton.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
def banner():
    print("VTSTech-NLTK Script v0.1\nveritas@vts-tech.org 03-11-2020\n")
    print("Usage:", os.path.basename(__file__),"mode \"word or sentence\"\n")
def spc_tok():
    #print("[+] SpaceTokenizer imported.")
    print("[+] Target:", sys.argv[2])
    print("[+] Output:",end='')
    print(SpaceTokenizer().tokenize(sys.argv[2]))
    print("[+] Complete.")
def syn_ant():
    #print("[+] wordnet imported.")
    for syn in wordnet.synsets(sys.argv[2]):
        for l in syn.lemmas():
            synonyms.append(l.name())
            if l.antonyms():
                antonyms.append(l.antonyms()[0].name())
    print("[+] Target:", sys.argv[2])
    target = wordnet.synsets(sys.argv[2])
    if (len(target) >= 1):
        print("[+] Defined:", target[0].definition())
    print("[+] Output:")
    if (len(set(synonyms))>0):
        print("Syn:", set(synonyms))
    else:
        print("Syn: Failed to find synonym!")
    if (len(set(antonyms))>0):
        print("Ant:", set(antonyms))
    else:
        print("Ant: Failed to find antonym!")
    print("[+] Complete.")
def stem():
    ps = PorterStemmer()
    #print("[+] PorterStemmer imported.")
    print("[+] Target:",sys.argv[2])
    print("[+] Output: ", end='')
    words = SpaceTokenizer().tokenize(sys.argv[2])
    for w in words:
        print(ps.stem(w))
    print("[+] Complete.")
def lemma():
    lemmatizer = WordNetLemmatizer()
    #print("[+] WordNetLemmatizer imported.")
    #pos=a,s,v,r
    print("[+] Target:",sys.argv[2])
    print("[+] Output: ",end='')
    if (len(sys.argv) == 4):
        #print("DEBUG:", sys.argv[3])
        print(lemmatizer.lemmatize(sys.argv[2], pos=sys.argv[3]))
    else:
        print(lemmatizer.lemmatize(sys.argv[2]))
    print("[+] Complete.")
def postag():
    #print("[+] pos_tag, SpaceTokenizer imported.")
    print("[+] Target:", sys.argv[2])
    print("[+] Output:", end='')
    print(pos_tag(SpaceTokenizer().tokenize(sys.argv[2])))
    print("[+] Complete.")
def nltk_download():
    print("[+] NLTK Downloader launching...\n")
    nltk.download()
    print("[+] Complete.")
def ner():
    tokenized = custom_sent_tokenizer.tokenize(sys.argv[2])
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged, binary=True)
            print("[+] Target:", sys.argv[2])
            print("[+] Output:", end='')
            print(namedEnt)
    except Exception as e:
        print(str(e))
    print("[+] Complete.")
###
def main():
    if (len(sys.argv) == 2) and ("nltk_download" in sys.argv[1]):
        banner()
        nltk_download()
    elif (len(sys.argv) <= 2) and ("VTSTech-NLTK.py" in sys.argv[0]):
        banner()
        print("Modes:\n\nspc_tok\nsyn_ant\nstem\nlemma\npos_tag\nner\nnltk_download")
    else:
        banner()
        if ("spc_tok" in sys.argv[1]):
            spc_tok()
        if ("syn_ant" in sys.argv[1]):
            syn_ant()
        if ("stem" in sys.argv[1]):
            stem()
        if ("lemma" in sys.argv[1]):
            lemma()
        if ("pos_tag" in sys.argv[1]):
            postag()
        if ("ner" in sys.argv[1]):
            ner()
###
main()
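The syn_ant mode in this first version is the piece most worth seeing in isolation: it walks every WordNet synset for the target word, collecting lemma names as synonyms and each lemma's first antonym. A standalone sketch, assuming nltk.download('wordnet') has been run and using "good" as the example word:

from nltk.corpus import wordnet

synonyms, antonyms = [], []
for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

print("Syn:", set(synonyms))  # e.g. includes 'good', 'well', 'goodness'
print("Ant:", set(antonyms))  # e.g. includes 'bad', 'evil'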