#NLTK Script v0.43 2020-03-14 12:16:31 AM
#Written by VTSTech (veritas@vts-tech.org)
#Various functions inspired by code from sentdex/pythonprogramming.net
#https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/
#Various functions inspired by code from Natural Language Processing with Python
#by Steven Bird, Ewan Klein and Edward Loper - http://www.nltk.org/book/ch01.html
import sys, nltk, os, string, random
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize, SpaceTokenizer, PunktSentenceTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import wordnet, state_union, brown
synonyms = []
antonyms = []
hypernyms = []
hyponyms = []
# Train the Punkt sentence tokenizer on the 1999 Clinton State of the Union
# address; used by ner() below.
train_text = state_union.raw("1999-Clinton.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
# NPS Chat posts used to train the dialogue-act classifier in diag_act().
posts = nltk.corpus.nps_chat.xml_posts()[:10000]
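# The corpora and models used above and below are not bundled with NLTK
# itself; on a fresh install they can be fetched once with nltk.download(),
# e.g. from a Python shell:
#   import nltk
#   for pkg in ('punkt', 'wordnet', 'state_union', 'nps_chat',
#               'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
#       nltk.download(pkg)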
def simple(target, outstr):
    # Shared output helper: print in verbose mode, otherwise return the result.
    if (verbose==1):
        print("[+] Target:", target)
        print("[+] Output:", end='')
        print(outstr)
        print("[+] Complete.")
    else:
        return outstr
def banner():
    print("NLTK Script v0.43 Written by VTSTech (veritas@vts-tech.org) [03-13-2020]")
    print("GitHub: https://gist.github.com/VTSTech/620f80a878acccf9eb64b34193221a67\n")
    print("Usage:", script_fn, "-v mode \"word or sentence\"\n")
def spc_tok(target):
    outstr = SpaceTokenizer().tokenize(target)
    return simple(target, outstr)
def sent_tok(target):
    outstr = sent_tokenize(target)
    return simple(target, outstr)
def word_tok(target):
    outstr = word_tokenize(target)
    return simple(target, outstr)
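# For illustration, the two tokenizers treat punctuation differently:
#   SpaceTokenizer().tokenize("Don't stop now.")  ->  ["Don't", 'stop', 'now.']
#   word_tokenize("Don't stop now.")              ->  ['Do', "n't", 'stop', 'now', '.']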
def syn_ant(target):
    # Collect synonym/antonym lemma names from every WordNet synset of target.
    for syn in wordnet.synsets(target):
        for l in syn.lemmas():
            synonyms.append(l.name())
            if l.antonyms():
                antonyms.append(l.antonyms()[0].name())
    if (verbose==1):
        print("[+] Target:", target)
        temp = wordnet.synsets(target)
        if (len(temp) >= 1):
            print("[+] Defined:", temp[0].definition())
        print("[+] Output:")
        if (len(set(synonyms))>0):
            print("Syn:", set(synonyms))
        else:
            print("Syn: Failed to find synonym!")
        if (len(set(antonyms))>0):
            print("Ant:", set(antonyms))
        else:
            print("Ant: Failed to find antonym!")
        print("[+] Complete.")
    else:
        if (len(set(synonyms))>0):
            outstr = "Syn: " + str(set(synonyms))
        else:
            outstr = "Syn: Failed!"
        if (len(set(antonyms))>0):
            outstr2 = "Ant: " + str(set(antonyms))
        else:
            outstr2 = "Ant: Failed!"
        return outstr, outstr2
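# Example (WordNet-dependent; set ordering varies between runs):
#   syn_ant("good") typically collects synonyms such as 'good', 'goodness'
#   and 'well', and antonyms such as 'bad', 'evil' and 'ill'.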
def hyperon(target):
    # Collect hypernym ("is-a" parent) and hyponym (child) synset names.
    for syn in wordnet.synsets(target):
        for l in syn.hypernyms():
            hypernyms.append(l.name())
        for l in syn.hyponyms():
            hyponyms.append(l.name())
    if (verbose==1):
        print("[+] Target:", target)
        temp = wordnet.synsets(target)
        if (len(temp) >= 1):
            print("[+] Defined:", temp[0].definition())
        print("[+] Output:")
        if (len(set(hypernyms))>0):
            print("Hypernyms:", set(hypernyms))
        else:
            print("Hypernyms: Failed to find hypernym!")
        if (len(set(hyponyms))>0):
            print("Hyponyms:", set(hyponyms))
        else:
            print("Hyponyms: Failed to find hyponym!")
        print("[+] Complete.")
    else:
        # Build the result as a plain string so the two halves always concatenate.
        if (len(set(hypernyms))>0):
            outstr = "Hyper: " + str(set(hypernyms))
        else:
            outstr = "Hyper: Failed!"
        if (len(set(hyponyms))>0):
            outstr = outstr + " Hypon: " + str(set(hyponyms))
        else:
            outstr = outstr + " Hypon: Failed!"
        return outstr
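# Example (WordNet-dependent): for "dog", the hypernyms usually include
# 'canine.n.02' and 'domestic_animal.n.01', and the hyponyms include
# entries such as 'corgi.n.01' and 'puppy.n.01'.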
def stem(target):
    ps = PorterStemmer()
    if (verbose==1):
        print("[+] Target:", target)
        print("[+] Output: ", end='')
        words = SpaceTokenizer().tokenize(target)
        for w in words:
            print(ps.stem(w))
        print("[+] Complete.")
    else:
        words = SpaceTokenizer().tokenize(target)
        outstr = ""
        for w in words:
            outstr = outstr + ' ' + ps.stem(w)
        return outstr.lstrip()
def snowball(target):
    stemmer = SnowballStemmer("english")
    if (verbose==1):
        print("[+] Target:", target)
        words = SpaceTokenizer().tokenize(target)
        #print("DEBUG:", len(words))
        print("[+] Output: ", end='')
        for w in words:
            print(stemmer.stem(w))
        print("[+] Complete.")
    else:
        words = SpaceTokenizer().tokenize(target)
        outstr = ""
        for w in words:
            outstr = outstr + ' ' + stemmer.stem(w)
        return outstr.lstrip()
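# The two stemmers mostly agree; Snowball ("Porter2") is a revised, slightly
# more consistent algorithm. A classic divergence, for illustration:
#   PorterStemmer().stem("fairly")             ->  'fairli'
#   SnowballStemmer("english").stem("fairly")  ->  'fair'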
def lemma(target, pos):
    lemmatizer = WordNetLemmatizer()
    # pos is a WordNet tag: n (noun), v (verb), a/s (adjective), r (adverb)
    if (verbose==1):
        print("[+] Target:", target)
        print("[+] Output: ", end='')
        if pos:
            print(lemmatizer.lemmatize(target, pos=pos))
        else:
            print(lemmatizer.lemmatize(target))
        print("[+] Complete.")
    else:
        if pos:
            return lemmatizer.lemmatize(target, pos=pos)
        else:
            return lemmatizer.lemmatize(target)
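# Example: the pos tag changes the WordNet lookup, e.g.:
#   WordNetLemmatizer().lemmatize("better")           ->  'better' (noun assumed)
#   WordNetLemmatizer().lemmatize("better", pos="a")  ->  'good'
#   WordNetLemmatizer().lemmatize("cats")             ->  'cat'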
def postag(target):
    # Tokenize with word_tokenize directly; calling word_tok() here would
    # print and return None in verbose mode, crashing pos_tag.
    outstr = pos_tag(word_tokenize(target))
    return simple(target, outstr)
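# Example from the NLTK book (tags can vary with tagger/model version):
#   pos_tag(word_tokenize("And now for something completely different"))
#   ->  [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),
#        ('completely', 'RB'), ('different', 'JJ')]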
def nltk_download():
    print("[+] NLTK Downloader launching...\n")
    nltk.download()
    print("[+] Complete.")
def ner(target):
    tokenized = custom_sent_tokenizer.tokenize(target)
    outstr = ""
    if (verbose==1):
        try:
            for i in tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                namedEnt = nltk.ne_chunk(tagged, binary=True)
                print("[+] Target:", target)
                print("[+] Output:", end='')
                print(namedEnt)
        except Exception as e:
            print(str(e))
        print("[+] Complete.")
    else:
        try:
            for i in tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                namedEnt = nltk.ne_chunk(tagged, binary=True)
                outstr = namedEnt
        except Exception as e:
            print(str(e))
        return outstr
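# With binary=True, ne_chunk() collapses all entity types (PERSON, GPE,
# ORGANIZATION, ...) into a single 'NE' label. For a sentence like
# "Barack Obama spoke in Chicago." the tree would typically contain
# (NE Barack/NNP Obama/NNP) and (NE Chicago/NNP).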
def diag_act(target):
    # Classify the dialogue act of target with a Naive Bayes classifier
    # trained on the NPS Chat corpus.
    def dialogue_act_features(post):
        features = {}
        for word in nltk.word_tokenize(post):
            features['contains({})'.format(word.lower())] = True
        return features
    featuresets = [(dialogue_act_features(post.text), post.get('class')) for post in posts]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    #print("DEBUG:", nltk.classify.accuracy(classifier, test_set))
    if (verbose==1):
        print("[+] Target:", target)
        print("[+] Output:", classifier.classify(dialogue_act_features(target)))
        print("[+] Complete.")
    else:
        return classifier.classify(dialogue_act_features(target))
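# The classifier predicts one of the NPS Chat dialogue-act classes, such as
# 'Statement', 'Greet', 'ynQuestion' or 'Bye'; e.g. diag_act("Hello!") would
# typically return 'Greet'. Note the Naive Bayes model is retrained from
# scratch on every call, so this mode is noticeably slower than the others.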
###
def main(mode, target, pos):
    if mode == "nltk_download":
        if (verbose==1): banner()
        nltk_download()
    elif (totalargs <= 2) and ("VTSTech-NLTK.py" in script_fn):
        banner()
        print("-v verbose output (shows banner, target, definitions, etc)\n\nModes:\n\nspc_tok (SpaceTokenizer) \nsent_tok (sent_tokenize)")
        print("word_tok (word_tokenize) \nsyn_ant (wordnet.synsets)\nhyperon (hyper/hyponyms)\nstem (PorterStemmer)\nsnow (SnowballStemmer)")
        print("lemma (WordNetLemmatizer)\npos_tag (Part-of-Speech)\nner (Named Entity Recognition)\ndiag_act (Dialogue Act)\nnltk_download")
    else:
        if (verbose==1): banner()
        # Map each mode to its handler and the input it expects.
        handlers = {
            "spc_tok":  spc_tok,    # expects "a complete sentence."
            "sent_tok": sent_tok,   # expects "a complete sentence. Or two. Or three."
            "word_tok": word_tok,   # expects "a complete sentence."
            "syn_ant":  syn_ant,    # expects a single word
            "hyperon":  hyperon,    # expects a single word
            "stem":     stem,       # expects a word or sentence
            "snow":     snowball,   # expects a word or sentence
            "pos_tag":  postag,     # expects "a complete sentence."
            "ner":      ner,        # expects "a complete sentence."
            "diag_act": diag_act,   # expects "a complete sentence."
        }
        if mode == "lemma":         # expects word (and an optional pos tag)
            result = lemma(target, pos)
        elif mode in handlers:
            result = handlers[mode](target)
        else:
            return
        # In verbose mode the handlers print their own output; otherwise
        # print the value they returned.
        if (verbose!=1):
            print(result)
###
totalargs = len(sys.argv)
script_fn = sys.argv[0]
mode = ""
target = ""
pos = ""
verbose = 0
known_modes = ("spc_tok", "sent_tok", "word_tok", "syn_ant", "stem", "lemma",
               "pos_tag", "ner", "diag_act", "snow", "hyperon")
for x in range(0, totalargs, 1):
    #print("DEBUG:",sys.argv[x])
    if (totalargs >= 6):
        banner()
        print("Too many arguments! Check command line. Use \" to wrap series of words.")
        quit()
    elif (sys.argv[x] == "-v") or (totalargs == 1 and "VTSTech-NLTK" in script_fn):
        verbose = 1
    elif (sys.argv[x] == "nltk_download"):
        mode = sys.argv[x]
    elif sys.argv[x] in known_modes:
        mode = sys.argv[x]
        target = sys.argv[x + 1]
        # lemma accepts one extra argument: a WordNet pos tag (n, v, a, s, r)
        if (mode == "lemma") and (x + 2 < totalargs):
            pos = sys.argv[x + 2]
main(mode, target, pos)