#NLTK Script v0.43 2020-03-14 12:16:31 AM
#Written by VTSTech (veritas@vts-tech.org)
#Various functions inspired by code from sentdex/pythonprogramming.net
#https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/
#Various functions inspired by code from Natural Language Processing with Python
#by Steven Bird, Ewan Klein and Edward Loper - http://www.nltk.org/book/ch01.html
import sys, nltk, os, string, random
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize, SpaceTokenizer, PunktSentenceTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import wordnet, state_union, brown
synonyms = []
antonyms = []
hypernyms = []
hyponyms = []
train_text = state_union.raw("1999-Clinton.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
posts = nltk.corpus.nps_chat.xml_posts()[:10000]
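# Note: the functions below assume these NLTK data packages are already
# installed (run the nltk_download mode once if not): punkt, wordnet,
# averaged_perceptron_tagger, state_union, nps_chat, maxent_ne_chunker, words.
# The state_union and nps_chat loads above run at import time, so every
# invocation of the script pays that startup cost.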
def simple(target,outstr):
    if (verbose==1):
        print("[+] Target:", target)
        print("[+] Output:",end='')
        print(outstr)
        print("[+] Complete.")
    else:
        return outstr
def banner():
    print("NLTK Script v0.43 Written by VTSTech (veritas@vts-tech.org) [03-13-2020]")
    print("GitHub: https://gist.github.com/VTSTech/620f80a878acccf9eb64b34193221a67\n")
    print("Usage:", script_fn,"-v mode \"word or sentence\"\n")
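# Example invocations (assuming the script is saved as VTSTech-NLTK.py):
#   python VTSTech-NLTK.py word_tok "This is a test."
#   python VTSTech-NLTK.py -v syn_ant good
#   python VTSTech-NLTK.py -v lemma better a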
def spc_tok(target):
    outstr=SpaceTokenizer().tokenize(target)
    return simple(target,outstr)
def sent_tok(target):
    outstr=sent_tokenize(target)
    return simple(target,outstr)
def word_tok(target):
    outstr=word_tokenize(target)
    return simple(target,outstr)
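# The three tokenizers differ in granularity, e.g. for "Don't stop.":
#   SpaceTokenizer -> ["Don't", 'stop.']           (whitespace only)
#   word_tokenize  -> ['Do', "n't", 'stop', '.']   (Penn Treebank-style)
#   sent_tokenize  -> ["Don't stop."]              (sentence boundaries)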
def syn_ant(target):
    #print("[+] wordnet imported.")
    for syn in wordnet.synsets(target):
        for l in syn.lemmas():
            synonyms.append(l.name())
            if l.antonyms():
                antonyms.append(l.antonyms()[0].name())
    if (verbose==1): print("[+] Target:", target)
    temp = wordnet.synsets(target)
    if (verbose==1):
        if (len(temp) >= 1):
            print("[+] Defined:", temp[0].definition())
        print("[+] Output:")
        if (len(set(synonyms))>0):
            print("Syn:", set(synonyms))
        else:
            print("Syn: Failed to find synonym!")
        if (len(set(antonyms))>0):
            print("Ant:", set(antonyms))
        else:
            print("Ant: Failed to find antonym!")
        print("[+] Complete.")
    else:
        if (len(set(synonyms))>0):
            outstr=("Syn:", set(synonyms))
        else:
            outstr="Syn: Failed!"
        if (len(set(antonyms))>0):
            outstr2=("Ant:", set(antonyms))
        else:
            outstr2="Ant: Failed!"
        return outstr,outstr2
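# e.g. syn_ant("good") gathers lemma names from every synset of "good"
# (roughly {'good', 'well', 'goodness', ...}) plus antonyms such as 'bad'
# and 'evil'. Note: synonyms/antonyms are module-level lists, so results
# accumulate if the function is called more than once per run.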
def hyperon(target):
    #print("[+] wordnet imported.")
    for syn in wordnet.synsets(target):
        for l in syn.hypernyms():
            hypernyms.append(l.name())
        for l in syn.hyponyms():
            hyponyms.append(l.name())
    if (verbose==1):
        print("[+] Target:", target)
        temp = wordnet.synsets(target)
        if (len(temp) >= 1): print("[+] Defined:", temp[0].definition())
        print("[+] Output:")
        if (len(set(hypernyms))>0):
            print("Hypernyms:", set(hypernyms))
        else:
            print("Hypernyms: Failed to find hypernym!")
        if (len(set(hyponyms))>0):
            print("Hyponyms:", set(hyponyms))
        else:
            print("Hyponyms: Failed to find hyponym!")
        print("[+] Complete.")
    else:
        # build tuples in both branches so the concatenation below cannot
        # raise TypeError (str + tuple) when a lookup fails
        if (len(set(hypernyms))>0):
            outstr=("Hyper:", set(hypernyms))
        else:
            outstr=("Hyper: Failed!",)
        if (len(set(hyponyms))>0):
            outstr=outstr+("Hypon:", set(hyponyms))
        else:
            outstr=outstr+("Hypon: Failed!",)
        return outstr
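# e.g. for "dog" the hypernyms (more general synsets) include 'canine.n.02'
# and 'domestic_animal.n.01', while the hyponyms (more specific synsets)
# include individual breeds such as 'corgi.n.01'.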
def stem(target):
    ps = PorterStemmer()
    if (verbose==1):
        print("[+] Target:",target)
        print("[+] Output: ", end='')
        words = SpaceTokenizer().tokenize(target)
        for w in words:
            print(ps.stem(w))
        print("[+] Complete.")
    else:
        words = SpaceTokenizer().tokenize(target)
        outstr=""
        for w in words:
            outstr=outstr+' '+(ps.stem(w))
        return outstr.lstrip()
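# Porter stemming strips suffixes heuristically, e.g. 'running' -> 'run'
# but 'ponies' -> 'poni'; stems are not guaranteed to be dictionary words.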
def snowball(target):
    stemmer = SnowballStemmer("english")
    if (verbose==1):
        print("[+] Target:",target)
        words = SpaceTokenizer().tokenize(target)
        #print("DEBUG:", len(words))
        print("[+] Output: ", end='')
        for w in words:
            print(stemmer.stem(w))
        print("[+] Complete.")
    else:
        words = SpaceTokenizer().tokenize(target)
        outstr=""
        for w in words:
            outstr=outstr+' '+(stemmer.stem(w))
        return outstr.lstrip()
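# Snowball ("Porter2") refines the original Porter algorithm; a standard
# comparison: 'generously' stems to 'gener' under Porter but 'generous'
# under Snowball.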
def lemma(target,pos):
    lemmatizer = WordNetLemmatizer()
    #print("[+] WordNetLemmatizer imported.")
    #pos is one of: n (default), v, a, r, s
    if (verbose==1):
        print("[+] Target:",target)
        print("[+] Output: ",end='')
        if (totalargs == 5):
            print(lemmatizer.lemmatize(target, pos=pos))
        else:
            print(lemmatizer.lemmatize(target))
        print("[+] Complete.")
    else:
        if (totalargs == 5):
            return(lemmatizer.lemmatize(target, pos=pos))
        else:
            return(lemmatizer.lemmatize(target))
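# WordNet POS codes accepted by lemmatize(): n=noun (default), v=verb,
# a=adjective, r=adverb, s=satellite adjective.
# e.g. lemmatize('better', pos='a') -> 'good', while the noun default
# leaves 'better' unchanged.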
def postag(target):
    # tokenize directly; calling word_tok() here would return None in
    # verbose mode (simple() prints instead of returning) and crash pos_tag()
    outstr=pos_tag(word_tokenize(target))
    return simple(target,outstr)
def nltk_download():
    print("[+] NLTK Downloader launching...\n")
    nltk.download()
    print("[+] Complete.")
def ner(target):
    tokenized = custom_sent_tokenizer.tokenize(target)
    outstr=""
    if (verbose==1):
        try:
            for i in tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                namedEnt = nltk.ne_chunk(tagged, binary=True)
                print("[+] Target:", target)
                print("[+] Output:", end='')
                print(namedEnt)
        except Exception as e:
            print(str(e))
        print("[+] Complete.")
    else:
        try:
            for i in tokenized:
                words = nltk.word_tokenize(i)
                tagged = nltk.pos_tag(words)
                namedEnt = nltk.ne_chunk(tagged, binary=True)
                outstr=namedEnt
        except Exception as e:
            print(str(e))
        return outstr
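# binary=True makes ne_chunk label every entity simply "NE" rather than
# typed labels (PERSON, ORGANIZATION, GPE, ...). Note: in non-verbose mode
# only the tree for the last sentence is returned, since outstr is
# overwritten on each loop iteration.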
def diag_act(target):
    def dialogue_act_features(post):
        features = {}
        for word in nltk.word_tokenize(post):
            features['contains({})'.format(word.lower())] = True
        return features
    featuresets = [(dialogue_act_features(post.text), post.get('class')) for post in posts]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    #print("DEBUG:", nltk.classify.accuracy(classifier, test_set))
    if (verbose==1):
        print("[+] Target:", target)
        print("[+] Output:", classifier.classify(dialogue_act_features(target)))
        print("[+] Complete.")
    else:
        return classifier.classify(dialogue_act_features(target))
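# Classifies the target into an NPS Chat dialogue-act class such as
# 'Statement', 'Greet', 'ynQuestion' or 'Emotion' (cf. NLTK book, ch. 6).
# Note: the Naive Bayes model is retrained from scratch on every call.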
###
def main(mode,target,pos):
    if (totalargs == 2) and ("nltk_download" in mode):
        if (verbose==1): banner()
        nltk_download()
    elif (totalargs <= 2) and ("VTSTech-NLTK.py" in script_fn):
        banner()
        print("-v verbose output (shows banner, target, definitions, etc)\n\nModes:\n\nspc_tok (SpaceTokenizer) \nsent_tok (sent_tokenize)")
        print("word_tok (word_tokenize) \nsyn_ant (wordnet.synsets)\nhyperon (hyper/hyponyms)\nstem (PorterStemmer)\nsnow (SnowballStemmer)")
        print("lemma (WordNetLemmatizer)\npos_tag (Part-of-Speech)\nner (Named Entity Recognition)\ndiag_act (Dialogue Act)\nnltk_download")
    else:
        if (verbose==1): banner()
        if ("spc_tok" in mode): # expects "a complete sentence."
            if (verbose==1):
                spc_tok(target)
            else:
                print(spc_tok(target))
        if ("sent_tok" in mode): # expects "a complete sentence. Or two. Or three."
            if (verbose==1):
                sent_tok(target)
            else:
                print(sent_tok(target))
        if ("word_tok" in mode): # expects "a complete sentence."
            if (verbose==1):
                word_tok(target)
            else:
                print(word_tok(target))
        if ("syn_ant" in mode): # expects word
            if (verbose==1):
                syn_ant(target)
            else:
                print(syn_ant(target))
        if ("stem" in mode): # expects word
            if (verbose==1):
                stem(target)
            else:
                print(stem(target))
        if ("lemma" in mode): # expects word (tag)
            if (verbose==1):
                lemma(target,pos)
            else:
                print(lemma(target,pos))
        if ("pos_tag" in mode): # expects "a complete sentence."
            if (verbose==1):
                postag(target)
            else:
                print(postag(target))
        if ("ner" in mode): # expects "a complete sentence."
            if (verbose==1):
                ner(target)
            else:
                print(ner(target))
        if ("diag_act" in mode): # expects "a complete sentence."
            if (verbose==1):
                diag_act(target)
            else:
                print(diag_act(target))
        if ("snow" in mode): # expects "a complete sentence."
            if (verbose==1):
                snowball(target)
            else:
                print(snowball(target))
        if ("hyperon" in mode): # expects word
            if (verbose==1):
                hyperon(target)
            else:
                print(hyperon(target))
###
totalargs = len(sys.argv)
script_fn = sys.argv[0]
modes = ("spc_tok", "sent_tok", "word_tok", "syn_ant", "stem", "lemma",
         "pos_tag", "ner", "diag_act", "snow", "hyperon")
mode=""
target=""
pos=""
verbose=0
for x in range(0,totalargs,1):
    #print("DEBUG:",sys.argv[x])
    if (totalargs >= 6):
        banner()
        print("Too many arguments! Check command line. Use \" to wrap series of words.")
        quit()
    elif (sys.argv[x] == "-v") or (totalargs==1 and "VTSTech-NLTK" in script_fn):
        verbose=1
    elif (sys.argv[x] == "nltk_download"):
        # takes no target; without this branch mode was never set and
        # main() fell through to the help text
        mode=sys.argv[x]
    elif (sys.argv[x] in modes):
        mode=sys.argv[x]
        target=sys.argv[x+1]
        if (mode == "lemma") and (totalargs == 5):
            pos=sys.argv[x+2]
main(mode,target,pos)