Skip to content

Instantly share code, notes, and snippets.

@VTSTech
Last active May 3, 2024 01:17
VTSTech-NLTK Script v0.43
#NLTK Script v0.2 2020-03-11 9:25:03 PM
#Written by VTSTech (veritas@vts-tech.org)
#Various functions inspired by code from sentdex/pythonprogramming.net
#https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/
import sys, nltk, os, string
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize, SpaceTokenizer, PunktSentenceTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet, state_union
# Accumulators filled in by syn_ant(); module-level because that function
# appends to them as globals. Populated once per script run.
synonyms = []
antonyms = []
# Train a Punkt sentence tokenizer on a sample State of the Union address
# so ner() can split its input into sentences.
# NOTE(review): this requires the "state_union" corpus to already be
# downloaded (see the nltk_download mode) — confirm before first run.
train_text = state_union.raw("1999-Clinton.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
def banner():
    """Print the script banner followed by a one-line usage hint."""
    script_name = os.path.basename(__file__)
    print("NLTK Script v0.2 03-11-2020\nWritten by VTSTech (veritas@vts-tech.org)\n")
    print("Usage:", script_name, "mode \"word or sentence\"\n")
def spc_tok():
    """Split sys.argv[2] on whitespace with SpaceTokenizer and print the tokens."""
    target = sys.argv[2]
    print("[+] Target:", target)
    print("[+] Output:", end='')
    print(SpaceTokenizer().tokenize(target))
    print("[+] Complete.")
def syn_ant():
    """Look up WordNet synonyms and antonyms for sys.argv[2] and print them.

    Appends results to the module-level `synonyms` / `antonyms` lists,
    then prints the first sense's definition (when one exists) and the
    de-duplicated synonym/antonym sets.
    """
    word = sys.argv[2]
    for synset in wordnet.synsets(word):
        for lem in synset.lemmas():
            synonyms.append(lem.name())
            # Only the first antonym of each lemma is kept, as in the original.
            if lem.antonyms():
                antonyms.append(lem.antonyms()[0].name())
    print("[+] Target:", word)
    senses = wordnet.synsets(word)
    if len(senses) >= 1:
        print("[+] Defined:", senses[0].definition())
    print("[+] Output:")
    if set(synonyms):
        print("Syn:", set(synonyms))
    else:
        print("Syn: Failed to find synonym!")
    if set(antonyms):
        print("Ant:", set(antonyms))
    else:
        print("Ant: Failed to find antonym!")
    print("[+] Complete.")
def stem():
    """Print the Porter stem of each whitespace token in sys.argv[2], one per line."""
    stemmer = PorterStemmer()
    target = sys.argv[2]
    print("[+] Target:", target)
    print("[+] Output: ", end='')
    for token in SpaceTokenizer().tokenize(target):
        print(stemmer.stem(token))
    print("[+] Complete.")
def lemma():
    """Lemmatize sys.argv[2] with WordNet.

    An optional part-of-speech letter may be passed as sys.argv[3]
    (WordNet POS codes such as a/s/v/r/n); otherwise the lemmatizer's
    default POS is used.
    """
    lemmatizer = WordNetLemmatizer()
    target = sys.argv[2]
    print("[+] Target:", target)
    print("[+] Output: ", end='')
    if len(sys.argv) == 4:
        print(lemmatizer.lemmatize(target, pos=sys.argv[3]))
    else:
        print(lemmatizer.lemmatize(target))
    print("[+] Complete.")
def postag():
    """POS-tag the whitespace tokens of sys.argv[2] and print the (word, tag) pairs."""
    sentence = sys.argv[2]
    print("[+] Target:", sentence)
    print("[+] Output:", end='')
    tokens = SpaceTokenizer().tokenize(sentence)
    print(pos_tag(tokens))
    print("[+] Complete.")
def nltk_download():
    # Opens the interactive NLTK data downloader (GUI or text UI depending
    # on the environment); blocks until the user closes it.
    print("[+] NLTK Downloader launching...\n")
    nltk.download()
    print("[+] Complete.")
def ner():
    """Run binary named-entity chunking over sys.argv[2].

    The input is split into sentences by the module-level Punkt tokenizer;
    each sentence is word-tokenized, POS-tagged, and chunked, and the
    Target/Output banner is printed per sentence (as in the original).
    """
    sentences = custom_sent_tokenizer.tokenize(sys.argv[2])
    try:
        for sentence in sentences:
            tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
            chunked = nltk.ne_chunk(tagged, binary=True)
            print("[+] Target:", sys.argv[2])
            print("[+] Output:", end='')
            print(chunked)
    except Exception as e:
        # Best-effort: report the error text instead of crashing mid-run.
        print(str(e))
    print("[+] Complete.")
###
def main():
    """Dispatch to the requested mode based on command-line arguments.

    argv[1] selects the mode (substring match, as before); argv[2] carries
    the target word or sentence. With no mode/target given, the usage
    screen is shown.
    """
    if len(sys.argv) == 2 and "nltk_download" in sys.argv[1]:
        banner()
        nltk_download()
    elif len(sys.argv) <= 2:
        # Fix: the original also required "VTSTech-NLTK.py" in argv[0] here,
        # so a renamed script run with no target fell through to the
        # dispatch below and crashed with IndexError on argv[1]/argv[2].
        # Any invocation without both a mode and a target now shows usage.
        banner()
        print("Modes:\n\nspc_tok\nsyn_ant\nstem\nlemma\npos_tag\nner\nnltk_download")
    else:
        banner()
        mode = sys.argv[1]
        # Substring matching preserved from the original (e.g. "pos_tag"
        # also matches a mode string that merely contains it).
        if "spc_tok" in mode:  # expects "a complete sentence."
            spc_tok()
        if "syn_ant" in mode:  # expects word
            syn_ant()
        if "stem" in mode:     # expects word
            stem()
        if "lemma" in mode:    # expects word
            lemma()
        if "pos_tag" in mode:  # expects "a complete sentence."
            postag()
        if "ner" in mode:      # expects "a complete sentence."
            ner()
###
# Fix: guard the entry point so importing this module no longer runs main().
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment