Last active
May 3, 2024 01:17
VTSTech-NLTK Script v0.43
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#NLTK Script v0.2 2020-03-11 9:25:03 PM | |
#Written by VTSTech (veritas@vts-tech.org) | |
#Various functions inspired by code from sentdex/pythonprogramming.net | |
#https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/ | |
import sys, nltk, os, string | |
from nltk import pos_tag | |
from nltk.tokenize import sent_tokenize, word_tokenize, SpaceTokenizer, PunktSentenceTokenizer | |
from nltk.stem import PorterStemmer, WordNetLemmatizer | |
from nltk.corpus import wordnet, state_union | |
# Accumulator lists used by syn_ant(); module-level, so results
# persist (and accumulate) across repeated calls in one session.
synonyms = []
antonyms = []
# Train a Punkt sentence tokenizer on the 1999 State of the Union address
# (used by ner()). Requires the NLTK "state_union" corpus to be downloaded
# first (run the nltk_download mode if this raises a LookupError).
train_text = state_union.raw("1999-Clinton.txt")
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
def banner():
    """Print the script's version banner and a one-line usage hint."""
    script_name = os.path.basename(__file__)
    print("NLTK Script v0.2 03-11-2020\nWritten by VTSTech (veritas@vts-tech.org)\n")
    print("Usage:", script_name, "mode \"word or sentence\"\n")
def spc_tok(text=None):
    """Tokenize *text* on whitespace and print the token list.

    Args:
        text: Sentence to tokenize. Defaults to sys.argv[2] so the
            existing CLI call sites keep working unchanged.
    """
    if text is None:
        text = sys.argv[2]
    print("[+] Target:", text)
    print("[+] Output:", end='')
    print(SpaceTokenizer().tokenize(text))
    print("[+] Complete.")
def syn_ant(word=None):
    """Look up WordNet synonyms and antonyms of *word* and print them.

    Args:
        word: Target word. Defaults to sys.argv[2] so the existing
            CLI call sites keep working unchanged.
    """
    if word is None:
        word = sys.argv[2]
    # Use local lists instead of the module-level synonyms/antonyms
    # globals so repeated calls don't accumulate stale results.
    syns = []
    ants = []
    # Look the word up once; the original called wordnet.synsets() twice.
    senses = wordnet.synsets(word)
    for syn in senses:
        for l in syn.lemmas():
            syns.append(l.name())
            if l.antonyms():
                ants.append(l.antonyms()[0].name())
    print("[+] Target:", word)
    if (len(senses) >= 1): print("[+] Defined:", senses[0].definition())
    print("[+] Output:")
    if (len(set(syns))>0):
        print("Syn:", set(syns))
    else:
        print("Syn: Failed to find synonym!")
    if (len(set(ants))>0):
        print("Ant:", set(ants))
    else:
        print("Ant: Failed to find antonym!")
    print("[+] Complete.")
def stem(word=None):
    """Print the Porter stem of every space-separated token in *word*.

    Args:
        word: Word or sentence to stem. Defaults to sys.argv[2] so the
            existing CLI call sites keep working unchanged.
    """
    if word is None:
        word = sys.argv[2]
    ps = PorterStemmer()
    print("[+] Target:", word)
    print("[+] Output: ", end='')
    words = SpaceTokenizer().tokenize(word)
    for w in words:
        print(ps.stem(w))
    print("[+] Complete.")
def lemma(word=None, pos=None):
    """Print the WordNet lemma of *word*, optionally constrained by POS.

    Args:
        word: Word to lemmatize. Defaults to sys.argv[2] so the existing
            CLI call sites keep working unchanged.
        pos: WordNet part-of-speech tag ('a', 's', 'v', 'r', 'n').
            Defaults to sys.argv[3] when a third CLI argument is given.
    """
    lemmatizer = WordNetLemmatizer()
    if word is None:
        word = sys.argv[2]
    if pos is None and len(sys.argv) == 4:
        pos = sys.argv[3]
    print("[+] Target:", word)
    print("[+] Output: ", end='')
    if pos is not None:
        print(lemmatizer.lemmatize(word, pos=pos))
    else:
        print(lemmatizer.lemmatize(word))
    print("[+] Complete.")
def postag(text=None):
    """Tokenize *text* on spaces and print (token, POS-tag) pairs.

    Args:
        text: Sentence to tag. Defaults to sys.argv[2] so the existing
            CLI call sites keep working unchanged.
    """
    if text is None:
        text = sys.argv[2]
    print("[+] Target:", text)
    print("[+] Output:", end='')
    print(pos_tag(SpaceTokenizer().tokenize(text)))
    print("[+] Complete.")
def nltk_download():
    """Launch the interactive NLTK corpus/model downloader GUI."""
    print("[+] NLTK Downloader launching...\n")
    nltk.download()
    print("[+] Complete.")
def ner(text=None):
    """Run binary named-entity chunking over each sentence of *text*.

    Sentences are split with the module-level Punkt tokenizer trained on
    the 1999 State of the Union address, then word-tokenized, POS-tagged
    and chunked with nltk.ne_chunk.

    Args:
        text: Input text. Defaults to sys.argv[2] so the existing CLI
            call sites keep working unchanged.
    """
    if text is None:
        text = sys.argv[2]
    tokenized = custom_sent_tokenizer.tokenize(text)
    try:
        for sentence in tokenized:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            named_ent = nltk.ne_chunk(tagged, binary=True)
            print("[+] Target:", text)
            print("[+] Output:", end='')
            print(named_ent)
    except Exception as e:
        # Best-effort: report (likely missing-corpus) errors and continue.
        print(str(e))
    print("[+] Complete.")
### | |
def main():
    """Dispatch to the handler selected by the first CLI argument."""
    argc = len(sys.argv)
    if argc == 2 and "nltk_download" in sys.argv[1]:
        banner()
        nltk_download()
    elif argc <= 2 and "VTSTech-NLTK.py" in sys.argv[0]:
        # No mode given: show the banner and the list of modes.
        banner()
        print("Modes:\n\nspc_tok\nsyn_ant\nstem\nlemma\npos_tag\nner\nnltk_download")
    else:
        banner()
        # Ordered dispatch table; substring matching preserves the
        # original loose mode detection (several modes may fire).
        dispatch = (
            ("spc_tok", spc_tok),   # expects "a complete sentence."
            ("syn_ant", syn_ant),   # expects word
            ("stem", stem),         # expects word
            ("lemma", lemma),       # expects word
            ("pos_tag", postag),    # expects "a complete sentence."
            ("ner", ner),           # expects "a complete sentence."
        )
        for mode, handler in dispatch:
            if mode in sys.argv[1]:
                handler()
### | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment