@VTSTech
Last active May 3, 2024 01:17

Revisions

  1. VTSTech revised this gist May 3, 2024. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion VTSTech-NLTK.py
    @@ -30,7 +30,7 @@ def simple(target,outstr):
             return outstr
     def banner():
         print("NLTK Script v0.43 Written by VTSTech (veritas@vts-tech.org) [03-13-2020]")
    -    print("GitHub: https://gist.github.com/Veritas83/620f80a878acccf9eb64b34193221a67\n")
    +    print("GitHub: https://gist.github.com/VTSTech/620f80a878acccf9eb64b34193221a67\n")
         print("Usage:", script_fn,"-v mode \"word or sentence\"\n")
     def spc_tok(target):
         outstr=SpaceTokenizer().tokenize(target)
  2. VTSTech revised this gist Mar 14, 2020. 1 changed file with 20 additions and 30 deletions.
    50 changes: 20 additions & 30 deletions VTSTech-NLTK.py
    @@ -1,4 +1,4 @@
    -#NLTK Script v0.42 2020-03-13 10:08:47 PM
    +#NLTK Script v0.43 2020-03-14 12:16:31 AM
     #Written by VTSTech (veritas@vts-tech.org)
     #Various functions inspired by code from sentdex/pythonprogramming.net
     #https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/
    @@ -20,34 +20,27 @@
     custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
     posts = nltk.corpus.nps_chat.xml_posts()[:10000]
     
    -def banner():
    -    print("NLTK Script v0.42 03-13-2020\nWritten by VTSTech (veritas@vts-tech.org)\nGitHub: https://gist.github.com/Veritas83/620f80a878acccf9eb64b34193221a67\n")
    -    print("Usage:", script_fn,"-v mode \"word or sentence\"\n")
    -def spc_tok(target):
    -    #print("[+] SpaceTokenizer imported.")
    +def simple(target,outstr):
         if (verbose==1):
             print("[+] Target:", target)
             print("[+] Output:",end='')
    -        print(SpaceTokenizer().tokenize(target))
    +        print(outstr)
             print("[+] Complete.")
         else:
    -        return SpaceTokenizer().tokenize(target)
    +        return outstr
    +def banner():
    +    print("NLTK Script v0.43 Written by VTSTech (veritas@vts-tech.org) [03-13-2020]")
    +    print("GitHub: https://gist.github.com/Veritas83/620f80a878acccf9eb64b34193221a67\n")
    +    print("Usage:", script_fn,"-v mode \"word or sentence\"\n")
    +def spc_tok(target):
    +    outstr=SpaceTokenizer().tokenize(target)
    +    return simple(target,outstr)
     def sent_tok(target):
    -    if (verbose==1):
    -        print("[+] Target:", target)
    -        print("[+] Output:",end='')
    -        print(sent_tokenize(target))
    -        print("[+] Complete.")
    -    else:
    -        return sent_tokenize(target)
    +    outstr=sent_tokenize(target)
    +    return simple(target,outstr)
     def word_tok(target):
    -    if (verbose==1):
    -        print("[+] Target:", target)
    -        print("[+] Output:",end='')
    -        print(word_tokenize(target))
    -        print("[+] Complete.")
    -    else:
    -        return word_tokenize(target)
    +    outstr=word_tokenize(target)
    +    return simple(target,outstr)
     def syn_ant(target):
         #print("[+] wordnet imported.")
         for syn in wordnet.synsets(target):
    @@ -160,13 +153,8 @@ def lemma(target,pos):
         else:
             return(lemmatizer.lemmatize(target))
     def postag(target):
    -    if (verbose==1):
    -        print("[+] Target:", target)
    -        print("[+] Output:", end='')
    -        print(pos_tag(SpaceTokenizer().tokenize(target)))
    -        print("[+] Complete.")
    -    else:
    -        return(pos_tag(SpaceTokenizer().tokenize(target)))
    +    outstr=pos_tag(word_tok(target))
    +    return simple(target,outstr)
     def nltk_download():
         print("[+] NLTK Downloader launching...\n")
         nltk.download()
    @@ -220,7 +208,9 @@ def main(mode,target,pos):
             nltk_download()
         elif (totalargs <= 2) and ("VTSTech-NLTK.py" in script_fn):
             banner()
    -        print("-v verbose output (shows banner, target, definitions, etc)\n\nModes:\n\nspc_tok (SpaceTokenizer) \nsent_tok (sent_tokenize) \nword_tok (word_tokenize) \nsyn_ant (wordnet.synsets)\nhyperon (hyper/hyponyms)\nstem (PorterStemmer)\nsnow (SnowballStemmer)\nlemma (WordNetLemmatizer)\npos_tag (Part-of-Speech)\nner (Named Entity Recognition)\ndiag_act (Dialogue Action)\nnltk_download")
    +        print("-v verbose output (shows banner, target, definitions, etc)\n\nModes:\n\nspc_tok (SpaceTokenizer) \nsent_tok (sent_tokenize)")
    +        print("word_tok (word_tokenize) \nsyn_ant (wordnet.synsets)\nhyperon (hyper/hyponyms)\nstem (PorterStemmer)\nsnow (SnowballStemmer)")
    +        print("lemma (WordNetLemmatizer)\npos_tag (Part-of-Speech)\nner (Named Entity Recognition)\ndiag_act (Dialogue Action)\nnltk_download")
         else:
             if (verbose==1): banner()
             if ("spc_tok" in mode): # expects "a complete sentence."
  3. VTSTech revised this gist Mar 14, 2020. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion VTSTech-NLTK.py
    @@ -291,7 +291,7 @@ def main(mode,target,pos):
            banner()
            print("Too many arguments! Check command line. Use \" to wrap series of words.")
            quit()
    -    elif (sys.argv[x] == "-v") or (totalargs==1):
    +    elif (sys.argv[x] == "-v") or (totalargs==1 and "VTSTech-NLTK" in script_fn):
            verbose=1
        elif (sys.argv[x] == "spc_tok"):
            mode=sys.argv[x]
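
    The guard added here stops verbose mode from switching itself on unintentionally: bare invocation only implies -v when the filename still contains VTSTech-NLTK. A standalone sketch of the flag scan this revision patches (variable names follow the gist):

        import sys

        totalargs = len(sys.argv)
        script_fn = sys.argv[0]
        verbose = 0
        for x in range(totalargs):
            # -v anywhere enables verbose; so does running with no arguments,
            # but only when the script is actually invoked as VTSTech-NLTK.
            if sys.argv[x] == "-v" or (totalargs == 1 and "VTSTech-NLTK" in script_fn):
                verbose = 1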
  4. VTSTech revised this gist Mar 14, 2020. 1 changed file with 248 additions and 94 deletions.
    342 changes: 248 additions & 94 deletions VTSTech-NLTK.py
    @@ -1,7 +1,9 @@
    -#NLTK Script v0.41 2020-03-13 1:17:23 AM
    +#NLTK Script v0.42 2020-03-13 10:08:47 PM
     #Written by VTSTech (veritas@vts-tech.org)
     #Various functions inspired by code from sentdex/pythonprogramming.net
     #https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/
    +#Various functions inspired by code from Natural Language Processing with Python
    +#by Steven Bird, Ewan Klein and Edward Loper - http://www.nltk.org/book/ch01.html
     
     import sys, nltk, os, string, random
     from nltk import pos_tag
    @@ -12,101 +14,188 @@

    synonyms = []
    antonyms = []
    hypernyms = []
    hyponyms = []
    train_text = state_union.raw("1999-Clinton.txt")
    custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    posts = nltk.corpus.nps_chat.xml_posts()[:10000]

    def banner():
    print("NLTK Script v0.41 03-13-2020\nWritten by VTSTech (veritas@vts-tech.org)\nGitHub: https://gist.github.com/Veritas83/620f80a878acccf9eb64b34193221a67\n")
    print("Usage:", script_fn,"mode \"word or sentence\"\n")
    print("NLTK Script v0.42 03-13-2020\nWritten by VTSTech (veritas@vts-tech.org)\nGitHub: https://gist.github.com/Veritas83/620f80a878acccf9eb64b34193221a67\n")
    print("Usage:", script_fn,"-v mode \"word or sentence\"\n")
    def spc_tok(target):
    #print("[+] SpaceTokenizer imported.")
    print("[+] Target:", target)
    print("[+] Output:",end='')
    print(SpaceTokenizer().tokenize(target))
    print("[+] Complete.")
    if (verbose==1):
    print("[+] Target:", target)
    print("[+] Output:",end='')
    print(SpaceTokenizer().tokenize(target))
    print("[+] Complete.")
    else:
    return SpaceTokenizer().tokenize(target)
    def sent_tok(target):
    print("[+] Target:", target)
    print("[+] Output:",end='')
    print(sent_tokenize(target))
    print("[+] Complete.")
    if (verbose==1):
    print("[+] Target:", target)
    print("[+] Output:",end='')
    print(sent_tokenize(target))
    print("[+] Complete.")
    else:
    return sent_tokenize(target)
    def word_tok(target):
    print("[+] Target:", target)
    print("[+] Output:",end='')
    print(word_tokenize(target))
    print("[+] Complete.")
    if (verbose==1):
    print("[+] Target:", target)
    print("[+] Output:",end='')
    print(word_tokenize(target))
    print("[+] Complete.")
    else:
    return word_tokenize(target)
    def syn_ant(target):
    #print("[+] wordnet imported.")
    for syn in wordnet.synsets(target):
    for l in syn.lemmas():
    synonyms.append(l.name())
    if l.antonyms():
    antonyms.append(l.antonyms()[0].name())
    print("[+] Target:", target)
    if (verbose==1): print("[+] Target:", target)
    temp = wordnet.synsets(target)
    if (len(temp) >= 1): print("[+] Defined:", temp[0].definition())
    print("[+] Output:")
    if (len(set(synonyms))>0):
    print("Syn:", set(synonyms))
    if (verbose==1):
    if (len(temp) >= 1):
    print("[+] Defined:", temp[0].definition())
    print("[+] Output:")
    if (len(set(synonyms))>0):
    print("Syn:", set(synonyms))
    else:
    print("Syn: Failed to find synonym!")
    if (len(set(antonyms))>0):
    print("Ant:", set(antonyms))
    else:
    print("Ant: Failed to find antonym!")
    print("[+] Complete.")
    else:
    print("Syn: Failed to find synonym!")
    if (len(set(antonyms))>0):
    print("Ant:", set(antonyms))
    if (len(set(synonyms))>0):
    outstr="Syn:", set(synonyms)
    else:
    outstr="Syn: Failed!"
    if (len(set(antonyms))>0):
    outstr2="Ant:", set(antonyms)
    else:
    outstr2="Ant: Failed!"
    return outstr,outstr2
    def hyperon(target):
    #print("[+] wordnet imported.")
    for syn in wordnet.synsets(target):
    for l in syn.hypernyms():
    hypernyms.append(l.name())
    for l in syn.hyponyms():
    hyponyms.append(l.name())
    if (verbose==1):
    print("[+] Target:", target)
    temp = wordnet.synsets(target)
    if (len(temp) >= 1): print("[+] Defined:", temp[0].definition())
    print("[+] Output:")
    if (len(set(hypernyms))>0):
    print("Hypernyms:", set(hypernyms))
    else:
    print("Hypernyms: Failed to find hypernym!")
    if (len(set(hyponyms))>0):
    print("Hyponyms:", set(hyponyms))
    else:
    print("Hyponyms: Failed to find hyponyms!")
    print("[+] Complete.")
    else:
    print("Ant: Failed to find antonym!")
    print("[+] Complete.")
    if (len(set(hypernyms))>0):
    outstr=("Hyper:", set(hypernyms))
    else:
    outstr=("Hyper: Failed!")
    if (len(set(hyponyms))>0):
    outstr=outstr+("Hypon:", set(hyponyms))
    else:
    outstr=outstr+("Hypon: Failed!")
    return outstr
    def stem(target):
    ps = PorterStemmer()
    #print("[+] PorterStemmer imported.")
    print("[+] Target:",target)
    print("[+] Output: ", end='')
    words = SpaceTokenizer().tokenize(target)
    for w in words:
    print(ps.stem(w))
    print("[+] Complete.")
    if (verbose==1):
    print("[+] Target:",target)
    print("[+] Output: ", end='')
    words = SpaceTokenizer().tokenize(target)
    for w in words:
    print(ps.stem(w))
    print("[+] Complete.")
    else:
    words = SpaceTokenizer().tokenize(target)
    outstr=""
    for w in words:
    outstr=outstr+' '+(ps.stem(w))
    return outstr.lstrip()
    def snowball(target):
    stemmer = SnowballStemmer("english")
    #print("[+] PorterStemmer imported.")
    print("[+] Target:",target)
    print("[+] Output: ", end='')
    words = SpaceTokenizer().tokenize(target)
    for w in words:
    print(stemmer.stem(w))
    print("[+] Complete.")
    if (verbose==1):
    print("[+] Target:",target)
    words = SpaceTokenizer().tokenize(target)
    #print("DEBUG:", len(words))
    print("[+] Output: ", end='')
    for w in words:
    print(stemmer.stem(w))
    print("[+] Complete.")
    else:
    words = SpaceTokenizer().tokenize(target)
    outstr=""
    for w in words:
    outstr=outstr+' '+(stemmer.stem(w))
    return outstr.lstrip()
    def lemma(target,pos):
    lemmatizer = WordNetLemmatizer()
    #print("[+] WordNetLemmatizer imported.")
    #pos=a,s,v,r
    print("[+] Target:",target)
    print("[+] Output: ",end='')
    if (totalargs == 4):
    print(lemmatizer.lemmatize(target, pos=pos))
    if (verbose==1):
    print("[+] Target:",target)
    print("[+] Output: ",end='')
    if (totalargs == 5):
    print(lemmatizer.lemmatize(target, pos=pos))
    else:
    print(lemmatizer.lemmatize(target))
    print("[+] Complete.")
    else:
    print(lemmatizer.lemmatize(target))
    print("[+] Complete.")
    if (totalargs == 5):
    return(lemmatizer.lemmatize(target, pos=pos))
    else:
    return(lemmatizer.lemmatize(target))
    def postag(target):
    #print("[+] pos_tag, SpaceTokenizer imported.")
    print("[+] Target:", target)
    print("[+] Output:", end='')
    print(pos_tag(SpaceTokenizer().tokenize(target)))
    print("[+] Complete.")
    if (verbose==1):
    print("[+] Target:", target)
    print("[+] Output:", end='')
    print(pos_tag(SpaceTokenizer().tokenize(target)))
    print("[+] Complete.")
    else:
    return(pos_tag(SpaceTokenizer().tokenize(target)))
    def nltk_download():
    print("[+] NLTK Downloader launching...\n")
    nltk.download()
    print("[+] Complete.")
    def ner(target):
    tokenized = custom_sent_tokenizer.tokenize(target)
    try:
    for i in tokenized:
    words = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(words)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    print("[+] Target:", target)
    print("[+] Output:", end='')
    print(namedEnt)
    except Exception as e:
    print(str(e))
    print("[+] Complete.")
    outstr=""
    if (verbose==1):
    try:
    for i in tokenized:
    words = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(words)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    print("[+] Target:", target)
    print("[+] Output:", end='')
    print(namedEnt)
    except Exception as e:
    print(str(e))
    print("[+] Complete.")
    else:
    try:
    for i in tokenized:
    words = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(words)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    outstr=namedEnt
    except Exception as e:
    print(str(e))
    return outstr
    def diag_act(target):
    def dialogue_act_features(post):
    features = {}
    @@ -118,60 +207,125 @@ def dialogue_act_features(post):
    train_set, test_set = featuresets[size:], featuresets[:size]
    classifier = nltk.NaiveBayesClassifier.train(train_set)
    #print("DEBUG:", nltk.classify.accuracy(classifier, test_set))
    print("[+] Target:", target)
    print("[+] Output:", classifier.classify(dialogue_act_features(target)))
    print("[+] Complete.")
    if (verbose==1):
    print("[+] Target:", target)
    print("[+] Output:", classifier.classify(dialogue_act_features(target)))
    print("[+] Complete.")
    else:
    return classifier.classify(dialogue_act_features(target))
    ###
    def main(mode,target,pos):
    if (totalargs == 2) and ("nltk_download" in mode):
    banner()
    if (verbose==1): banner()
    nltk_download()
    elif (totalargs <= 2) and ("VTSTech-NLTK.py" in script_fn):
    banner()
    print("Modes:\n\nspc_tok (SpaceTokenizer) \nsent_tok (sent_tokenize) \nword_tok (word_tokenize) \nsyn_ant (wordnet.synsets)\nstem (PorterStemmer)\nsnow (SnowballStemmer)\nlemma (WordNetLemmatizer)\npos_tag (Part-of-Speech)\nner (Named Entity Recognition)\ndiag_act (Dialogue Action)\nnltk_download")
    print("-v verbose output (shows banner, target, definitions, etc)\n\nModes:\n\nspc_tok (SpaceTokenizer) \nsent_tok (sent_tokenize) \nword_tok (word_tokenize) \nsyn_ant (wordnet.synsets)\nhyperon (hyper/hyponyms)\nstem (PorterStemmer)\nsnow (SnowballStemmer)\nlemma (WordNetLemmatizer)\npos_tag (Part-of-Speech)\nner (Named Entity Recognition)\ndiag_act (Dialogue Action)\nnltk_download")
    else:
    banner()
    if (verbose==1): banner()
    if ("spc_tok" in mode): # expects "a complete sentence."
    spc_tok(target)
    if (verbose==1):
    spc_tok(target)
    else:
    print(spc_tok(target))
    if ("sent_tok" in mode): # expects "a complete sentence. Or two. Or three."
    sent_tok(target)
    if (verbose==1):
    sent_tok(target)
    else:
    print(sent_tok(target))
    if ("word_tok" in mode): # expects "a complete sentence."
    word_tok(target)
    if (verbose==1):
    word_tok(target)
    else:
    print(word_tok(target))
    if ("syn_ant" in mode): # expects word
    syn_ant(target)
    if (verbose==1):
    syn_ant(target)
    else:
    print(syn_ant(target))
    if ("stem" in mode): # expects word
    stem(target)
    if (verbose==1):
    stem(target)
    else:
    print(stem(target))
    if ("lemma" in mode): # expects word (tag)
    if (verbose==1):
    lemma(target,pos)
    else:
    print(lemma(target,pos))
    if ("pos_tag" in mode): # expects "a complete sentence."
    postag(target)
    if (verbose==1):
    postag(target)
    else:
    print(postag(target))
    if ("ner" in mode): # expects "a complete sentence."
    ner(target)
    if (verbose==1):
    ner(target)
    else:
    print(ner(target))
    if ("diag_act" in mode): # expects "a complete sentence."
    diag_act(target)
    if (verbose==1):
    diag_act(target)
    else:
    print(diag_act(target))
    if ("snow" in mode): # expects "a complete sentence."
    snowball(target)
    if (verbose==1):
    snowball(target)
    else:
    print(snowball(target))
    if ("hyperon" in mode): # expects "a complete sentence."
    if (verbose==1):
    hyperon(target)
    else:
    print(hyperon(target))
    ###
    totalargs = len(sys.argv)
    script_fn = sys.argv[0]
    mode=""
    target=""
    pos=""
    if (totalargs >= 4):
    mode = sys.argv[1]
    target = sys.argv[2]
    pos = sys.argv[3]
    main(mode,target,pos)
    elif (totalargs == 3):
    mode = sys.argv[1]
    target = sys.argv[2]
    main(mode,target,pos)
    elif (totalargs == 2):
    mode = sys.argv[1]
    main(mode,target,pos)
    elif (totalargs == 1):
    main(mode,target,pos)
    else:
    banner()
    print("Too many arguments! Check command line. Use \" to wrap sentences.")
    quit()
    verbose=0
    for x in range(0,totalargs,1):
    #print("DEBUG:",sys.argv[x])
    if (totalargs >= 6):
    banner()
    print("Too many arguments! Check command line. Use \" to wrap series of words.")
    quit()
    elif (sys.argv[x] == "-v") or (totalargs==1):
    verbose=1
    elif (sys.argv[x] == "spc_tok"):
    mode=sys.argv[x]
    target=sys.argv[x+1]
    elif (sys.argv[x] == "sent_tok"):
    mode=sys.argv[x]
    target=sys.argv[x+1]
    elif (sys.argv[x] == "word_tok"):
    mode=sys.argv[x]
    target=sys.argv[x+1]
    elif (sys.argv[x] == "syn_ant"):
    mode=sys.argv[x]
    target=sys.argv[x+1]
    elif (sys.argv[x] == "stem"):
    mode=sys.argv[x]
    target=sys.argv[x+1]
    elif (sys.argv[x] == "lemma"):
    mode=sys.argv[x]
    target=sys.argv[x+1]
    if (totalargs==5):
    pos=sys.argv[x+2]
    elif (sys.argv[x] == "pos_tag"):
    mode=sys.argv[x]
    target=sys.argv[x+1]
    elif (sys.argv[x] == "ner"):
    mode=sys.argv[x]
    target=sys.argv[x+1]
    elif (sys.argv[x] == "diag_act"):
    mode=sys.argv[x]
    target=sys.argv[x+1]
    elif (sys.argv[x] == "snow"):
    mode=sys.argv[x]
    target=sys.argv[x+1]
    elif (sys.argv[x] == "hyperon"):
    mode=sys.argv[x]
    target=sys.argv[x+1]
    main(mode,target,pos)
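
    Besides threading the verbose flag through every mode, this revision adds the hyperon mode. What it gathers from WordNet, as a self-contained sketch (assumes the wordnet corpus is installed via nltk.download('wordnet'); the example word is mine):

        from nltk.corpus import wordnet

        hypernyms, hyponyms = [], []
        for syn in wordnet.synsets("dog"):
            for h in syn.hypernyms():   # more general concepts
                hypernyms.append(h.name())
            for h in syn.hyponyms():    # more specific concepts
                hyponyms.append(h.name())

        print("Hyper:", set(hypernyms))  # e.g. {'canine.n.02', 'domestic_animal.n.01', ...}
        print("Hypon:", set(hyponyms))   # e.g. {'puppy.n.01', 'corgi.n.01', ...}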
  5. VTSTech revised this gist Mar 13, 2020. 1 changed file with 88 additions and 53 deletions.
    141 changes: 88 additions & 53 deletions VTSTech-NLTK.py
    @@ -1,4 +1,4 @@
    -#NLTK Script v0.4 2020-03-12 3:12:58 PM
    +#NLTK Script v0.41 2020-03-13 1:17:23 AM
     #Written by VTSTech (veritas@vts-tech.org)
     #Various functions inspired by code from sentdex/pythonprogramming.net
     #https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/
    @@ -17,24 +17,34 @@
     posts = nltk.corpus.nps_chat.xml_posts()[:10000]
     
     def banner():
    -    print("NLTK Script v0.4 03-12-2020\nWritten by VTSTech (veritas@vts-tech.org)\n")
    -    print("Usage:", os.path.basename(__file__),"mode \"word or sentence\"\n")
    -def spc_tok():
    +    print("NLTK Script v0.41 03-13-2020\nWritten by VTSTech (veritas@vts-tech.org)\nGitHub: https://gist.github.com/Veritas83/620f80a878acccf9eb64b34193221a67\n")
    +    print("Usage:", script_fn,"mode \"word or sentence\"\n")
    +def spc_tok(target):
         #print("[+] SpaceTokenizer imported.")
    -    print("[+] Target:", sys.argv[2])
    +    print("[+] Target:", target)
         print("[+] Output:",end='')
    -    print(SpaceTokenizer().tokenize(sys.argv[2]))
    +    print(SpaceTokenizer().tokenize(target))
         print("[+] Complete.")
    -def syn_ant():
    +def sent_tok(target):
    +    print("[+] Target:", target)
    +    print("[+] Output:",end='')
    +    print(sent_tokenize(target))
    +    print("[+] Complete.")
    +def word_tok(target):
    +    print("[+] Target:", target)
    +    print("[+] Output:",end='')
    +    print(word_tokenize(target))
    +    print("[+] Complete.")
    +def syn_ant(target):
         #print("[+] wordnet imported.")
    -    for syn in wordnet.synsets(sys.argv[2]):
    +    for syn in wordnet.synsets(target):
             for l in syn.lemmas():
                 synonyms.append(l.name())
                 if l.antonyms():
                     antonyms.append(l.antonyms()[0].name())
    -    print("[+] Target:", sys.argv[2])
    -    target = wordnet.synsets(sys.argv[2])
    -    if (len(target) >= 1): print("[+] Defined:", target[0].definition())
    +    print("[+] Target:", target)
    +    temp = wordnet.synsets(target)
    +    if (len(temp) >= 1): print("[+] Defined:", temp[0].definition())
         print("[+] Output:")
         if (len(set(synonyms))>0):
             print("Syn:", set(synonyms))
    @@ -45,60 +55,59 @@ def syn_ant():
         else:
             print("Ant: Failed to find antonym!")
         print("[+] Complete.")
    -def stem():
    +def stem(target):
         ps = PorterStemmer()
         #print("[+] PorterStemmer imported.")
    -    print("[+] Target:",sys.argv[2])
    +    print("[+] Target:",target)
         print("[+] Output: ", end='')
    -    words = SpaceTokenizer().tokenize(sys.argv[2])
    +    words = SpaceTokenizer().tokenize(target)
         for w in words:
             print(ps.stem(w))
         print("[+] Complete.")
    -def snowball():
    +def snowball(target):
         stemmer = SnowballStemmer("english")
         #print("[+] PorterStemmer imported.")
    -    print("[+] Target:",sys.argv[2])
    +    print("[+] Target:",target)
         print("[+] Output: ", end='')
    -    words = SpaceTokenizer().tokenize(sys.argv[2])
    +    words = SpaceTokenizer().tokenize(target)
         for w in words:
             print(stemmer.stem(w))
         print("[+] Complete.")
    -def lemma():
    +def lemma(target,pos):
         lemmatizer = WordNetLemmatizer()
         #print("[+] WordNetLemmatizer imported.")
         #pos=a,s,v,r
    -    print("[+] Target:",sys.argv[2])
    +    print("[+] Target:",target)
         print("[+] Output: ",end='')
    -    if (len(sys.argv) == 4):
    -        #print("DEBUG:", sys.argv[3])
    -        print(lemmatizer.lemmatize(sys.argv[2], pos=sys.argv[3]))
    +    if (totalargs == 4):
    +        print(lemmatizer.lemmatize(target, pos=pos))
         else:
    -        print(lemmatizer.lemmatize(sys.argv[2]))
    +        print(lemmatizer.lemmatize(target))
         print("[+] Complete.")
    -def postag():
    +def postag(target):
         #print("[+] pos_tag, SpaceTokenizer imported.")
    -    print("[+] Target:", sys.argv[2])
    +    print("[+] Target:", target)
         print("[+] Output:", end='')
    -    print(pos_tag(SpaceTokenizer().tokenize(sys.argv[2])))
    +    print(pos_tag(SpaceTokenizer().tokenize(target)))
         print("[+] Complete.")
     def nltk_download():
         print("[+] NLTK Downloader launching...\n")
         nltk.download()
         print("[+] Complete.")
    -def ner():
    -    tokenized = custom_sent_tokenizer.tokenize(sys.argv[2])
    +def ner(target):
    +    tokenized = custom_sent_tokenizer.tokenize(target)
         try:
             for i in tokenized:
                 words = nltk.word_tokenize(i)
                 tagged = nltk.pos_tag(words)
                 namedEnt = nltk.ne_chunk(tagged, binary=True)
    -            print("[+] Target:", sys.argv[2])
    +            print("[+] Target:", target)
                 print("[+] Output:", end='')
                 print(namedEnt)
         except Exception as e:
             print(str(e))
         print("[+] Complete.")
    -def diag_act():
    +def diag_act(target):
         def dialogue_act_features(post):
             features = {}
             for word in nltk.word_tokenize(post):
    @@ -109,34 +118,60 @@ def dialogue_act_features(post):
         train_set, test_set = featuresets[size:], featuresets[:size]
         classifier = nltk.NaiveBayesClassifier.train(train_set)
         #print("DEBUG:", nltk.classify.accuracy(classifier, test_set))
    -    print("[+] Target:", sys.argv[2])
    -    print("[+] Output:", classifier.classify(dialogue_act_features(sys.argv[2])))
    +    print("[+] Target:", target)
    +    print("[+] Output:", classifier.classify(dialogue_act_features(target)))
         print("[+] Complete.")
     ###
    -def main():
    -    if (len(sys.argv) == 2) and ("nltk_download" in sys.argv[1]):
    +def main(mode,target,pos):
    +    if (totalargs == 2) and ("nltk_download" in mode):
             banner()
             nltk_download()
    -    elif (len(sys.argv) <= 2) and ("VTSTech-NLTK.py" in sys.argv[0]):
    +    elif (totalargs <= 2) and ("VTSTech-NLTK.py" in script_fn):
             banner()
    -        print("Modes:\n\nspc_tok (SpaceTokenizer) \nsyn_ant (wordnet.synsets)\nstem (PorterStemmer)\nsnow (SnowballStemmer)\nlemma (WordNetLemmatizer)\npos_tag (Part-of-Speech)\nner (Named Entity Recognition)\ndiag_act (Dialogue Action)\nnltk_download")
    +        print("Modes:\n\nspc_tok (SpaceTokenizer) \nsent_tok (sent_tokenize) \nword_tok (word_tokenize) \nsyn_ant (wordnet.synsets)\nstem (PorterStemmer)\nsnow (SnowballStemmer)\nlemma (WordNetLemmatizer)\npos_tag (Part-of-Speech)\nner (Named Entity Recognition)\ndiag_act (Dialogue Action)\nnltk_download")
         else:
             banner()
    -        if ("spc_tok" in sys.argv[1]): # expects "a complete sentence."
    -            spc_tok()
    -        if ("syn_ant" in sys.argv[1]): # expects word
    -            syn_ant()
    -        if ("stem" in sys.argv[1]): # expects word
    -            stem()
    -        if ("lemma" in sys.argv[1]): # expects word (tag)
    -            lemma()
    -        if ("pos_tag" in sys.argv[1]): # expects "a complete sentence."
    -            postag()
    -        if ("ner" in sys.argv[1]): # expects "a complete sentence."
    -            ner()
    -        if ("diag_act" in sys.argv[1]): # expects "a complete sentence."
    -            diag_act()
    -        if ("snow" in sys.argv[1]): # expects "a complete sentence."
    -            snowball()
    +        if ("spc_tok" in mode): # expects "a complete sentence."
    +            spc_tok(target)
    +        if ("sent_tok" in mode): # expects "a complete sentence. Or two. Or three."
    +            sent_tok(target)
    +        if ("word_tok" in mode): # expects "a complete sentence."
    +            word_tok(target)
    +        if ("syn_ant" in mode): # expects word
    +            syn_ant(target)
    +        if ("stem" in mode): # expects word
    +            stem(target)
    +        if ("lemma" in mode): # expects word (tag)
    +            lemma(target,pos)
    +        if ("pos_tag" in mode): # expects "a complete sentence."
    +            postag(target)
    +        if ("ner" in mode): # expects "a complete sentence."
    +            ner(target)
    +        if ("diag_act" in mode): # expects "a complete sentence."
    +            diag_act(target)
    +        if ("snow" in mode): # expects "a complete sentence."
    +            snowball(target)
     ###
    -main()
    +totalargs = len(sys.argv)
    +script_fn = sys.argv[0]
    +mode=""
    +target=""
    +pos=""
    +if (totalargs >= 4):
    +    mode = sys.argv[1]
    +    target = sys.argv[2]
    +    pos = sys.argv[3]
    +    main(mode,target,pos)
    +elif (totalargs == 3):
    +    mode = sys.argv[1]
    +    target = sys.argv[2]
    +    main(mode,target,pos)
    +elif (totalargs == 2):
    +    mode = sys.argv[1]
    +    main(mode,target,pos)
    +elif (totalargs == 1):
    +    main(mode,target,pos)
    +else:
    +    banner()
    +    print("Too many arguments! Check command line. Use \" to wrap sentences.")
    +    quit()
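
    This revision replaces direct sys.argv reads with proper parameters and adds the sent_tok and word_tok modes, thin wrappers over two NLTK calls. A quick demo of the difference between them (both need the punkt models, nltk.download('punkt'); the sample text is mine):

        from nltk.tokenize import sent_tokenize, word_tokenize

        text = "a complete sentence. Or two. Or three."
        print(sent_tokenize(text))  # ['a complete sentence.', 'Or two.', 'Or three.']
        print(word_tokenize(text))  # ['a', 'complete', 'sentence', '.', 'Or', 'two', ...]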
  6. VTSTech revised this gist Mar 12, 2020. No changes.
  7. VTSTech revised this gist Mar 12, 2020. 1 changed file with 16 additions and 4 deletions.
    20 changes: 16 additions & 4 deletions VTSTech-NLTK.py
    @@ -1,4 +1,4 @@
    -#NLTK Script v0.3 2020-03-12 12:32:43 AM
    +#NLTK Script v0.4 2020-03-12 3:12:58 PM
     #Written by VTSTech (veritas@vts-tech.org)
     #Various functions inspired by code from sentdex/pythonprogramming.net
     #https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/
    @@ -7,6 +7,7 @@
     from nltk import pos_tag
     from nltk.tokenize import sent_tokenize, word_tokenize, SpaceTokenizer, PunktSentenceTokenizer
     from nltk.stem import PorterStemmer, WordNetLemmatizer
    +from nltk.stem.snowball import SnowballStemmer
     from nltk.corpus import wordnet, state_union, brown
     
     synonyms = []
    @@ -16,7 +17,7 @@
     posts = nltk.corpus.nps_chat.xml_posts()[:10000]
     
     def banner():
    -    print("NLTK Script v0.3 03-12-2020\nWritten by VTSTech (veritas@vts-tech.org)\n")
    +    print("NLTK Script v0.4 03-12-2020\nWritten by VTSTech (veritas@vts-tech.org)\n")
         print("Usage:", os.path.basename(__file__),"mode \"word or sentence\"\n")
     def spc_tok():
         #print("[+] SpaceTokenizer imported.")
    @@ -53,6 +54,15 @@ def stem():
         for w in words:
             print(ps.stem(w))
         print("[+] Complete.")
    +def snowball():
    +    stemmer = SnowballStemmer("english")
    +    #print("[+] PorterStemmer imported.")
    +    print("[+] Target:",sys.argv[2])
    +    print("[+] Output: ", end='')
    +    words = SpaceTokenizer().tokenize(sys.argv[2])
    +    for w in words:
    +        print(stemmer.stem(w))
    +    print("[+] Complete.")
     def lemma():
         lemmatizer = WordNetLemmatizer()
         #print("[+] WordNetLemmatizer imported.")
    @@ -109,7 +119,7 @@ def main():
             nltk_download()
         elif (len(sys.argv) <= 2) and ("VTSTech-NLTK.py" in sys.argv[0]):
             banner()
    -        print("Modes:\n\nspc_tok\nsyn_ant\nstem\nlemma\npos_tag\nner\ndiag_act\nnltk_download")
    +        print("Modes:\n\nspc_tok (SpaceTokenizer) \nsyn_ant (wordnet.synsets)\nstem (PorterStemmer)\nsnow (SnowballStemmer)\nlemma (WordNetLemmatizer)\npos_tag (Part-of-Speech)\nner (Named Entity Recognition)\ndiag_act (Dialogue Action)\nnltk_download")
         else:
             banner()
             if ("spc_tok" in sys.argv[1]): # expects "a complete sentence."
    @@ -118,13 +128,15 @@ def main():
                 syn_ant()
             if ("stem" in sys.argv[1]): # expects word
                 stem()
    -        if ("lemma" in sys.argv[1]): # expects word
    +        if ("lemma" in sys.argv[1]): # expects word (tag)
                 lemma()
             if ("pos_tag" in sys.argv[1]): # expects "a complete sentence."
                 postag()
             if ("ner" in sys.argv[1]): # expects "a complete sentence."
                 ner()
             if ("diag_act" in sys.argv[1]): # expects "a complete sentence."
                 diag_act()
    +        if ("snow" in sys.argv[1]): # expects "a complete sentence."
    +            snowball()
     ###
     main()
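
    The new snow mode runs SnowballStemmer ("Porter2"), a cleaned-up successor to the PorterStemmer behind the stem mode; the two often agree but not always. A side-by-side sketch (the sample words are mine):

        from nltk.stem import PorterStemmer
        from nltk.stem.snowball import SnowballStemmer

        porter = PorterStemmer()
        snowball = SnowballStemmer("english")
        for w in ["generously", "running", "fairly"]:
            # Snowball handles some adverb endings differently than the original Porter rules.
            print(w, "->", porter.stem(w), "/", snowball.stem(w))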
  8. VTSTech revised this gist Mar 12, 2020. 1 changed file with 22 additions and 5 deletions.
    27 changes: 22 additions & 5 deletions VTSTech-NLTK.py
    @@ -1,21 +1,22 @@
    -#NLTK Script v0.2 2020-03-11 9:25:03 PM
    +#NLTK Script v0.3 2020-03-12 12:32:43 AM
     #Written by VTSTech (veritas@vts-tech.org)
     #Various functions inspired by code from sentdex/pythonprogramming.net
     #https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/
     
    -import sys, nltk, os, string
    +import sys, nltk, os, string, random
     from nltk import pos_tag
     from nltk.tokenize import sent_tokenize, word_tokenize, SpaceTokenizer, PunktSentenceTokenizer
     from nltk.stem import PorterStemmer, WordNetLemmatizer
    -from nltk.corpus import wordnet, state_union
    +from nltk.corpus import wordnet, state_union, brown
     
     synonyms = []
     antonyms = []
     train_text = state_union.raw("1999-Clinton.txt")
     custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    +posts = nltk.corpus.nps_chat.xml_posts()[:10000]
     
     def banner():
    -    print("NLTK Script v0.2 03-11-2020\nWritten by VTSTech (veritas@vts-tech.org)\n")
    +    print("NLTK Script v0.3 03-12-2020\nWritten by VTSTech (veritas@vts-tech.org)\n")
         print("Usage:", os.path.basename(__file__),"mode \"word or sentence\"\n")
     def spc_tok():
         #print("[+] SpaceTokenizer imported.")
    @@ -87,14 +88,28 @@ def ner():
         except Exception as e:
             print(str(e))
         print("[+] Complete.")
    +def diag_act():
    +    def dialogue_act_features(post):
    +        features = {}
    +        for word in nltk.word_tokenize(post):
    +            features['contains({})'.format(word.lower())] = True
    +        return features
    +    featuresets = [(dialogue_act_features(post.text), post.get('class')) for post in posts]
    +    size = int(len(featuresets) * 0.1)
    +    train_set, test_set = featuresets[size:], featuresets[:size]
    +    classifier = nltk.NaiveBayesClassifier.train(train_set)
    +    #print("DEBUG:", nltk.classify.accuracy(classifier, test_set))
    +    print("[+] Target:", sys.argv[2])
    +    print("[+] Output:", classifier.classify(dialogue_act_features(sys.argv[2])))
    +    print("[+] Complete.")
     ###
     def main():
         if (len(sys.argv) == 2) and ("nltk_download" in sys.argv[1]):
             banner()
             nltk_download()
         elif (len(sys.argv) <= 2) and ("VTSTech-NLTK.py" in sys.argv[0]):
             banner()
    -        print("Modes:\n\nspc_tok\nsyn_ant\nstem\nlemma\npos_tag\nner\nnltk_download")
    +        print("Modes:\n\nspc_tok\nsyn_ant\nstem\nlemma\npos_tag\nner\ndiag_act\nnltk_download")
         else:
             banner()
             if ("spc_tok" in sys.argv[1]): # expects "a complete sentence."
    @@ -109,5 +124,7 @@ def main():
                 postag()
             if ("ner" in sys.argv[1]): # expects "a complete sentence."
                 ner()
    +        if ("diag_act" in sys.argv[1]): # expects "a complete sentence."
    +            diag_act()
     ###
     main()
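
    diag_act is the first mode with a training step: it fits a Naive Bayes classifier over bag-of-words features from the NPS Chat corpus, then labels the input's dialogue act. The pipeline, condensed from this revision (needs nltk.download('nps_chat') and 'punkt'; the sample query is mine):

        import nltk

        posts = nltk.corpus.nps_chat.xml_posts()[:10000]

        def dialogue_act_features(post):
            # One boolean feature per lowercased token in the post.
            features = {}
            for word in nltk.word_tokenize(post):
                features['contains({})'.format(word.lower())] = True
            return features

        featuresets = [(dialogue_act_features(p.text), p.get('class')) for p in posts]
        size = int(len(featuresets) * 0.1)  # hold out 10% for testing
        train_set, test_set = featuresets[size:], featuresets[:size]
        classifier = nltk.NaiveBayesClassifier.train(train_set)
        print(classifier.classify(dialogue_act_features("What time is it?")))  # e.g. whQuestion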
  9. VTSTech revised this gist Mar 12, 2020. No changes.
  10. VTSTech revised this gist Mar 12, 2020. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion VTSTech-NLTK.py
    @@ -110,4 +110,4 @@ def main():
            if ("ner" in sys.argv[1]): # expects "a complete sentence."
                ner()
     ###
    -main()
    \ No newline at end of file
    +main()
  11. VTSTech revised this gist Mar 12, 2020. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion VTSTech-NLTK.py
    @@ -15,7 +15,7 @@
     custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
     
     def banner():
    -    print("NLTK Script v0.2 03-11-2020\nWritten by veritas@vts-tech.org\n")
    +    print("NLTK Script v0.2 03-11-2020\nWritten by VTSTech (veritas@vts-tech.org)\n")
         print("Usage:", os.path.basename(__file__),"mode \"word or sentence\"\n")
     def spc_tok():
         #print("[+] SpaceTokenizer imported.")
  12. VTSTech revised this gist Mar 12, 2020. 1 changed file with 12 additions and 9 deletions.
    21 changes: 12 additions & 9 deletions VTSTech-NLTK.py
    @@ -1,5 +1,8 @@
    -#v0.1 2020-03-11 9:12:13 PM
    -#VTSTech (veritas@vts-tech.org)
    +#NLTK Script v0.2 2020-03-11 9:25:03 PM
    +#Written by VTSTech (veritas@vts-tech.org)
    +#Various functions inspired by code from sentdex/pythonprogramming.net
    +#https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/
    +
     import sys, nltk, os, string
     from nltk import pos_tag
     from nltk.tokenize import sent_tokenize, word_tokenize, SpaceTokenizer, PunktSentenceTokenizer
    @@ -12,7 +15,7 @@
     custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
     
     def banner():
    -    print("VTSTech-NLTK Script v0.1\nveritas@vts-tech.org 03-11-2020\n")
    +    print("NLTK Script v0.2 03-11-2020\nWritten by veritas@vts-tech.org\n")
         print("Usage:", os.path.basename(__file__),"mode \"word or sentence\"\n")
     def spc_tok():
         #print("[+] SpaceTokenizer imported.")
    @@ -94,17 +97,17 @@ def main():
             print("Modes:\n\nspc_tok\nsyn_ant\nstem\nlemma\npos_tag\nner\nnltk_download")
         else:
             banner()
    -        if ("spc_tok" in sys.argv[1]):
    +        if ("spc_tok" in sys.argv[1]): # expects "a complete sentence."
                 spc_tok()
    -        if ("syn_ant" in sys.argv[1]):
    +        if ("syn_ant" in sys.argv[1]): # expects word
                 syn_ant()
    -        if ("stem" in sys.argv[1]):
    +        if ("stem" in sys.argv[1]): # expects word
                 stem()
    -        if ("lemma" in sys.argv[1]):
    +        if ("lemma" in sys.argv[1]): # expects word
                 lemma()
    -        if ("pos_tag" in sys.argv[1]):
    +        if ("pos_tag" in sys.argv[1]): # expects "a complete sentence."
                 postag()
    -        if ("ner" in sys.argv[1]):
    +        if ("ner" in sys.argv[1]): # expects "a complete sentence."
                 ner()
     ###
     main()
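
    Among the modes annotated here, lemma is the one whose extra argument is easy to miss: the optional WordNet part-of-speech tag (a, s, v, r, n; the script's own #pos=a,s,v,r comment) changes what the lemmatizer is allowed to strip. A short demo (needs nltk.download('wordnet'); the example words are mine):

        from nltk.stem import WordNetLemmatizer

        lem = WordNetLemmatizer()
        print(lem.lemmatize("better"))            # 'better' (defaults to noun)
        print(lem.lemmatize("better", pos="a"))   # 'good'
        print(lem.lemmatize("running", pos="v"))  # 'run'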
  13. VTSTech created this gist Mar 12, 2020.
    110 changes: 110 additions & 0 deletions VTSTech-NLTK.py
    @@ -0,0 +1,110 @@
    +#v0.1 2020-03-11 9:12:13 PM
    +#VTSTech (veritas@vts-tech.org)
    +import sys, nltk, os, string
    +from nltk import pos_tag
    +from nltk.tokenize import sent_tokenize, word_tokenize, SpaceTokenizer, PunktSentenceTokenizer
    +from nltk.stem import PorterStemmer, WordNetLemmatizer
    +from nltk.corpus import wordnet, state_union
    +
    +synonyms = []
    +antonyms = []
    +train_text = state_union.raw("1999-Clinton.txt")
    +custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    +
    +def banner():
    +    print("VTSTech-NLTK Script v0.1\nveritas@vts-tech.org 03-11-2020\n")
    +    print("Usage:", os.path.basename(__file__),"mode \"word or sentence\"\n")
    +def spc_tok():
    +    #print("[+] SpaceTokenizer imported.")
    +    print("[+] Target:", sys.argv[2])
    +    print("[+] Output:",end='')
    +    print(SpaceTokenizer().tokenize(sys.argv[2]))
    +    print("[+] Complete.")
    +def syn_ant():
    +    #print("[+] wordnet imported.")
    +    for syn in wordnet.synsets(sys.argv[2]):
    +        for l in syn.lemmas():
    +            synonyms.append(l.name())
    +            if l.antonyms():
    +                antonyms.append(l.antonyms()[0].name())
    +    print("[+] Target:", sys.argv[2])
    +    target = wordnet.synsets(sys.argv[2])
    +    if (len(target) >= 1): print("[+] Defined:", target[0].definition())
    +    print("[+] Output:")
    +    if (len(set(synonyms))>0):
    +        print("Syn:", set(synonyms))
    +    else:
    +        print("Syn: Failed to find synonym!")
    +    if (len(set(antonyms))>0):
    +        print("Ant:", set(antonyms))
    +    else:
    +        print("Ant: Failed to find antonym!")
    +    print("[+] Complete.")
    +def stem():
    +    ps = PorterStemmer()
    +    #print("[+] PorterStemmer imported.")
    +    print("[+] Target:",sys.argv[2])
    +    print("[+] Output: ", end='')
    +    words = SpaceTokenizer().tokenize(sys.argv[2])
    +    for w in words:
    +        print(ps.stem(w))
    +    print("[+] Complete.")
    +def lemma():
    +    lemmatizer = WordNetLemmatizer()
    +    #print("[+] WordNetLemmatizer imported.")
    +    #pos=a,s,v,r
    +    print("[+] Target:",sys.argv[2])
    +    print("[+] Output: ",end='')
    +    if (len(sys.argv) == 4):
    +        #print("DEBUG:", sys.argv[3])
    +        print(lemmatizer.lemmatize(sys.argv[2], pos=sys.argv[3]))
    +    else:
    +        print(lemmatizer.lemmatize(sys.argv[2]))
    +    print("[+] Complete.")
    +def postag():
    +    #print("[+] pos_tag, SpaceTokenizer imported.")
    +    print("[+] Target:", sys.argv[2])
    +    print("[+] Output:", end='')
    +    print(pos_tag(SpaceTokenizer().tokenize(sys.argv[2])))
    +    print("[+] Complete.")
    +def nltk_download():
    +    print("[+] NLTK Downloader launching...\n")
    +    nltk.download()
    +    print("[+] Complete.")
    +def ner():
    +    tokenized = custom_sent_tokenizer.tokenize(sys.argv[2])
    +    try:
    +        for i in tokenized:
    +            words = nltk.word_tokenize(i)
    +            tagged = nltk.pos_tag(words)
    +            namedEnt = nltk.ne_chunk(tagged, binary=True)
    +            print("[+] Target:", sys.argv[2])
    +            print("[+] Output:", end='')
    +            print(namedEnt)
    +    except Exception as e:
    +        print(str(e))
    +    print("[+] Complete.")
    +###
    +def main():
    +    if (len(sys.argv) == 2) and ("nltk_download" in sys.argv[1]):
    +        banner()
    +        nltk_download()
    +    elif (len(sys.argv) <= 2) and ("VTSTech-NLTK.py" in sys.argv[0]):
    +        banner()
    +        print("Modes:\n\nspc_tok\nsyn_ant\nstem\nlemma\npos_tag\nner\nnltk_download")
    +    else:
    +        banner()
    +        if ("spc_tok" in sys.argv[1]):
    +            spc_tok()
    +        if ("syn_ant" in sys.argv[1]):
    +            syn_ant()
    +        if ("stem" in sys.argv[1]):
    +            stem()
    +        if ("lemma" in sys.argv[1]):
    +            lemma()
    +        if ("pos_tag" in sys.argv[1]):
    +            postag()
    +        if ("ner" in sys.argv[1]):
    +            ner()
    +###
    +main()
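
    The original v0.1 already carries the ner mode shown above: a PunktSentenceTokenizer trained on a Clinton State of the Union address splits the input, then each sentence is tokenized, POS-tagged, and chunked. The three core calls, minus the custom splitter (needs punkt, averaged_perceptron_tagger, maxent_ne_chunker, and words via nltk.download(); the sample sentence is mine):

        import nltk

        words = nltk.word_tokenize("Mark Zuckerberg visited Toronto on Tuesday.")
        tagged = nltk.pos_tag(words)
        namedEnt = nltk.ne_chunk(tagged, binary=True)  # binary=True flags NEs without typing them
        print(namedEnt)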