Bayesian_classifier
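Two NLTK Naive Bayes text classifiers over a categorized plain-text corpus: a raw bag-of-words baseline, followed by a variant trained on POS-tagged, WordNet-lemmatized tokens.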
import random

import nltk
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

# Read the categorized corpus; cat_pattern captures the first path
# component of each file as its category.
file = "dataset"
collCategorized = CategorizedPlaintextCorpusReader(file, r'.*\.txt', cat_pattern=r'(\w+)/*', encoding="utf8")
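# Assumed directory layout (implied by the cat_pattern; the category names
# below are illustrative, not from the original gist):
#   dataset/
#       sport/article1.txt
#       politics/article2.txt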
documents = [(list(collCategorized.words(fileid)), category)
             for category in collCategorized.categories()
             for fileid in collCategorized.fileids(category)]
random.shuffle(documents)

listaFreq = nltk.FreqDist(w.lower() for w in collCategorized.words())
word_features = [x for (x, _) in listaFreq.most_common()]
def document_features(document):
    """Binary bag-of-words features: one contains(word) flag per feature word."""
    document_words = set(document)
    features = {}
    for word in word_features:
        features["contains({})".format(word)] = (word in document_words)
    return features
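# For instance, document_features(["hello", "world"]) yields one boolean
# entry per feature word, e.g. {"contains(hello)": True,
# "contains(the)": False, ...} (example words are illustrative).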
featuresets_doc = [(document_features(d), c) for (d, c) in documents]

# Train on the first 80% of the shuffled documents, test on the rest.
size = int(len(featuresets_doc) * 0.8)
train_set_doc, test_set_doc = featuresets_doc[:size], featuresets_doc[size:]

classifier_doc = nltk.NaiveBayesClassifier.train(train_set_doc)
print("accuracy: {}".format(nltk.classify.accuracy(classifier_doc, test_set_doc) * 100))
classifier_doc.show_most_informative_features(10)
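# Usage sketch (not part of the original gist): classify an unseen token
# list with the same feature extractor; the tokens here are made up.
new_doc = ["the", "team", "won", "the", "final"]
print(classifier_doc.classify(document_features(new_doc)))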
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
def univ_pos_changer(stringa):
    """Map a universal POS tag to the corresponding WordNet POS constant."""
    if stringa == "NOUN":
        return "n"
    elif stringa == "VERB":
        return "v"
    elif stringa == "ADJ":
        return "a"
    elif stringa == "ADV":
        return "r"
    else:
        return None
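# For example, univ_pos_changer("NOUN") -> "n", so wn.morphy("cats", "n")
# returns "cat"; tags outside these four classes (e.g. "ADP") map to None.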
def my_lemmatizer(frase):
    """Lemmatize a list of "word_TAG" tokens, keeping each tag attached.

    Uses WordNet's morphy for the four mappable tag classes, falls back to
    the NLTK WordNetLemmatizer when morphy has no answer, and leaves tokens
    with unmappable tags unchanged.
    """
    lista1 = []
    for parole in frase:
        word = parole.split("_")[0].lower()
        tag = parole.split("_")[1]
        if univ_pos_changer(tag) is None:
            lista1.append(word + "_" + tag)
        elif wn.morphy(word, univ_pos_changer(tag)) is None:
            lista1.append(wnl.lemmatize(word) + "_" + tag)
        else:
            lista1.append(wn.morphy(word, univ_pos_changer(tag)) + "_" + tag)
    return lista1
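# Illustrative round trip (tokens follow the word_TAG format built below):
# my_lemmatizer(["dogs_NOUN", "ran_VERB", "quickly_ADV"])
#   -> ["dog_NOUN", "run_VERB", "quickly_ADV"]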
import random

import nltk
from nltk import word_tokenize
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

# Re-read the corpus, this time without an explicit encoding.
file = "dataset"
collCategorized = CategorizedPlaintextCorpusReader(file, r'.*\.txt', cat_pattern=r'(\w+)/*')
documents1 = [(list(collCategorized.words(fileid)), category)
              for category in collCategorized.categories()
              for fileid in collCategorized.fileids(category)]
random.shuffle(documents1)

corpus = []
corpus_tagged = []
for text, cat in documents1:
    # POS-tag with the universal tagset, then rebuild each token as
    # "word_TAG" (underscores inside words are stripped first).
    tagged = nltk.pos_tag(word_tokenize(" ".join(text)), tagset="universal")
    word_tags = [a.replace("_", "") + "_" + b for a, b in tagged]
    corpus.append((word_tags, cat))
    corpus_tagged.append((my_lemmatizer(word_tags), cat))
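# Each corpus_tagged entry now pairs lemmatized word_TAG tokens with the
# document's category, e.g. (["dog_NOUN", "run_VERB", ...], "sport")
# (category name illustrative).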
words = []
for articoli in corpus_tagged:
    for testo in articoli[0]:
        words.append(testo)

listaFreq = nltk.FreqDist(w for w in words)
# Skip the 75 most frequent tokens; use the next 500 as features.
word_features = [x for (x, _) in listaFreq.most_common()[75:575]]
def document_features(document):
    """Binary bag-of-words features over the selected word_TAG vocabulary."""
    document_words = set(document)
    features = {}
    for word in word_features:
        features["contains({})".format(word)] = (word in document_words)
    return features
featuresets_doc = [(document_features(d), c) for (d, c) in corpus_tagged]

# Train on the first 80% of the shuffled documents, test on the rest.
size = int(len(featuresets_doc) * 0.8)
train_set_doc, test_set_doc = featuresets_doc[:size], featuresets_doc[size:]

classifier_doc = nltk.NaiveBayesClassifier.train(train_set_doc)
print(nltk.classify.accuracy(classifier_doc, test_set_doc) * 100)
classifier_doc.show_most_informative_features(10)
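# Optional sketch (not in the original gist): inspect per-class errors on
# the held-out set with nltk's ConfusionMatrix.
ref = [c for (_, c) in test_set_doc]
pred = [classifier_doc.classify(f) for (f, _) in test_set_doc]
print(nltk.ConfusionMatrix(ref, pred))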