Last active
October 30, 2017 13:07
-
-
Save chumpblocckami/f7e8e3c8b7f7c60689bbb1207ffe997c to your computer and use it in GitHub Desktop.
Bayesian_classifier
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
from nltk import * | |
import random | |
from nltk.corpus.reader import CategorizedPlaintextCorpusReader | |
# Root of the corpus: one sub-directory per category, each holding .txt documents.
corpus_root = "dataset"
collCategorized = CategorizedPlaintextCorpusReader(
    corpus_root, r'.*\.txt', cat_pattern=r'(\w+)/*', encoding="utf8")
# Pair every document (as a list of words) with its category label.
documents = [
    (list(collCategorized.words(fid)), cat)
    for cat in collCategorized.categories()
    for fid in collCategorized.fileids(cat)
]
# Shuffle so the later train/test slicing is not biased by category order.
random.shuffle(documents)
# Frequency of every lower-cased word in the whole corpus.
listaFreq = nltk.FreqDist(token.lower() for token in collCategorized.words())
# Every distinct word, ordered by frequency, used as the bag-of-words vocabulary.
word_features = [w for (w, _) in listaFreq.most_common()]
def document_features(document):
    """Build a binary bag-of-words feature dict for *document*.

    For every word in the global ``word_features`` vocabulary the returned
    dict maps "contains(<word>)" to True/False depending on whether the
    word occurs in *document*.
    """
    present = set(document)  # set membership is O(1) per vocabulary word
    return {"contains({})".format(word): word in present
            for word in word_features}
# Feature-vector / label pairs for every document.
featuresets_doc = [(document_features(d), c) for (d, c) in documents]
# 80/20 train-test split.  BUG FIX: the original assigned the first 80% to the
# *test* set and trained on only the last 20%; the slices are now the
# conventional way round (train on 80%, evaluate on the held-out 20%).
size = int(len(featuresets_doc) * 0.8)
train_set_doc, test_set_doc = featuresets_doc[:size], featuresets_doc[size:]
classifier_doc = nltk.NaiveBayesClassifier.train(train_set_doc)
# Report held-out accuracy as a percentage.
print("accuracy: {}".format(nltk.classify.accuracy(classifier_doc, test_set_doc)*100))
classifier_doc.show_most_informative_features(10)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
from nltk import * | |
from nltk.corpus.reader import CategorizedPlaintextCorpusReader | |
from nltk.corpus import wordnet as wn | |
import pandas as pd | |
import re, string | |
from nltk.stem import WordNetLemmatizer | |
# Single shared lemmatizer instance; used by my_lemmatizer() as a fallback
# when wn.morphy() returns None for a word.
wnl = WordNetLemmatizer()
def univ_pos_changer(stringa):
    """Map a universal POS tag to the corresponding WordNet POS letter.

    Returns "n"/"v"/"a"/"r" for NOUN/VERB/ADJ/ADV and None for any tag
    that has no WordNet equivalent (punctuation, pronouns, ...).
    """
    return {"NOUN": "n", "VERB": "v", "ADJ": "a", "ADV": "r"}.get(stringa)
def my_lemmatizer(frase):
    """Lemmatize a list of "word_TAG" tokens (TAG is a universal POS tag).

    Each token is split on "_" into word and tag.  Tags with no WordNet
    equivalent leave the word unchanged; otherwise wn.morphy() is tried
    first and NLTK's WordNetLemmatizer is used as a fallback when morphy
    finds nothing.  Returns a new list of "lemma_TAG" strings.
    """
    lemmas = []
    for token in frase:
        parts = token.split("_")
        word = parts[0].lower()
        tag = parts[1]
        # Hoisted: the original re-evaluated univ_pos_changer(tag) up to three
        # times and wn.morphy(...) twice per token.
        wn_pos = univ_pos_changer(tag)
        if wn_pos is None:
            # Tag (e.g. ".", "ADP", "PRON") has no WordNet POS: keep word as-is.
            lemmas.append(word + "_" + tag)
            continue
        lemma = wn.morphy(word, wn_pos)
        if lemma is None:
            # morphy found no lemma: fall back to the default lemmatizer.
            lemma = wnl.lemmatize(word)
        lemmas.append(lemma + "_" + tag)
    return lemmas
import nltk | |
from nltk import * | |
import random | |
from nltk.corpus.reader import CategorizedPlaintextCorpusReader | |
# Root of the corpus: one sub-directory per category, each holding .txt files.
file = "dataset"
# No explicit encoding this time ("togliere la codifica" = remove the encoding).
collCategorized = CategorizedPlaintextCorpusReader(file, r'.*\.txt', cat_pattern=r'(\w+)/*')
# (document word-list, category) pairs, shuffled so slicing is unbiased.
documents1 = [(list(collCategorized.words(fileid)), category)
              for category in collCategorized.categories()
              for fileid in collCategorized.fileids(category)]
random.shuffle(documents1)

corpus = []         # documents as lists of "word_TAG" tokens
corpus_tagged = []  # the same documents after lemmatization
# CLEANUP: removed the dead `documents_pos`/`words_` variables, the stray
# semicolon in `i = 0;`, and the manual index `i` that only re-read the
# entry just appended — the entry is used directly instead.
for text, cat in documents1:
    tagged = nltk.pos_tag(word_tokenize(" ".join(text)), tagset="universal")
    # Strip "_" from words so "word_TAG" splits unambiguously downstream.
    tokens = [word.replace("_", "") + "_" + pos for word, pos in tagged]
    corpus.append((tokens, cat))
    corpus_tagged.append((my_lemmatizer(tokens), cat))

# Flatten every lemmatized document into a single token list.
words = [token for doc_tokens, _ in corpus_tagged for token in doc_tokens]
listaFreq = nltk.FreqDist(words)
# Skip the 75 most frequent tokens (mostly stop words / punctuation) and keep
# the next 500 as the feature vocabulary.
word_features = [x for (x, _) in listaFreq.most_common()[75:575]]
def document_features(document):
    """Return binary bag-of-words features for *document*.

    One "contains(<word>)" -> bool entry per word in the global
    ``word_features`` vocabulary.
    """
    vocabulary_hits = set(document)
    features = {}
    for feat in word_features:
        features["contains({})".format(feat)] = feat in vocabulary_hits
    return features
# Feature-vector / label pairs for every lemmatized document.
featuresets_doc = [(document_features(d), c) for (d, c) in corpus_tagged]
# 80/20 train-test split.  BUG FIX: the original trained on the last 20% and
# tested on the first 80%; slices are now the conventional way round.
size = int(len(featuresets_doc) * 0.8)
train_set_doc, test_set_doc = featuresets_doc[:size], featuresets_doc[size:]
classifier_doc = nltk.NaiveBayesClassifier.train(train_set_doc)
# Held-out accuracy as a percentage.
print(nltk.classify.accuracy(classifier_doc, test_set_doc)*100)
classifier_doc.show_most_informative_features(10)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment