Skip to content

Instantly share code, notes, and snippets.

View Tathagatd96's full-sized avatar

Tathagat Dasgupta Tathagatd96

View GitHub Profile
# Train and evaluate a Naive Bayes text classifier on four 20-newsgroups
# categories: first step by step (counts -> tf-idf -> classifier), then as a
# single Pipeline scored on the held-out test split.

# Imported here because this section executes before the file's import lines;
# without these, every sklearn/numpy name below is unbound.
import numpy as np
import sklearn.datasets
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# NOTE(review): the return value is discarded, so this call only has the
# effect of reading the local copy (and failing if the path is missing).
# Confirm whether it is still needed.
sklearn.datasets.load_files("C://Users/Tathagat Dasgupta/Desktop/ML Project/20news-18828")
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
# print(...) with a single argument behaves identically on Python 2 and 3.
print("hello")
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
#twenty_train.target_names=['alt.atheism','comp.graphics','sci.med','soc.religion.christian']
print(len(twenty_train.data))

# Bag-of-words counts. The fitted vectorizer is kept so that new documents
# can be mapped onto the same vocabulary further down.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

#tf-idf
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

#Classifier Training
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
docs_new = ['God is love', 'OpenGL on the GPU is fast']
# transform (not fit_transform): reuse the vocabulary and idf weights that
# were learned from the training data.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# The same three steps chained into one estimator that accepts raw text.
# NOTE(review): the original never defined text_clf; MultinomialNB is used
# here to match the classifier trained above. Confirm the intended model.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(twenty_train.data, twenty_train.target)

#Performance on test set
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)
doc_test = twenty_test.data
predicted = text_clf.predict(doc_test)
print("Classifier Accuracy:")
# Fraction of test documents whose predicted label equals the true label.
print(np.mean(predicted == twenty_test.target))
from sklearn.datasets import fetch_20newsgroups
import sklearn.datasets
from sklearn.feature_extraction.text import CountVectorizer,CharNGramAnalyzer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.svm.sparse import LinearSVC
import numpy as np
# Load the training split for four 20-newsgroups categories and show a
# sample: the corpus size, the first three lines of the first document, and
# that document's category name.

# NOTE(review): the return value is discarded, so this call only has the
# effect of reading the local copy (and failing if the path is missing).
# Confirm whether it is still needed.
sklearn.datasets.load_files("C://Users/Tathagat Dasgupta/Desktop/ML Project/20news-18828")
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
# print(...) with a single argument behaves identically on Python 2 and 3,
# and matches the call style already used further down.
print("hello")
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
print(len(twenty_train.data))
print("\n".join(twenty_train.data[0].split("\n")[:3]))
# target holds integer indices into target_names.
print(twenty_train.target_names[twenty_train.target[0]])
#Preprocessing
#Tokenizing text
from sklearn.datasets import fetch_20newsgroups
import sklearn.datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
import numpy as np
# Load the training split for four 20-newsgroups categories and inspect it:
# corpus size, the first three lines of the first document, its category
# name, and the labels of the first ten documents (as indices, then names).

# NOTE(review): the return value is discarded, so this call only has the
# effect of reading the local copy (and failing if the path is missing).
# Confirm whether it is still needed.
sklearn.datasets.load_files("C://Users/Tathagat Dasgupta/Desktop/ML Project/20news-18828")
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
# print(...) with a single argument behaves identically on Python 2 and 3,
# and matches the call style already used further down.
print("hello")
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
print(len(twenty_train.data))
print("\n".join(twenty_train.data[0].split("\n")[:3]))
# target holds integer indices into target_names.
print(twenty_train.target_names[twenty_train.target[0]])
print(twenty_train.target[:10])
# The loop body must be indented; at column 0 it is an IndentationError.
for t in twenty_train.target[:10]:
    print(twenty_train.target_names[t])