Skip to content

Instantly share code, notes, and snippets.

View gaganmanku96's full-sized avatar

Gagandeep Singh gaganmanku96

  • Gurgaon
View GitHub Profile
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
@gaganmanku96
gaganmanku96 / packages.py
Created May 19, 2019 10:11
importing packages
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
class LabeledLineSentence:
    """Iterable that streams tagged documents out of a CSV file.

    Every iteration re-reads the CSV at ``fileName``, takes its ``text``
    column and yields one ``TaggedDocument`` per row, tagged with the row's
    position in that column. Because ``__iter__`` starts from scratch each
    time, the object can be iterated more than once.

    NOTE(review): ``self.preprocess`` and ``TaggedDocument`` are not defined
    or imported in the visible part of this file. ``preprocess`` must be a
    method returning a string (its result is ``.split()``), and
    ``TaggedDocument`` is presumably gensim's -- confirm both are available.
    """

    def __init__(self, fileName):
        """Store the path of the CSV file to read documents from."""
        self.fileName = fileName

    def __iter__(self):
        """Yield a ``TaggedDocument`` for each entry of the ``text`` column."""
        df = pd.read_csv(self.fileName)
        text = df['text'].values
        # Wrap the array itself (it has a length) instead of the enumerate
        # object, so tqdm can display a total and an ETA. The (idx, doc)
        # pairs produced are unchanged.
        for idx, doc in enumerate(tqdm(text)):
            doc = self.preprocess(doc)
            yield TaggedDocument(words=doc.split(), tags=[idx])
# Corpus source; '..' is a placeholder for the path to the CSV file.
iterator = LabeledLineSentence('..')

# Build the document-embedding model. Doc2Vec is presumably gensim's
# (it is not imported in the visible part of this file -- confirm).
model = Doc2Vec(
    iterator,
    min_count=1,
    vector_size=250,
    sample=1e-4,
    negative=6,
    workers=4,
    epochs=2,
)

# NOTE(review): if Doc2Vec already trains when it is given a corpus in the
# constructor (as gensim's does), this call is an additional 5-epoch
# training pass on top of that -- confirm this is intended.
model.train(
    iterator,
    total_examples=model.corpus_count,
    epochs=5,
)

# Sanity-check the learned word vectors by querying nearest neighbours.
# NOTE(review): the 'wine' query appears twice in the original and the
# return values are not stored or printed.
model.wv.most_similar('wine')
model.wv.most_similar('wine')
model.wv.most_similar('food')
# Load the labels from the 'target' column of the CSV ('..' is a
# placeholder for the real path). Only that column is parsed, since nothing
# else from the file is used here.
df = pd.read_csv('..', usecols=['target'])
rating = df['target'].values
# Drop the DataFrame; only the label array is kept.
del df
'''
These sizes depend on the length of your dataset. For example, if it has 25k rows, you can use 20k for training and 5k for testing.
'''
# Pre-allocated buffers: 20000 training rows and 5000 test rows, 250 columns
# each (presumably one column per embedding dimension -- confirm against the
# vector size used by the model).
# NOTE(review): as written these hold constant values only; nothing in the
# visible code fills them with real vectors or labels.
train_arrays = np.ones(shape=(20000, 250))
train_labels = np.ones(shape=20000, dtype=int)
test_arrays = np.zeros(shape=(5000, 250))
# Logistic-regression classifier with the L-BFGS solver and C=0.5.
classifier = LogisticRegression(C=0.5, solver='lbfgs')

# Fit on the training matrix and its labels.
# NOTE(review): the label array must contain at least two distinct classes
# for fitting to be meaningful -- confirm it has been populated with real
# labels before this point.
classifier.fit(X=train_arrays, y=train_labels)

# Predicted class label for every row of the test matrix.
pred = classifier.predict(X=test_arrays)
'''
You can check the accuracy of the model using sklearn's confusion_matrix or accuracy_score functions.
'''