This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from tqdm import tqdm | |
import pandas as pd | |
import numpy as np | |
import re | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score | |
from sklearn.utils import shuffle | |
from sklearn.linear_model import LogisticRegression |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from tqdm import tqdm | |
import pandas as pd | |
import numpy as np | |
import re | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score | |
from sklearn.utils import shuffle | |
from sklearn.linear_model import LogisticRegression |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class LabeledLineSentence:
    """Re-iterable corpus that yields one tagged document per row of a CSV file.

    Each pass over the object re-reads the CSV at ``fileName``, takes its
    ``text`` column, and yields a ``TaggedDocument`` whose words are the
    whitespace-split preprocessed text and whose single tag is the row index.

    NOTE(review): ``self.preprocess`` is called in ``__iter__`` but is not
    defined in the visible part of this class -- confirm it is provided
    elsewhere (e.g. further down the original file or by a subclass).
    """

    def __init__(self, fileName):
        # Path (or any source accepted by ``pd.read_csv``) of the CSV corpus.
        self.fileName = fileName

    def __iter__(self):
        df = pd.read_csv(self.fileName)
        text = df['text'].values
        # Wrap the array itself rather than the enumerate() object: enumerate
        # has no length, so tqdm could not display a total/ETA when it was the
        # outer wrapper. The yielded (idx, doc) pairs are unchanged.
        for idx, doc in enumerate(tqdm(text)):
            doc = self.preprocess(doc)
            yield TaggedDocument(words=doc.split(), tags=[idx])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Corpus object over the CSV file; '..' looks like a placeholder path --
# replace it with the real CSV location.
iterator = LabeledLineSentence('..')

# NOTE(review): gensim documents that supplying the corpus to the constructor
# already builds the vocabulary and trains for `epochs` passes -- confirm that
# any later explicit train() call is intended as additional training.
model = Doc2Vec(
    documents=iterator,
    vector_size=250,
    min_count=1,
    sample=1e-4,
    negative=6,
    workers=4,
    epochs=2,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Run five more training passes over the same corpus, using the document
# count the model recorded for it.
model.train(
    iterator,
    total_examples=model.corpus_count,
    epochs=5,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Look up the word-vector entries most similar to 'wine'.
model.wv.most_similar(positive='wine')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Look up the word-vector entries most similar to 'wine'.
model.wv.most_similar(positive='wine')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Look up the word-vector entries most similar to 'food'.
model.wv.most_similar(positive='food')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read the 'target' column of the CSV into `rating`
# (presumably one label per document row -- confirm against the corpus file).
df = pd.read_csv('..')
rating = df['target'].values
del df  # drop the DataFrame name once the column has been extracted

# Split sizes are determined by the length of your dataset: with 25k rows,
# for example, use 20k rows for training and 5k rows for testing.
TRAIN_SIZE = 20000
TEST_SIZE = 5000
# Width of each feature row.
# NOTE(review): presumably must equal the Doc2Vec vector_size -- confirm.
VECTOR_SIZE = 250

# Pre-allocated feature matrices and training labels.
train_arrays = np.ones((TRAIN_SIZE, VECTOR_SIZE))
train_labels = np.ones(TRAIN_SIZE, dtype='int')
test_arrays = np.zeros((TEST_SIZE, VECTOR_SIZE))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fit a logistic-regression classifier on the training matrix and labels,
# then predict labels for the test matrix.
classifier = LogisticRegression(C=0.5, solver='lbfgs')
classifier.fit(X=train_arrays, y=train_labels)
pred = classifier.predict(X=test_arrays)
'''
You can check the accuracy of model using sklearn's confusion matrix or accuracy_score function.
'''
OlderNewer