Last active
September 14, 2019 06:08
-
-
Save chaitanyarahalkar/9212b392f2e61cf732598ab63994b37f to your computer and use it in GitHub Desktop.
Restaurant Review Analysis - Natural Language Processing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
import re | |
import nltk | |
from nltk.corpus import stopwords | |
from nltk.stem.porter import PorterStemmer | |
from sklearn.feature_extraction.text import CountVectorizer | |
from sklearn.model_selection import train_test_split | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.metrics import confusion_matrix | |
import requests | |
import os | |
# Download the NLTK stop-word corpus that the cleaning step reads from.
nltk.download('stopwords')

# Fetch the dataset only when no local copy exists in the current working
# directory; later runs reuse the file instead of hitting the network again.
DATASET_PATH = "Restaurant_Reviews.tsv"
DATASET_URL = 'https://media.geeksforgeeks.org/wp-content/uploads/Restaurant_Reviews.tsv'
if not os.path.exists(DATASET_PATH):
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(DATASET_URL, timeout=30)
    # Fail loudly on an HTTP error instead of saving the error page as if it
    # were the dataset; a bad file would otherwise be reused by every later
    # run because of the existence check above.
    r.raise_for_status()
    with open(DATASET_PATH, 'wb') as f:
        f.write(r.content)

# The file is tab-separated.
dataset = pd.read_csv(DATASET_PATH, delimiter='\t')
# Cleaning the dataset
# Each review becomes a single space-separated string of stemmed,
# lower-cased tokens with English stop words removed.
corpus = []
ps = PorterStemmer()
# Stored under its own name so the imported `stopwords` corpus module is not
# shadowed by the set built from it.
stop_words = set(stopwords.words('english'))
# Compiled once outside the loop. Matches every character that is not an
# ASCII letter (digits, punctuation, whitespace, ...), not only numbers.
non_letters = re.compile(r'[^a-zA-Z]')
# Iterate over every row instead of a hard-coded range(0, 1000): the corpus
# then always has exactly one entry per row of `dataset`, whatever its size.
for raw_review in dataset['Review']:
    review = non_letters.sub(' ', raw_review)  # non-letters -> spaces
    review = review.lower()  # Converting all letters to lowercase
    words = review.split()
    # Drop stop words first, then stem what remains.
    stems = [ps.stem(word) for word in words if word not in stop_words]
    corpus.append(" ".join(stems))
# Bag-of-words vectorizer, vocabulary capped at 1500 features
cv = CountVectorizer(max_features=1500)

# Feature matrix: token counts of the cleaned corpus, converted to an array
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()
# Target vector: the second column of the dataset
y = dataset.iloc[:, 1].values

# Reserve a quarter of the samples for testing
splits = train_test_split(X, y, test_size=0.25)
X_train, X_test, y_train, y_test = splits
# Random forest with 501 trees, using the entropy split criterion
model = RandomForestClassifier(criterion='entropy', n_estimators=501)
model.fit(X_train, y_train)

# Predict labels for the test samples
y_pred = model.predict(X_test)

# Accuracy: percentage of test samples whose prediction equals the true label
correct = (y_pred == y_test).sum()
accuracy = correct / len(y_test) * 100
print("Accuracy: ", accuracy, "%")
# Print the confusion matrix as a labelled table:
# rows carry the actual classes, columns the predicted ones.
print("Confusion matrix:")
rows = ["Actual No", "Actual Yes"]
cols = ["Predicted No", "Predicted Yes"]
cm = confusion_matrix(y_test, y_pred)
df = pd.DataFrame(data=cm, index=rows, columns=cols)
print(df)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
nltk
numpy
pandas
scikit-learn
requests
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment