Skip to content

Instantly share code, notes, and snippets.

@chaitanyarahalkar
Last active September 14, 2019 06:08
Show Gist options
  • Save chaitanyarahalkar/9212b392f2e61cf732598ab63994b37f to your computer and use it in GitHub Desktop.
Save chaitanyarahalkar/9212b392f2e61cf732598ab63994b37f to your computer and use it in GitHub Desktop.
Restaurant Review Sentiment Analysis - Natural Language Processing
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import requests
import os
# Fetch the NLTK stop-word corpus used when cleaning the reviews
nltk.download('stopwords')
# Download the dataset only when no local copy exists; later runs reuse the file
if not os.path.exists("Restaurant_Reviews.tsv"):
    # A timeout keeps the script from hanging forever on a stalled connection
    r = requests.get('https://media.geeksforgeeks.org/wp-content/uploads/Restaurant_Reviews.tsv', timeout=30)
    # Fail loudly on an HTTP error instead of caching an error page as the dataset
    r.raise_for_status()
    with open("Restaurant_Reviews.tsv",'wb') as f:
        f.write(r.content)
# The file is tab-separated
dataset = pd.read_csv('Restaurant_Reviews.tsv',delimiter='\t')
# Cleaning the dataset: every review becomes a space-joined string of
# lower-cased, stemmed words with the English stop words removed.
corpus = []
ps = PorterStemmer()
# Bound to a new name so the imported `stopwords` module is not overwritten
# (overwriting it makes this section fail when it is executed a second time).
stop_words = set(stopwords.words('english'))
# Compiled once and reused for every review
non_letters = re.compile('[^a-zA-Z]')
# Walk the column itself rather than a hard-coded range(0, 1000) so the
# corpus always has exactly one entry per row of the dataset.
for text in dataset['Review']:
    # Every character that is not an ASCII letter becomes a space, then the
    # text is lower-cased and split into words on whitespace.
    words = non_letters.sub(' ', text).lower().split()
    # Stop words are dropped before stemming
    corpus.append(" ".join(ps.stem(word) for word in words if word not in stop_words))
# Vectorizing the corpus data: bag-of-words counts limited to 1500 features
cv = CountVectorizer(max_features=1500)
# Preparing the dataset for training
X = cv.fit_transform(corpus).toarray()  # dense document-term count matrix
y = dataset.iloc[:,1].values  # labels are read from the second column
# Hold out 25% of the rows for testing. A fixed random_state keeps the
# train/test partition identical from run to run, so scores are comparable.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.25,random_state=0)
# Using Random Forest classifier for predictions: 501 trees, entropy criterion.
# random_state pins the forest's internal randomness so that fitting the same
# training data always produces the same model.
model = RandomForestClassifier(n_estimators=501,criterion='entropy',random_state=0)
model.fit(X_train,y_train)
# Performing predictions
y_pred = model.predict(X_test)
# Accuracy is the percentage of test rows whose predicted label equals the true label
print("Accuracy: ", (y_pred == y_test).sum() / len(y_test) * 100,"%")
# Report the confusion matrix as a labelled table:
# rows hold the actual class, columns hold the predicted class.
print("Confusion matrix:")
cm = confusion_matrix(y_test, y_pred)
df = pd.DataFrame(
    cm,
    index=["Actual No", "Actual Yes"],
    columns=["Predicted No", "Predicted Yes"],
)
print(df)
nltk
numpy
pandas
sklearn
requests
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment