chaitanyarahalkar/nlp.py

## nlp.py
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import requests
import os

# Make the NLTK stopword corpus available locally
nltk.download('stopwords')

# Download the dataset once; later runs reuse the local copy
if not os.path.exists("Restaurant_Reviews.tsv"):
	# A timeout keeps an unresponsive server from hanging the script
	r = requests.get('https://media.geeksforgeeks.org/wp-content/uploads/Restaurant_Reviews.tsv', timeout=30)
	# Fail before touching the disk: otherwise an HTTP error page would be
	# saved as the dataset and the exists() check above would keep reusing it
	r.raise_for_status()
	with open("Restaurant_Reviews.tsv",'wb') as f:
		f.write(r.content)

# The file is tab-separated
dataset = pd.read_csv('Restaurant_Reviews.tsv',delimiter='\t')

# Cleaning the dataset: each review becomes a space-joined string of
# lowercase, stemmed, non-stopword tokens

corpus = list()
ps = PorterStemmer()
# Stored under its own name so the imported `stopwords` corpus module is not
# overwritten (rebinding it to a set breaks any re-run of this section)
stop_words = set(stopwords.words('english'))

# Walk every entry of the 'Review' column instead of a hard-coded 1000 rows,
# so the corpus always has exactly one entry per dataset row
for raw_review in dataset['Review']:
	review = re.sub('[^a-zA-Z]',' ',raw_review) # Replacing every non-letter character with a space
	review = review.lower() # Converting all letters to lowercase
	review = review.split()
	review = [ps.stem(word) for word in review if word not in stop_words] # Dropping stopwords and stemming the rest

	review = " ".join(review)
	corpus.append(review)

# Bag-of-words features built from the cleaned corpus

# Vectorizer configured with max_features=1500
cv = CountVectorizer(max_features=1500)

# Fit the vectorizer on the corpus, then convert the result to a dense array
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# Targets are taken from the second column of the dataset
y = dataset.iloc[:,1].values

# Hold out 25% of the rows for evaluation
split = train_test_split(X,y,test_size = 0.25)
X_train,X_test,y_train,y_test = split

# Random forest classifier: 501 estimators, entropy split criterion
model = RandomForestClassifier(criterion='entropy',n_estimators=501)
model.fit(X_train,y_train)

# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Percentage of held-out rows whose predicted label equals the actual label
correct = (y_pred == y_test).sum()
print("Accuracy: ", correct / len(y_test) * 100,"%")

# Print the confusion matrix as a labelled table
print("Confusion matrix:")

# Display labels for the table's index (rows) and columns
rows = ["Actual No","Actual Yes"]
cols = ["Predicted No","Predicted Yes"]

cm = confusion_matrix(y_test,y_pred)
df = pd.DataFrame(cm,index=rows,columns=cols)
print(df)

## requirements.txt
nltk
numpy
pandas
scikit-learn
requests
	import numpy as np
	import pandas as pd
	import re
	import nltk
	from nltk.corpus import stopwords
	from nltk.stem.porter import PorterStemmer
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.model_selection import train_test_split
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.metrics import confusion_matrix
	import requests
	import os

	# NOTE: in this tab-indented copy of the script the bodies of the `if`,
	# `with` and `for` statements had lost their nesting; it is restored here.

	# Make the NLTK stopword corpus available locally
	nltk.download('stopwords')

	# Download the dataset once; later runs reuse the local copy
	if not os.path.exists("Restaurant_Reviews.tsv"):
		# A timeout keeps an unresponsive server from hanging the script
		r = requests.get('https://media.geeksforgeeks.org/wp-content/uploads/Restaurant_Reviews.tsv', timeout=30)
		# Fail before touching the disk: otherwise an HTTP error page would be
		# saved as the dataset and the exists() check above would keep reusing it
		r.raise_for_status()
		with open("Restaurant_Reviews.tsv",'wb') as f:
			f.write(r.content)

	# The file is tab-separated
	dataset = pd.read_csv('Restaurant_Reviews.tsv',delimiter='\t')

	# Cleaning the dataset: each review becomes a space-joined string of
	# lowercase, stemmed, non-stopword tokens

	corpus = list()
	ps = PorterStemmer()
	# Stored under its own name so the imported `stopwords` corpus module is
	# not overwritten (rebinding it to a set breaks any re-run of this section)
	stop_words = set(stopwords.words('english'))

	# Walk every entry of the 'Review' column instead of a hard-coded 1000
	# rows, so the corpus always has exactly one entry per dataset row
	for raw_review in dataset['Review']:
		review = re.sub('[^a-zA-Z]',' ',raw_review) # Replacing every non-letter character with a space
		review = review.lower() # Converting all letters to lowercase
		review = review.split()
		review = [ps.stem(word) for word in review if word not in stop_words] # Dropping stopwords and stemming the rest

		review = " ".join(review)
		corpus.append(review)

	# Vectorizing the corpus data

	cv = CountVectorizer(max_features=1500)

	# Preparing the dataset for training
	X = cv.fit_transform(corpus).toarray()

	# Targets are taken from the second column of the dataset
	y = dataset.iloc[:,1].values

	# Hold out 25% of the rows for evaluation
	X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.25)

	# Using Random Forest classifier for predictions

	model = RandomForestClassifier(n_estimators=501,criterion='entropy')

	model.fit(X_train,y_train)

	# Performing predictions
	y_pred = model.predict(X_test)

	# Percentage of held-out rows whose predicted label equals the actual label
	print("Accuracy: ", (y_pred == y_test).sum() / len(y_test) * 100,"%")

	# Generating confusion matrix
	print("Confusion matrix:")

	cols = ["Predicted No","Predicted Yes"]
	rows = ["Actual No","Actual Yes"]

	cm = confusion_matrix(y_test,y_pred)
	df = pd.DataFrame(cm,columns=cols,index=rows)
	print(df)