Last active
December 12, 2017 00:22
-
-
Save sowmyagowri/da437eddacdf428c92a53bc78bb25e56 to your computer and use it in GitHub Desktop.
Python program for IMDB Movie Review Classification using k nearest neighbor algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 118, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from bs4 import BeautifulSoup\n", | |
"from nltk.corpus import stopwords\n", | |
"from nltk.stem import WordNetLemmatizer\n", | |
"from sklearn.feature_extraction.text import TfidfVectorizer\n", | |
"import numpy as np\n", | |
"import re\n", | |
"\n", | |
"\n", | |
"#Read the input files and read every line\n", | |
"def loadData(trainingFile, testingFile):\n", | |
" \n", | |
" with open(trainingFile, \"r\") as fr1:\n", | |
" trainFile = fr1.readlines()\n", | |
" \n", | |
" with open(testingFile, \"r\") as fr2:\n", | |
" testFile = fr2.readlines()\n", | |
" \n", | |
" #Split each line in the two files into reviews and labels \n", | |
" train_sentiments_t = [x.split(\"\\t\", 1)[0] for x in trainFile]\n", | |
" train_reviews_t = [x.split(\"\\t\", 1)[1] for x in trainFile]\n", | |
" \n", | |
" return train_reviews, testFile, train_sentiments\n", | |
"\n", | |
"\n", | |
"def clean(reviews):\n", | |
" \n", | |
" \"\"\"Initialize an empty list to hold the clean reviews\"\"\"\n", | |
" clean_train_reviews = []\n", | |
"\n", | |
" # Loop over each review in the list\n", | |
" for index, review in enumerate(reviews):\n", | |
" # Call the pre processer for each review, and add the result to the list of clean reviews\n", | |
" clean_train_reviews.append(preProcess(review))\n", | |
" \n", | |
" return clean_train_reviews\n", | |
" \n", | |
"def preProcess(rawReview):\n", | |
"\n", | |
" \"\"\"Function to convert a raw review to a string of words\n", | |
" Takes in a raw movie review as a single string to output a preprocessed movie review as a single string\"\"\"\n", | |
" \n", | |
" # 1. Remove HTML tags\n", | |
" text_only = BeautifulSoup(rawReview).get_text()\n", | |
" #\n", | |
" # 2. Remove Email IDs, URLs and numbers\n", | |
" noEmail = re.sub(r'([\\w\\.-]+@[\\w\\.-]+\\.\\w+)','',text_only)\n", | |
" \n", | |
" noUrl = re.sub(r'(?i)\\b((?:[a-z][\\w-]+:(?:/{1,3}|[a-z0-9%])|www\\d{0,3}[.]| \\\n", | |
" [a-z0-9.\\-]+[.][a-z]{2,4}/|[a-z0-9.\\-]+[.][a-z])(?:[^\\s()<>]+|\\(([^\\s()<>]+| \\\n", | |
" (\\([^\\s()<>]+\\)))*\\))+(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:\\'\".,<>?«»“”‘’]))','', noEmail)\n", | |
" \n", | |
" #Emotional symbols may affect the meaning of the review\n", | |
" smileys = \"\"\":-) :) :o) :D :-D :( :-( :o(\"\"\".split()\n", | |
" smileyPattern = \"|\".join(map(re.escape, smileys))\n", | |
" \n", | |
" letters_only = re.sub(\"[^a-zA-Z\" + smileyPattern + \"]\", \" \", noUrl)\n", | |
" #\n", | |
" # 3. Convert to lower case and split into individual words\n", | |
" words = letters_only.lower().split() \n", | |
" #\n", | |
" # 4. In Python, searching a set is much faster than searching a list, so convert the stop words to a set\n", | |
" stops = set(stopwords.words(\"english\")) \n", | |
" # \n", | |
" # 5. Remove stop words and also 3-letter words and Lemmatize the review\n", | |
" lemmatizer = WordNetLemmatizer()\n", | |
" lemmatized_words = ''\n", | |
" for word in words:\n", | |
" if word not in stops and len(word) > 3:\n", | |
" #if len(word) > 3:\n", | |
" lemmatized_words += str(lemmatizer.lemmatize(word)) + ' '\n", | |
" #\n", | |
" # 6. Join the words back into one string separated by space and return the result.\n", | |
" return lemmatized_words\n", | |
"\n", | |
"def createTFIDFMatrices(train_data, test_data):\n", | |
" \"\"\"Takes in processed training and testing data, outputs respective L2-normalized sparse matrices with TF-IDF values\"\"\"\n", | |
" \n", | |
" vectorizer = TfidfVectorizer(norm = 'l2')\n", | |
" \n", | |
" train_matrix = vectorizer.fit_transform(train_data)\n", | |
" \n", | |
" #parameters generated from fit() method on train data applied upon model to generate transformed data set of test data\n", | |
" test_matrix = vectorizer.transform(test_data)\n", | |
"\n", | |
" return train_matrix, test_matrix\n", | |
"\n", | |
"def findSimilarities(train_matrix, test_matrix):\n", | |
" \"\"\"Takes in the entire training data and the testing data (both sparse matrices) and \n", | |
" gives the cosine similarity between the two as a numpy array.\n", | |
" Numpy arrays are fastest to work with for sorting while finding nearest neighbors\"\"\"\n", | |
" \n", | |
" cosineSimilarities = np.dot(test_matrix, np.transpose(train_matrix))\n", | |
" similarities = cosineSimilarities.toarray()\n", | |
" \n", | |
" return similarities\n", | |
"\n", | |
"def findKNearest(similarity_vector, k):\n", | |
" \"\"\"Takes in the similarity vector (numpy array) and number of neighbors to find, to return the K Nearest Neighbors indices.\n", | |
" The input array gets sorted in descending order and the first k indices returned.\n", | |
" The argsort function has been used to preserve the indices of the training reviews so that their respective labels\n", | |
" can be easily referenced in the training labels list\"\"\"\n", | |
" \n", | |
" return np.argsort(-similarity_vector)[:k]\n", | |
" \n", | |
"\n", | |
"def predict(nearestNeighbors, labels):\n", | |
" \"\"\"Takes in the list of K nearest Neighbors and the full training labels list, and \n", | |
" calculates the count of positive and negative reviews. \n", | |
" If positive reviews are more, then the test review is positive and vice-versa\"\"\"\n", | |
" \n", | |
" positiveReviewsCount = 0\n", | |
" negativeReviewsCount = 0\n", | |
" for neighbor in nearestNeighbors:\n", | |
" if int(labels[neighbor]) == 1:\n", | |
" positiveReviewsCount += 1\n", | |
" else:\n", | |
" negativeReviewsCount += 1\n", | |
" if positiveReviewsCount > negativeReviewsCount:\n", | |
" return 1\n", | |
" else:\n", | |
" return -1" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 101, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"#Read the training and the test data set and get 3 separate lists of training reviews, test reviews and training labels\n", | |
"train_reviews, test_reviews, train_sentiments = loadData('train.dat', 'test.dat')\n", | |
"\n", | |
"#Pre-process both the training and the test data set\n", | |
"train_reviews = clean(train_reviews)\n", | |
"test_reviews = clean(test_reviews)\n", | |
"\n", | |
"train_matrix, test_matrix = createTFIDFMatrices(train_reviews, test_reviews)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 102, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"similarities = findSimilarities(train_matrix, test_matrix)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 116, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#Pass every row in the numpy array of similarities to predict the sentiment of every review\n", | |
"\n", | |
"k = 300\n", | |
"test_sentiments = list()\n", | |
"\n", | |
"for similarity in similarities:\n", | |
" knn = findKNearest(similarity, k)\n", | |
" prediction = predict(knn, train_sentiments)\n", | |
" \n", | |
" #To write to the list as +1 instead of just a 1 for positive reviews\n", | |
" if prediction == 1:\n", | |
" test_sentiments.append('+1')\n", | |
" else:\n", | |
" test_sentiments.append('-1')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 117, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"#Write the result to a .dat file\n", | |
"output = open('output-k-300.dat', 'w')\n", | |
"\n", | |
"output.writelines( \"%s\\n\" % item for item in test_sentiments )\n", | |
"\n", | |
"output.close()" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.13" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment