{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Python program for IMDB movie review classification using the k-nearest-neighbors algorithm."
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"import numpy as np\n",
"import re\n",
"\n",
"\n",
"# Read the training and test files into lists of lines\n",
"def loadData(trainingFile, testingFile):\n",
"\n",
"    with open(trainingFile, \"r\") as fr1:\n",
"        trainFile = fr1.readlines()\n",
"\n",
"    with open(testingFile, \"r\") as fr2:\n",
"        testFile = fr2.readlines()\n",
"\n",
"    # Each training line is \"<label>\\t<review>\"; split on the first tab into label and review\n",
"    train_sentiments = [x.split(\"\\t\", 1)[0] for x in trainFile]\n",
"    train_reviews = [x.split(\"\\t\", 1)[1] for x in trainFile]\n",
"\n",
"    return train_reviews, testFile, train_sentiments\n",
"\n",
"\n",
"def clean(reviews):\n",
"    \"\"\"Pre-process every review in the given list and return the list of cleaned reviews\"\"\"\n",
"\n",
"    clean_train_reviews = []\n",
"\n",
"    # Run the pre-processor on each review and collect the results\n",
"    for review in reviews:\n",
"        clean_train_reviews.append(preProcess(review))\n",
"\n",
"    return clean_train_reviews\n",
"\n",
"\n",
"def preProcess(rawReview):\n",
"    \"\"\"Convert a raw review to a cleaned string of words.\n",
"    Takes a raw movie review as a single string and returns the preprocessed review as a single string.\"\"\"\n",
"\n",
"    # 1. Remove HTML tags (naming an explicit parser avoids BeautifulSoup's 'no parser specified' warning)\n",
"    text_only = BeautifulSoup(rawReview, \"html.parser\").get_text()\n",
"\n",
"    # 2. Remove email IDs, URLs and numbers\n",
"    noEmail = re.sub(r'([\\w\\.-]+@[\\w\\.-]+\\.\\w+)', '', text_only)\n",
"\n",
"    # Adjacent raw-string literals are concatenated, which keeps stray backslash-newlines\n",
"    # and indentation out of the pattern itself\n",
"    noUrl = re.sub(r'(?i)\\b((?:[a-z][\\w-]+:(?:/{1,3}|[a-z0-9%])|www\\d{0,3}[.]|'\n",
"                   r'[a-z0-9.\\-]+[.][a-z]{2,4}/|[a-z0-9.\\-]+[.][a-z])(?:[^\\s()<>]+|\\(([^\\s()<>]+|'\n",
"                   r'(\\([^\\s()<>]+\\)))*\\))+(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|'\n",
"                   r'[^\\s`!()\\[\\]{};:\\'\".,<>?«»“”‘’]))', '', noEmail)\n",
"\n",
"    # Emoticons may affect the meaning of the review, so keep their characters when stripping\n",
"    # everything else. Inside a character class the smileys act as a plain set of characters,\n",
"    # so join them without a \"|\" separator (a \"|\" would itself be kept as a literal)\n",
"    smileys = \"\"\":-) :) :o) :D :-D :( :-( :o(\"\"\".split()\n",
"    smileyPattern = \"\".join(map(re.escape, smileys))\n",
"\n",
"    letters_only = re.sub(\"[^a-zA-Z\" + smileyPattern + \"]\", \" \", noUrl)\n",
"\n",
"    # 3. Convert to lower case and split into individual words\n",
"    words = letters_only.lower().split()\n",
"\n",
"    # 4. In Python, searching a set is much faster than searching a list, so convert the stop words to a set\n",
"    stops = set(stopwords.words(\"english\"))\n",
"\n",
"    # 5. Drop stop words and words of 3 letters or fewer, then lemmatize what remains\n",
"    lemmatizer = WordNetLemmatizer()\n",
"    lemmatized_words = ''\n",
"    for word in words:\n",
"        if word not in stops and len(word) > 3:\n",
"            lemmatized_words += lemmatizer.lemmatize(word) + ' '\n",
"\n",
"    # 6. Return the cleaned review as a single space-separated string\n",
"    return lemmatized_words.strip()\n",
"\n",
"\n",
"def createTFIDFMatrices(train_data, test_data):\n",
"    \"\"\"Takes in processed training and testing data, outputs respective L2-normalized sparse matrices with TF-IDF values\"\"\"\n",
"\n",
"    vectorizer = TfidfVectorizer(norm='l2')\n",
"\n",
"    train_matrix = vectorizer.fit_transform(train_data)\n",
"\n",
"    # Reuse the vocabulary and IDF weights learned from the training data to transform the test data\n",
"    test_matrix = vectorizer.transform(test_data)\n",
"\n",
"    return train_matrix, test_matrix\n",
"\n",
"\n",
"def findSimilarities(train_matrix, test_matrix):\n",
"    \"\"\"Takes the training and test data (both sparse matrices) and returns their pairwise cosine\n",
"    similarities as a dense numpy array, which is the fastest structure to sort over when\n",
"    finding nearest neighbors\"\"\"\n",
"\n",
"    # The rows are L2-normalized, so a plain dot product is exactly the cosine similarity\n",
"    cosineSimilarities = test_matrix.dot(train_matrix.T)\n",
"    similarities = cosineSimilarities.toarray()\n",
"\n",
"    return similarities\n",
"\n",
"\n",
"def findKNearest(similarity_vector, k):\n",
"    \"\"\"Takes the similarity vector (numpy array) and the number of neighbors k, and returns the\n",
"    indices of the k nearest neighbors. Negating the vector makes argsort order it descending,\n",
"    and argsort preserves the training-review indices so that their labels can be looked up in\n",
"    the training labels list\"\"\"\n",
"\n",
"    return np.argsort(-similarity_vector)[:k]\n",
"\n",
"\n",
"def predict(nearestNeighbors, labels):\n",
"    \"\"\"Takes the indices of the k nearest neighbors and the full training labels list, and counts\n",
"    the positive and negative labels among those neighbors. The majority label is the prediction\"\"\"\n",
"\n",
"    positiveReviewsCount = 0\n",
"    negativeReviewsCount = 0\n",
"    for neighbor in nearestNeighbors:\n",
"        if int(labels[neighbor]) == 1:\n",
"            positiveReviewsCount += 1\n",
"        else:\n",
"            negativeReviewsCount += 1\n",
"\n",
"    # Majority vote; an exact tie (possible when k is even) defaults to negative\n",
"    if positiveReviewsCount > negativeReviewsCount:\n",
"        return 1\n",
"    else:\n",
"        return -1"
]
},
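{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A quick, optional sanity check of the cleaning pipeline on a made-up review (not from the\n",
"# dataset). The HTML tag and URL should be removed, stop words and words of 3 letters or\n",
"# fewer dropped, and plurals lemmatized ('movies' -> 'movie').\n",
"sample = \"I <b>loved</b> these movies! See http://example.com for more details.\"\n",
"print(preProcess(sample))"
]
},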
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [],
"source": [
"#Read the training and the test data set and get 3 separate lists of training reviews, test reviews and training labels\n",
"train_reviews, test_reviews, train_sentiments = loadData('train.dat', 'test.dat')\n",
"\n",
"#Pre-process both the training and the test data set\n",
"train_reviews = clean(train_reviews)\n",
"test_reviews = clean(test_reviews)\n",
"\n",
"train_matrix, test_matrix = createTFIDFMatrices(train_reviews, test_reviews)"
]
},
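{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional check, not part of the original pipeline: both matrices share the training\n",
"# vocabulary as their column dimension, and every non-empty row has unit L2 norm, which is\n",
"# what lets findSimilarities use a plain dot product as the cosine similarity.\n",
"# Assumes a scipy version that provides scipy.sparse.linalg.norm (0.16+).\n",
"from scipy.sparse.linalg import norm as sparse_norm\n",
"\n",
"print(train_matrix.shape)\n",
"print(test_matrix.shape)\n",
"print(sparse_norm(train_matrix[0]))  # expected: ~1.0"
]
},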
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [],
"source": [
"similarities = findSimilarities(train_matrix, test_matrix)"
]
},
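{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional check: similarities has one row per test review and one column per training review.\n",
"print(similarities.shape)\n",
"\n",
"# Toy illustration of the neighbor search and the majority vote (values invented here):\n",
"toy_similarity = np.array([0.1, 0.9, 0.3, 0.8])\n",
"toy_labels = ['-1', '+1', '-1', '+1']\n",
"toy_nn = findKNearest(toy_similarity, 3)\n",
"print(toy_nn)                       # descending similarity -> indices [1, 3, 2]\n",
"print(predict(toy_nn, toy_labels))  # two '+1' neighbors vs one '-1' -> 1"
]
},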
{
"cell_type": "code",
"execution_count": 116,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Predict the sentiment of each test review from its row of cosine similarities\n",
"\n",
"k = 300\n",
"test_sentiments = list()\n",
"\n",
"for similarity in similarities:\n",
"    knn = findKNearest(similarity, k)\n",
"    prediction = predict(knn, train_sentiments)\n",
"\n",
"    # Store the label as the string '+1' (rather than a bare '1') for positive reviews\n",
"    if prediction == 1:\n",
"        test_sentiments.append('+1')\n",
"    else:\n",
"        test_sentiments.append('-1')"
]
},
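{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# k = 300 above is a fixed choice that this notebook does not tune. Below is a minimal sketch\n",
"# of how candidate values of k could be compared on a held-out slice of the training data,\n",
"# reusing the functions defined earlier; illustrative only, not part of the original pipeline.\n",
"split = int(0.8 * len(train_reviews))\n",
"sub_train, sub_valid = train_reviews[:split], train_reviews[split:]\n",
"sub_train_labels, sub_valid_labels = train_sentiments[:split], train_sentiments[split:]\n",
"\n",
"sub_train_matrix, sub_valid_matrix = createTFIDFMatrices(sub_train, sub_valid)\n",
"sub_similarities = findSimilarities(sub_train_matrix, sub_valid_matrix)\n",
"\n",
"for candidate_k in (50, 150, 300, 500):\n",
"    correct = 0\n",
"    for row, true_label in zip(sub_similarities, sub_valid_labels):\n",
"        if predict(findKNearest(row, candidate_k), sub_train_labels) == int(true_label):\n",
"            correct += 1\n",
"    print(\"k=%d  accuracy=%.4f\" % (candidate_k, correct / float(len(sub_valid_labels))))"
]
},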
{
"cell_type": "code",
"execution_count": 117,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Write the result to a .dat file, one sentiment label per line\n",
"with open('output-k-300.dat', 'w') as output:\n",
"    output.writelines(\"%s\\n\" % item for item in test_sentiments)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}