{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Python program for IMDB movie review classification using the k-nearest-neighbors algorithm."
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import WordNetLemmatizer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"import numpy as np\n",
"import re\n",
"\n",
"\n",
"# Read the training and test files into lists of lines\n",
"def loadData(trainingFile, testingFile):\n",
"\n",
"    with open(trainingFile, \"r\") as fr1:\n",
"        trainFile = fr1.readlines()\n",
"\n",
"    with open(testingFile, \"r\") as fr2:\n",
"        testFile = fr2.readlines()\n",
"\n",
"    # Each training line is \"<label>\\t<review>\"; split on the first tab into label and review\n",
"    train_sentiments = [x.split(\"\\t\", 1)[0] for x in trainFile]\n",
"    train_reviews = [x.split(\"\\t\", 1)[1] for x in trainFile]\n",
"\n",
"    return train_reviews, testFile, train_sentiments\n",
"\n",
"\n",
"def clean(reviews):\n",
"    \"\"\"Pre-process every review in the given list and return the list of cleaned reviews\"\"\"\n",
"\n",
"    clean_train_reviews = []\n",
"\n",
"    # Run the pre-processor on each review and collect the results\n",
"    for review in reviews:\n",
"        clean_train_reviews.append(preProcess(review))\n",
"\n",
"    return clean_train_reviews\n",
"\n",
"\n",
"def preProcess(rawReview):\n",
"    \"\"\"Convert a raw review to a cleaned string of words.\n",
"    Takes a raw movie review as a single string and returns the preprocessed review as a single string.\"\"\"\n",
"\n",
"    # 1. Remove HTML tags (naming an explicit parser avoids BeautifulSoup's 'no parser specified' warning)\n",
"    text_only = BeautifulSoup(rawReview, \"html.parser\").get_text()\n",
"\n",
"    # 2. Remove email IDs, URLs and numbers\n",
"    noEmail = re.sub(r'([\\w\\.-]+@[\\w\\.-]+\\.\\w+)', '', text_only)\n",
"\n",
"    # Adjacent raw-string literals are concatenated, which keeps stray backslash-newlines\n",
"    # and indentation out of the pattern itself\n",
"    noUrl = re.sub(r'(?i)\\b((?:[a-z][\\w-]+:(?:/{1,3}|[a-z0-9%])|www\\d{0,3}[.]|'\n",
"                   r'[a-z0-9.\\-]+[.][a-z]{2,4}/|[a-z0-9.\\-]+[.][a-z])(?:[^\\s()<>]+|\\(([^\\s()<>]+|'\n",
"                   r'(\\([^\\s()<>]+\\)))*\\))+(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|'\n",
"                   r'[^\\s`!()\\[\\]{};:\\'\".,<>?«»“”‘’]))', '', noEmail)\n",
"\n",
"    # Emoticons may affect the meaning of the review, so keep their characters when stripping\n",
"    # everything else. Inside a character class the smileys act as a plain set of characters,\n",
"    # so join them without a \"|\" separator (a \"|\" would itself be kept as a literal)\n",
"    smileys = \"\"\":-) :) :o) :D :-D :( :-( :o(\"\"\".split()\n",
"    smileyPattern = \"\".join(map(re.escape, smileys))\n",
"\n",
"    letters_only = re.sub(\"[^a-zA-Z\" + smileyPattern + \"]\", \" \", noUrl)\n",
"\n",
"    # 3. Convert to lower case and split into individual words\n",
"    words = letters_only.lower().split()\n",
"\n",
"    # 4. In Python, searching a set is much faster than searching a list, so convert the stop words to a set\n",
"    stops = set(stopwords.words(\"english\"))\n",
"\n",
"    # 5. Drop stop words and words of 3 letters or fewer, then lemmatize what remains\n",
"    lemmatizer = WordNetLemmatizer()\n",
"    lemmatized_words = ''\n",
"    for word in words:\n",
"        if word not in stops and len(word) > 3:\n",
"            lemmatized_words += lemmatizer.lemmatize(word) + ' '\n",
"\n",
"    # 6. Return the cleaned review as a single space-separated string\n",
"    return lemmatized_words.strip()\n",
"\n",
"\n",
"def createTFIDFMatrices(train_data, test_data):\n",
"    \"\"\"Takes in processed training and testing data, outputs respective L2-normalized sparse matrices with TF-IDF values\"\"\"\n",
"\n",
"    vectorizer = TfidfVectorizer(norm='l2')\n",
"\n",
"    train_matrix = vectorizer.fit_transform(train_data)\n",
"\n",
"    # Reuse the vocabulary and IDF weights learned from the training data to transform the test data\n",
"    test_matrix = vectorizer.transform(test_data)\n",
"\n",
"    return train_matrix, test_matrix\n",
"\n",
"\n",
"def findSimilarities(train_matrix, test_matrix):\n",
"    \"\"\"Takes the training and test data (both sparse matrices) and returns their pairwise cosine\n",
"    similarities as a dense numpy array, which is the fastest structure to sort over when\n",
"    finding nearest neighbors\"\"\"\n",
"\n",
"    # The rows are L2-normalized, so a plain dot product is exactly the cosine similarity\n",
"    cosineSimilarities = test_matrix.dot(train_matrix.T)\n",
"    similarities = cosineSimilarities.toarray()\n",
"\n",
"    return similarities\n",
"\n",
"\n",
"def findKNearest(similarity_vector, k):\n",
"    \"\"\"Takes the similarity vector (numpy array) and the number of neighbors k, and returns the\n",
"    indices of the k nearest neighbors. Negating the vector makes argsort order it descending,\n",
"    and argsort preserves the training-review indices so that their labels can be looked up in\n",
"    the training labels list\"\"\"\n",
"\n",
"    return np.argsort(-similarity_vector)[:k]\n",
"\n",
"\n",
"def predict(nearestNeighbors, labels):\n",
"    \"\"\"Takes the indices of the k nearest neighbors and the full training labels list, and counts\n",
"    the positive and negative labels among those neighbors. The majority label is the prediction\"\"\"\n",
"\n",
"    positiveReviewsCount = 0\n",
"    negativeReviewsCount = 0\n",
"    for neighbor in nearestNeighbors:\n",
"        if int(labels[neighbor]) == 1:\n",
"            positiveReviewsCount += 1\n",
"        else:\n",
"            negativeReviewsCount += 1\n",
"\n",
"    # Majority vote; an exact tie (possible when k is even) defaults to negative\n",
"    if positiveReviewsCount > negativeReviewsCount:\n",
"        return 1\n",
"    else:\n",
"        return -1"
]
},
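{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A quick, optional sanity check of the cleaning pipeline on a made-up review (not from the\n",
"# dataset). The HTML tag and URL should be removed, stop words and words of 3 letters or\n",
"# fewer dropped, and plurals lemmatized ('movies' -> 'movie').\n",
"sample = \"I <b>loved</b> these movies! See http://example.com for more details.\"\n",
"print(preProcess(sample))"
]
},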
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [],
"source": [
"#Read the training and the test data set and get 3 separate lists of training reviews, test reviews and training labels\n",
"train_reviews, test_reviews, train_sentiments = loadData('train.dat', 'test.dat')\n",
"\n",
"#Pre-process both the training and the test data set\n",
"train_reviews = clean(train_reviews)\n",
"test_reviews = clean(test_reviews)\n",
"\n",
"train_matrix, test_matrix = createTFIDFMatrices(train_reviews, test_reviews)"
]
},
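{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional check, not part of the original pipeline: both matrices share the training\n",
"# vocabulary as their column dimension, and every non-empty row has unit L2 norm, which is\n",
"# what lets findSimilarities use a plain dot product as the cosine similarity.\n",
"# Assumes a scipy version that provides scipy.sparse.linalg.norm (0.16+).\n",
"from scipy.sparse.linalg import norm as sparse_norm\n",
"\n",
"print(train_matrix.shape)\n",
"print(test_matrix.shape)\n",
"print(sparse_norm(train_matrix[0]))  # expected: ~1.0"
]
},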
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [],
"source": [
"similarities = findSimilarities(train_matrix, test_matrix)"
]
},
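{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional check: similarities has one row per test review and one column per training review.\n",
"print(similarities.shape)\n",
"\n",
"# Toy illustration of the neighbor search and the majority vote (values invented here):\n",
"toy_similarity = np.array([0.1, 0.9, 0.3, 0.8])\n",
"toy_labels = ['-1', '+1', '-1', '+1']\n",
"toy_nn = findKNearest(toy_similarity, 3)\n",
"print(toy_nn)                       # descending similarity -> indices [1, 3, 2]\n",
"print(predict(toy_nn, toy_labels))  # two '+1' neighbors vs one '-1' -> 1"
]
},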
{
"cell_type": "code",
"execution_count": 116,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Predict the sentiment of each test review from its row of cosine similarities\n",
"\n",
"k = 300\n",
"test_sentiments = list()\n",
"\n",
"for similarity in similarities:\n",
"    knn = findKNearest(similarity, k)\n",
"    prediction = predict(knn, train_sentiments)\n",
"\n",
"    # Store the label as the string '+1' (rather than a bare '1') for positive reviews\n",
"    if prediction == 1:\n",
"        test_sentiments.append('+1')\n",
"    else:\n",
"        test_sentiments.append('-1')"
]
},
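{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# k = 300 above is a fixed choice that this notebook does not tune. Below is a minimal sketch\n",
"# of how candidate values of k could be compared on a held-out slice of the training data,\n",
"# reusing the functions defined earlier; illustrative only, not part of the original pipeline.\n",
"split = int(0.8 * len(train_reviews))\n",
"sub_train, sub_valid = train_reviews[:split], train_reviews[split:]\n",
"sub_train_labels, sub_valid_labels = train_sentiments[:split], train_sentiments[split:]\n",
"\n",
"sub_train_matrix, sub_valid_matrix = createTFIDFMatrices(sub_train, sub_valid)\n",
"sub_similarities = findSimilarities(sub_train_matrix, sub_valid_matrix)\n",
"\n",
"for candidate_k in (50, 150, 300, 500):\n",
"    correct = 0\n",
"    for row, true_label in zip(sub_similarities, sub_valid_labels):\n",
"        if predict(findKNearest(row, candidate_k), sub_train_labels) == int(true_label):\n",
"            correct += 1\n",
"    print(\"k=%d  accuracy=%.4f\" % (candidate_k, correct / float(len(sub_valid_labels))))"
]
},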
{
"cell_type": "code",
"execution_count": 117,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Write the result to a .dat file, one sentiment label per line\n",
"with open('output-k-300.dat', 'w') as output:\n",
"    output.writelines(\"%s\\n\" % item for item in test_sentiments)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}