{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
},
"colab": {
"name": "3.vectorization.ipynb",
"provenance": [],
"include_colab_link": true
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/ravigurnatham/5179f256aed1b6f27eade572839891d5/3-vectorization.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "YfDYuwsvS26a"
},
"source": [
"<h2> 3.6 Featurizing text data with tfidf weighted word-vectors </h2>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "4B-dMXJ9S26c"
},
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import re\n",
"import time\n",
"import warnings\n",
"import numpy as np\n",
"from nltk.corpus import stopwords\n",
"from sklearn.preprocessing import normalize\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"warnings.filterwarnings(\"ignore\")\n",
"import sys\n",
"import os\n",
"from tqdm import tqdm\n"
],
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"collapsed": true,
"id": "tX6sJQFqS26e"
},
"source": [
"# read the data and force both question columns to strings (avoids decoding/NaN problems)\n",
"df = pd.read_csv(\"train.csv\")\n",
"df['question1'] = df['question1'].apply(lambda x: str(x))\n",
"df['question2'] = df['question2'].apply(lambda x: str(x))"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "pRoi7zUdS26e",
"outputId": "43d4097f-7078-4e54-c28e-6a842c6706e4"
},
"source": [
"df.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>qid1</th>\n",
" <th>qid2</th>\n",
" <th>question1</th>\n",
" <th>question2</th>\n",
" <th>is_duplicate</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>What is the step by step guide to invest in sh...</td>\n",
" <td>What is the step by step guide to invest in sh...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>What is the story of Kohinoor (Koh-i-Noor) Dia...</td>\n",
" <td>What would happen if the Indian government sto...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>How can I increase the speed of my internet co...</td>\n",
" <td>How can Internet speed be increased by hacking...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>8</td>\n",
" <td>Why am I mentally very lonely? How can I solve...</td>\n",
" <td>Find the remainder when [math]23^{24}[/math] i...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>9</td>\n",
" <td>10</td>\n",
" <td>Which one dissolve in water quikly sugar, salt...</td>\n",
" <td>Which fish would survive in salt water?</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id qid1 qid2 question1 \\\n",
"0 0 1 2 What is the step by step guide to invest in sh... \n",
"1 1 3 4 What is the story of Kohinoor (Koh-i-Noor) Dia... \n",
"2 2 5 6 How can I increase the speed of my internet co... \n",
"3 3 7 8 Why am I mentally very lonely? How can I solve... \n",
"4 4 9 10 Which one dissolve in water quikly sugar, salt... \n",
"\n",
" question2 is_duplicate \n",
"0 What is the step by step guide to invest in sh... 0 \n",
"1 What would happen if the Indian government sto... 0 \n",
"2 How can Internet speed be increased by hacking... 0 \n",
"3 Find the remainder when [math]23^{24}[/math] i... 0 \n",
"4 Which fish would survive in salt water? 0 "
]
},
"metadata": {
"tags": []
},
"execution_count": 3
}
]
},
{
"cell_type": "code",
"metadata": {
"collapsed": true,
"id": "8W9yl5B_S26f"
},
"source": [
"# merge the two question columns into a single corpus\n",
"questions = list(df['question1']) + list(df['question2'])\n",
"\n",
"tfidf = TfidfVectorizer(lowercase=False)\n",
"tfidf.fit_transform(questions)\n",
"\n",
"# dict with key: word and value: its idf weight (tfidf.idf_ stores idf, not the full tf-idf score)\n",
"word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))"
],
"execution_count": null,
"outputs": []
},
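{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (a sketch added for illustration, not part of the original pipeline): the values stored in `word2tfidf` are the IDF weights that sklearn computes with its default smoothed IDF, idf(t) = ln((1 + n) / (1 + df(t))) + 1, so very common words get small weights and rare words get large ones."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Sketch: peek at the smallest and largest idf weights in word2tfidf.\n",
"# Low idf = very common word, high idf = rare word.\n",
"sorted_by_idf = sorted(word2tfidf.items(), key=lambda kv: kv[1])\n",
"print('most common terms (lowest idf):', sorted_by_idf[:5])\n",
"print('rarest terms (highest idf)    :', sorted_by_idf[-5:])"
],
"execution_count": null,
"outputs": []
},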
{
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"id": "HRb86OXfS26f"
},
"source": [
"- After computing the TF-IDF scores, we convert each question into a weighted average of its word2vec vectors, using these scores as weights (this computation is sketched in the cell below).\n",
"- Here we use pre-trained GloVe vectors, which come free with spaCy: https://spacy.io/usage/vectors-similarity\n",
"- They are trained on Wikipedia and are therefore strong in terms of word semantics."
]
},
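{
"cell_type": "markdown",
"metadata": {},
"source": [
"The helper below is a small sketch of the computation performed in the two loops that follow (the function name `tfidf_weighted_vector` is illustrative and not part of the original pipeline): each token vector is scaled by its idf weight and the scaled vectors are summed. Note that, because every row of the `np.zeros([len(doc), 384])` matrix in the loops receives the same update, the final `.mean(axis=0)` simply returns that idf-weighted sum."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"def tfidf_weighted_vector(question, nlp_model, word2weight, dim=384):\n",
"    # Sketch of the per-question feature built below: sum of idf-weighted\n",
"    # spaCy token vectors; out-of-vocabulary words contribute nothing.\n",
"    # dim should match the spaCy model's vector size (384 in this notebook).\n",
"    doc = nlp_model(question)\n",
"    vec = np.zeros(dim)\n",
"    for token in doc:\n",
"        vec += token.vector * word2weight.get(str(token), 0)\n",
"    return vec\n",
"\n",
"# example usage, once nlp is loaded in the next cell:\n",
"# tfidf_weighted_vector(df['question1'][0], nlp, word2tfidf)"
],
"execution_count": null,
"outputs": []
},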
{
"cell_type": "code",
"metadata": {
"id": "eVBNQy_QS26g",
"outputId": "8dfbbab2-abd1-4fc2-8623-c8684aeed7d9"
},
"source": [
"# (the larger en_vectors_web_lg model includes over 1 million unique vectors;\n",
"# here the small en_core_web_sm model is loaded)\n",
"import spacy\n",
"nlp = spacy.load('en_core_web_sm')\n",
"\n",
"vecs1 = []\n",
"# tqdm is used to print the progress bar\n",
"for qu1 in tqdm(list(df['question1'])):\n",
"    doc1 = nlp(qu1)\n",
"    # 384 is the dimensionality of the word vectors\n",
"    mean_vec1 = np.zeros([len(doc1), 384])\n",
"    for word1 in doc1:\n",
"        # spaCy word vector\n",
"        vec1 = word1.vector\n",
"        # fetch the idf weight (0 if the word is not in the tf-idf vocabulary)\n",
"        try:\n",
"            idf = word2tfidf[str(word1)]\n",
"        except KeyError:\n",
"            idf = 0\n",
"        # accumulate the idf-weighted vector\n",
"        mean_vec1 += vec1 * idf\n",
"    mean_vec1 = mean_vec1.mean(axis=0)\n",
"    vecs1.append(mean_vec1)\n",
"df['q1_feats_m'] = list(vecs1)\n"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████| 404290/404290 [2:13:51<00:00, 50.34it/s]\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "-OYMtpmuS26h",
"outputId": "55b15299-1870-42dc-f89b-78052210dd6d"
},
"source": [
"vecs2 = []\n",
"for qu2 in tqdm(list(df['question2'])):\n",
"    doc2 = nlp(qu2)\n",
"    mean_vec2 = np.zeros([len(doc2), 384])\n",
"    for word2 in doc2:\n",
"        # spaCy word vector\n",
"        vec2 = word2.vector\n",
"        # fetch the idf weight (0 if the word is not in the tf-idf vocabulary)\n",
"        try:\n",
"            idf = word2tfidf[str(word2)]\n",
"        except KeyError:\n",
"            idf = 0\n",
"        # accumulate the idf-weighted vector\n",
"        mean_vec2 += vec2 * idf\n",
"    mean_vec2 = mean_vec2.mean(axis=0)\n",
"    vecs2.append(mean_vec2)\n",
"df['q2_feats_m'] = list(vecs2)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████| 404290/404290 [1:47:52<00:00, 62.46it/s]\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"collapsed": true,
"id": "66CW0XN1S26i"
},
"source": [
"# prepro_features_train.csv (simple preprocessing features)\n",
"# nlp_features_train.csv (NLP features)\n",
"if os.path.isfile('nlp_features_train.csv'):\n",
"    dfnlp = pd.read_csv(\"nlp_features_train.csv\", encoding='latin-1')\n",
"else:\n",
"    print(\"download nlp_features_train.csv from drive or run the previous notebook\")\n",
"\n",
"if os.path.isfile('df_fe_without_preprocessing_train.csv'):\n",
"    dfppro = pd.read_csv(\"df_fe_without_preprocessing_train.csv\", encoding='latin-1')\n",
"else:\n",
"    print(\"download df_fe_without_preprocessing_train.csv from drive or run the previous notebook\")"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"collapsed": true,
"id": "w4Wc7VAjS26i"
},
"source": [
"df1 = dfnlp.drop(['qid1','qid2','question1','question2'], axis=1)\n",
"df2 = dfppro.drop(['qid1','qid2','question1','question2','is_duplicate'], axis=1)\n",
"df3 = df.drop(['qid1','qid2','question1','question2','is_duplicate'], axis=1)\n",
"df3_q1 = pd.DataFrame(df3.q1_feats_m.values.tolist(), index=df3.index)\n",
"df3_q2 = pd.DataFrame(df3.q2_feats_m.values.tolist(), index=df3.index)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "aBcKroT0S26j",
"outputId": "1728b17b-9e91-4cde-edab-76372ade97d1"
},
"source": [
"# dataframe of nlp features\n",
"df1.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>is_duplicate</th>\n",
" <th>cwc_min</th>\n",
" <th>cwc_max</th>\n",
" <th>csc_min</th>\n",
" <th>csc_max</th>\n",
" <th>ctc_min</th>\n",
" <th>ctc_max</th>\n",
" <th>last_word_eq</th>\n",
" <th>first_word_eq</th>\n",
" <th>abs_len_diff</th>\n",
" <th>mean_len</th>\n",
" <th>token_set_ratio</th>\n",
" <th>token_sort_ratio</th>\n",
" <th>fuzz_ratio</th>\n",
" <th>fuzz_partial_ratio</th>\n",
" <th>longest_substr_ratio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.999980</td>\n",
" <td>0.833319</td>\n",
" <td>0.999983</td>\n",
" <td>0.999983</td>\n",
" <td>0.916659</td>\n",
" <td>0.785709</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>13.0</td>\n",
" <td>100</td>\n",
" <td>93</td>\n",
" <td>93</td>\n",
" <td>100</td>\n",
" <td>0.982759</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.799984</td>\n",
" <td>0.399996</td>\n",
" <td>0.749981</td>\n",
" <td>0.599988</td>\n",
" <td>0.699993</td>\n",
" <td>0.466664</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>5.0</td>\n",
" <td>12.5</td>\n",
" <td>86</td>\n",
" <td>63</td>\n",
" <td>66</td>\n",
" <td>75</td>\n",
" <td>0.596154</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0.399992</td>\n",
" <td>0.333328</td>\n",
" <td>0.399992</td>\n",
" <td>0.249997</td>\n",
" <td>0.399996</td>\n",
" <td>0.285712</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>4.0</td>\n",
" <td>12.0</td>\n",
" <td>66</td>\n",
" <td>66</td>\n",
" <td>54</td>\n",
" <td>54</td>\n",
" <td>0.166667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>12.0</td>\n",
" <td>36</td>\n",
" <td>36</td>\n",
" <td>35</td>\n",
" <td>40</td>\n",
" <td>0.039216</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0.399992</td>\n",
" <td>0.199998</td>\n",
" <td>0.999950</td>\n",
" <td>0.666644</td>\n",
" <td>0.571420</td>\n",
" <td>0.307690</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>6.0</td>\n",
" <td>10.0</td>\n",
" <td>67</td>\n",
" <td>47</td>\n",
" <td>46</td>\n",
" <td>56</td>\n",
" <td>0.175000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id is_duplicate cwc_min cwc_max csc_min csc_max ctc_min \\\n",
"0 0 0 0.999980 0.833319 0.999983 0.999983 0.916659 \n",
"1 1 0 0.799984 0.399996 0.749981 0.599988 0.699993 \n",
"2 2 0 0.399992 0.333328 0.399992 0.249997 0.399996 \n",
"3 3 0 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"4 4 0 0.399992 0.199998 0.999950 0.666644 0.571420 \n",
"\n",
" ctc_max last_word_eq first_word_eq abs_len_diff mean_len \\\n",
"0 0.785709 0.0 1.0 2.0 13.0 \n",
"1 0.466664 0.0 1.0 5.0 12.5 \n",
"2 0.285712 0.0 1.0 4.0 12.0 \n",
"3 0.000000 0.0 0.0 2.0 12.0 \n",
"4 0.307690 0.0 1.0 6.0 10.0 \n",
"\n",
" token_set_ratio token_sort_ratio fuzz_ratio fuzz_partial_ratio \\\n",
"0 100 93 93 100 \n",
"1 86 63 66 75 \n",
"2 66 66 54 54 \n",
"3 36 36 35 40 \n",
"4 67 47 46 56 \n",
"\n",
" longest_substr_ratio \n",
"0 0.982759 \n",
"1 0.596154 \n",
"2 0.166667 \n",
"3 0.039216 \n",
"4 0.175000 "
]
},
"metadata": {
"tags": []
},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "uiY5w5HfS26j",
"outputId": "342f7234-939a-4f4c-9639-d6ff5a7140f8"
},
"source": [
"# data before preprocessing \n",
"df2.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>freq_qid1</th>\n",
" <th>freq_qid2</th>\n",
" <th>q1len</th>\n",
" <th>q2len</th>\n",
" <th>q1_n_words</th>\n",
" <th>q2_n_words</th>\n",
" <th>word_Common</th>\n",
" <th>word_Total</th>\n",
" <th>word_share</th>\n",
" <th>freq_q1+q2</th>\n",
" <th>freq_q1-q2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>66</td>\n",
" <td>57</td>\n",
" <td>14</td>\n",
" <td>12</td>\n",
" <td>10.0</td>\n",
" <td>23.0</td>\n",
" <td>0.434783</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>51</td>\n",
" <td>88</td>\n",
" <td>8</td>\n",
" <td>13</td>\n",
" <td>4.0</td>\n",
" <td>20.0</td>\n",
" <td>0.200000</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>73</td>\n",
" <td>59</td>\n",
" <td>14</td>\n",
" <td>10</td>\n",
" <td>4.0</td>\n",
" <td>24.0</td>\n",
" <td>0.166667</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>50</td>\n",
" <td>65</td>\n",
" <td>11</td>\n",
" <td>9</td>\n",
" <td>0.0</td>\n",
" <td>19.0</td>\n",
" <td>0.000000</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>76</td>\n",
" <td>39</td>\n",
" <td>13</td>\n",
" <td>7</td>\n",
" <td>2.0</td>\n",
" <td>20.0</td>\n",
" <td>0.100000</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id freq_qid1 freq_qid2 q1len q2len q1_n_words q2_n_words \\\n",
"0 0 1 1 66 57 14 12 \n",
"1 1 4 1 51 88 8 13 \n",
"2 2 1 1 73 59 14 10 \n",
"3 3 1 1 50 65 11 9 \n",
"4 4 3 1 76 39 13 7 \n",
"\n",
" word_Common word_Total word_share freq_q1+q2 freq_q1-q2 \n",
"0 10.0 23.0 0.434783 2 0 \n",
"1 4.0 20.0 0.200000 5 3 \n",
"2 4.0 24.0 0.166667 2 0 \n",
"3 0.0 19.0 0.000000 2 0 \n",
"4 2.0 20.0 0.100000 4 2 "
]
},
"metadata": {
"tags": []
},
"execution_count": 10
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Y7dIx6trS26k",
"outputId": "178fc912-ac51-4fb9-bf36-301f7d798b97"
},
"source": [
"# Questions 1 tfidf weighted word2vec\n",
"df3_q1.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>...</th>\n",
" <th>374</th>\n",
" <th>375</th>\n",
" <th>376</th>\n",
" <th>377</th>\n",
" <th>378</th>\n",
" <th>379</th>\n",
" <th>380</th>\n",
" <th>381</th>\n",
" <th>382</th>\n",
" <th>383</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>121.929927</td>\n",
" <td>100.083900</td>\n",
" <td>72.497894</td>\n",
" <td>115.641800</td>\n",
" <td>-48.370870</td>\n",
" <td>34.619058</td>\n",
" <td>-172.057787</td>\n",
" <td>-92.502617</td>\n",
" <td>113.223315</td>\n",
" <td>50.562441</td>\n",
" <td>...</td>\n",
" <td>12.397642</td>\n",
" <td>40.909519</td>\n",
" <td>8.150261</td>\n",
" <td>-15.170692</td>\n",
" <td>18.007709</td>\n",
" <td>6.166999</td>\n",
" <td>-30.124163</td>\n",
" <td>3.700902</td>\n",
" <td>-1.757693</td>\n",
" <td>-1.818058</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-78.070939</td>\n",
" <td>54.843781</td>\n",
" <td>82.738482</td>\n",
" <td>98.191872</td>\n",
" <td>-51.234859</td>\n",
" <td>55.013510</td>\n",
" <td>-39.140730</td>\n",
" <td>-82.692352</td>\n",
" <td>45.161489</td>\n",
" <td>-9.556289</td>\n",
" <td>...</td>\n",
" <td>-21.987077</td>\n",
" <td>-12.389279</td>\n",
" <td>20.667979</td>\n",
" <td>2.202714</td>\n",
" <td>-17.142454</td>\n",
" <td>-5.880972</td>\n",
" <td>-10.123963</td>\n",
" <td>-4.890663</td>\n",
" <td>-13.018389</td>\n",
" <td>-5.219310</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-5.355015</td>\n",
" <td>73.671810</td>\n",
" <td>14.376365</td>\n",
" <td>104.130241</td>\n",
" <td>1.433537</td>\n",
" <td>35.229116</td>\n",
" <td>-148.519385</td>\n",
" <td>-97.124595</td>\n",
" <td>41.972195</td>\n",
" <td>50.948731</td>\n",
" <td>...</td>\n",
" <td>3.027700</td>\n",
" <td>14.025767</td>\n",
" <td>-2.960312</td>\n",
" <td>-3.206544</td>\n",
" <td>4.355141</td>\n",
" <td>2.936152</td>\n",
" <td>-20.199555</td>\n",
" <td>9.816351</td>\n",
" <td>11.894366</td>\n",
" <td>-8.798819</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>5.778359</td>\n",
" <td>-34.712038</td>\n",
" <td>48.999631</td>\n",
" <td>59.699204</td>\n",
" <td>40.661263</td>\n",
" <td>-41.658731</td>\n",
" <td>-36.808594</td>\n",
" <td>24.170655</td>\n",
" <td>0.235600</td>\n",
" <td>-29.407290</td>\n",
" <td>...</td>\n",
" <td>13.100007</td>\n",
" <td>1.405670</td>\n",
" <td>-1.891076</td>\n",
" <td>-7.882638</td>\n",
" <td>18.000561</td>\n",
" <td>12.106918</td>\n",
" <td>-10.507835</td>\n",
" <td>5.243834</td>\n",
" <td>10.158340</td>\n",
" <td>5.886351</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>51.138220</td>\n",
" <td>38.587312</td>\n",
" <td>123.639488</td>\n",
" <td>53.333041</td>\n",
" <td>-47.062739</td>\n",
" <td>37.356212</td>\n",
" <td>-298.722753</td>\n",
" <td>-106.421119</td>\n",
" <td>106.248914</td>\n",
" <td>65.880707</td>\n",
" <td>...</td>\n",
" <td>13.906532</td>\n",
" <td>43.461721</td>\n",
" <td>11.519207</td>\n",
" <td>-22.468284</td>\n",
" <td>45.431128</td>\n",
" <td>8.161224</td>\n",
" <td>-35.373910</td>\n",
" <td>7.728865</td>\n",
" <td>9.592849</td>\n",
" <td>5.447336</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 384 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 5 \\\n",
"0 121.929927 100.083900 72.497894 115.641800 -48.370870 34.619058 \n",
"1 -78.070939 54.843781 82.738482 98.191872 -51.234859 55.013510 \n",
"2 -5.355015 73.671810 14.376365 104.130241 1.433537 35.229116 \n",
"3 5.778359 -34.712038 48.999631 59.699204 40.661263 -41.658731 \n",
"4 51.138220 38.587312 123.639488 53.333041 -47.062739 37.356212 \n",
"\n",
" 6 7 8 9 ... 374 \\\n",
"0 -172.057787 -92.502617 113.223315 50.562441 ... 12.397642 \n",
"1 -39.140730 -82.692352 45.161489 -9.556289 ... -21.987077 \n",
"2 -148.519385 -97.124595 41.972195 50.948731 ... 3.027700 \n",
"3 -36.808594 24.170655 0.235600 -29.407290 ... 13.100007 \n",
"4 -298.722753 -106.421119 106.248914 65.880707 ... 13.906532 \n",
"\n",
" 375 376 377 378 379 380 381 \\\n",
"0 40.909519 8.150261 -15.170692 18.007709 6.166999 -30.124163 3.700902 \n",
"1 -12.389279 20.667979 2.202714 -17.142454 -5.880972 -10.123963 -4.890663 \n",
"2 14.025767 -2.960312 -3.206544 4.355141 2.936152 -20.199555 9.816351 \n",
"3 1.405670 -1.891076 -7.882638 18.000561 12.106918 -10.507835 5.243834 \n",
"4 43.461721 11.519207 -22.468284 45.431128 8.161224 -35.373910 7.728865 \n",
"\n",
" 382 383 \n",
"0 -1.757693 -1.818058 \n",
"1 -13.018389 -5.219310 \n",
"2 11.894366 -8.798819 \n",
"3 10.158340 5.886351 \n",
"4 9.592849 5.447336 \n",
"\n",
"[5 rows x 384 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 11
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "_NTh_yUgS26k",
"outputId": "511992d8-1acb-4df7-b616-09afdfb979c2"
},
"source": [
"# Questions 2 tfidf weighted word2vec\n",
"df3_q2.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>...</th>\n",
" <th>374</th>\n",
" <th>375</th>\n",
" <th>376</th>\n",
" <th>377</th>\n",
" <th>378</th>\n",
" <th>379</th>\n",
" <th>380</th>\n",
" <th>381</th>\n",
" <th>382</th>\n",
" <th>383</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>125.983301</td>\n",
" <td>95.636485</td>\n",
" <td>42.114702</td>\n",
" <td>95.449980</td>\n",
" <td>-37.386295</td>\n",
" <td>39.400078</td>\n",
" <td>-148.116070</td>\n",
" <td>-87.851475</td>\n",
" <td>110.371966</td>\n",
" <td>62.272814</td>\n",
" <td>...</td>\n",
" <td>16.165592</td>\n",
" <td>33.030668</td>\n",
" <td>7.019996</td>\n",
" <td>-14.793959</td>\n",
" <td>15.437511</td>\n",
" <td>8.199658</td>\n",
" <td>-25.070834</td>\n",
" <td>1.571619</td>\n",
" <td>1.603738</td>\n",
" <td>0.305645</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-106.871904</td>\n",
" <td>80.290331</td>\n",
" <td>79.066297</td>\n",
" <td>59.302092</td>\n",
" <td>-42.175328</td>\n",
" <td>117.616655</td>\n",
" <td>-144.364237</td>\n",
" <td>-127.131513</td>\n",
" <td>22.962533</td>\n",
" <td>25.397575</td>\n",
" <td>...</td>\n",
" <td>-4.901128</td>\n",
" <td>-4.565393</td>\n",
" <td>41.520751</td>\n",
" <td>-0.727564</td>\n",
" <td>-16.413776</td>\n",
" <td>-7.373778</td>\n",
" <td>2.638877</td>\n",
" <td>-7.403457</td>\n",
" <td>2.703070</td>\n",
" <td>0.408040</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>7.072875</td>\n",
" <td>15.513378</td>\n",
" <td>1.846914</td>\n",
" <td>85.937583</td>\n",
" <td>-33.808811</td>\n",
" <td>94.702337</td>\n",
" <td>-122.256856</td>\n",
" <td>-114.009530</td>\n",
" <td>53.922293</td>\n",
" <td>60.131814</td>\n",
" <td>...</td>\n",
" <td>8.359966</td>\n",
" <td>-2.165985</td>\n",
" <td>10.936580</td>\n",
" <td>-16.531660</td>\n",
" <td>14.681230</td>\n",
" <td>15.633759</td>\n",
" <td>-1.210901</td>\n",
" <td>14.183826</td>\n",
" <td>11.703135</td>\n",
" <td>10.148075</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>39.421531</td>\n",
" <td>44.136989</td>\n",
" <td>-24.010929</td>\n",
" <td>85.265863</td>\n",
" <td>-0.339022</td>\n",
" <td>-9.323137</td>\n",
" <td>-60.499651</td>\n",
" <td>-37.044763</td>\n",
" <td>49.407848</td>\n",
" <td>-23.350150</td>\n",
" <td>...</td>\n",
" <td>3.311411</td>\n",
" <td>3.788879</td>\n",
" <td>13.398598</td>\n",
" <td>-6.592596</td>\n",
" <td>6.437365</td>\n",
" <td>5.993293</td>\n",
" <td>2.732392</td>\n",
" <td>-3.727647</td>\n",
" <td>5.614115</td>\n",
" <td>6.023693</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>31.950101</td>\n",
" <td>62.854106</td>\n",
" <td>1.778164</td>\n",
" <td>36.218768</td>\n",
" <td>-45.130875</td>\n",
" <td>66.674880</td>\n",
" <td>-106.342341</td>\n",
" <td>-22.901008</td>\n",
" <td>59.835938</td>\n",
" <td>62.663961</td>\n",
" <td>...</td>\n",
" <td>-2.403870</td>\n",
" <td>11.991204</td>\n",
" <td>8.088483</td>\n",
" <td>-15.090201</td>\n",
" <td>8.375166</td>\n",
" <td>1.727225</td>\n",
" <td>-6.601129</td>\n",
" <td>11.317413</td>\n",
" <td>11.544603</td>\n",
" <td>2.478689</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 384 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 5 \\\n",
"0 125.983301 95.636485 42.114702 95.449980 -37.386295 39.400078 \n",
"1 -106.871904 80.290331 79.066297 59.302092 -42.175328 117.616655 \n",
"2 7.072875 15.513378 1.846914 85.937583 -33.808811 94.702337 \n",
"3 39.421531 44.136989 -24.010929 85.265863 -0.339022 -9.323137 \n",
"4 31.950101 62.854106 1.778164 36.218768 -45.130875 66.674880 \n",
"\n",
" 6 7 8 9 ... 374 \\\n",
"0 -148.116070 -87.851475 110.371966 62.272814 ... 16.165592 \n",
"1 -144.364237 -127.131513 22.962533 25.397575 ... -4.901128 \n",
"2 -122.256856 -114.009530 53.922293 60.131814 ... 8.359966 \n",
"3 -60.499651 -37.044763 49.407848 -23.350150 ... 3.311411 \n",
"4 -106.342341 -22.901008 59.835938 62.663961 ... -2.403870 \n",
"\n",
" 375 376 377 378 379 380 \\\n",
"0 33.030668 7.019996 -14.793959 15.437511 8.199658 -25.070834 \n",
"1 -4.565393 41.520751 -0.727564 -16.413776 -7.373778 2.638877 \n",
"2 -2.165985 10.936580 -16.531660 14.681230 15.633759 -1.210901 \n",
"3 3.788879 13.398598 -6.592596 6.437365 5.993293 2.732392 \n",
"4 11.991204 8.088483 -15.090201 8.375166 1.727225 -6.601129 \n",
"\n",
" 381 382 383 \n",
"0 1.571619 1.603738 0.305645 \n",
"1 -7.403457 2.703070 0.408040 \n",
"2 14.183826 11.703135 10.148075 \n",
"3 -3.727647 5.614115 6.023693 \n",
"4 11.317413 11.544603 2.478689 \n",
"\n",
"[5 rows x 384 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 12
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "oTvS35ewS26k",
"outputId": "edddde5a-6130-4ba1-c91e-27677afc50dd"
},
"source": [
"print(\"Number of features in nlp dataframe :\", df1.shape[1])\n",
"print(\"Number of features in preprocessed dataframe :\", df2.shape[1])\n",
"print(\"Number of features in question1 w2v dataframe :\", df3_q1.shape[1])\n",
"print(\"Number of features in question2 w2v dataframe :\", df3_q2.shape[1])\n",
"print(\"Number of features in final dataframe :\", df1.shape[1]+df2.shape[1]+df3_q1.shape[1]+df3_q2.shape[1])"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Number of features in nlp dataframe : 17\n",
"Number of features in preprocessed dataframe : 12\n",
"Number of features in question1 w2v dataframe : 384\n",
"Number of features in question2 w2v dataframe : 384\n",
"Number of features in final dataframe : 794\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"collapsed": true,
"id": "MEWzJdI7S26l"
},
"source": [
"# storing the final features to a csv file\n",
"if not os.path.isfile('final_features.csv'):\n",
"    df3_q1['id'] = df1['id']\n",
"    df3_q2['id'] = df1['id']\n",
"    df1 = df1.merge(df2, on='id', how='left')\n",
"    df2 = df3_q1.merge(df3_q2, on='id', how='left')\n",
"    result = df1.merge(df2, on='id', how='left')\n",
"    result.to_csv('final_features.csv')"
],
"execution_count": null,
"outputs": []
}
]
}