{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
},
"colab": {
"name": "3.vectorization.ipynb",
"provenance": [],
"include_colab_link": true
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/ravigurnatham/5179f256aed1b6f27eade572839891d5/3-vectorization.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "YfDYuwsvS26a"
},
"source": [
"<h2> 3.6 Featurizing text data with tfidf weighted word-vectors </h2>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "4B-dMXJ9S26c"
},
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import re\n",
"import time\n",
"import warnings\n",
"import numpy as np\n",
"from nltk.corpus import stopwords\n",
"from sklearn.preprocessing import normalize\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"warnings.filterwarnings(\"ignore\")\n",
"import sys\n",
"import os\n",
"from tqdm import tqdm\n"
],
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"collapsed": true,
"id": "tX6sJQFqS26e"
},
"source": [
"# read the data and force both question columns to strings (avoids decoding/NaN problems)\n",
"df = pd.read_csv(\"train.csv\")\n",
"df['question1'] = df['question1'].apply(lambda x: str(x))\n",
"df['question2'] = df['question2'].apply(lambda x: str(x))"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "pRoi7zUdS26e",
"outputId": "43d4097f-7078-4e54-c28e-6a842c6706e4"
},
"source": [
"df.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>qid1</th>\n",
" <th>qid2</th>\n",
" <th>question1</th>\n",
" <th>question2</th>\n",
" <th>is_duplicate</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>What is the step by step guide to invest in sh...</td>\n",
" <td>What is the step by step guide to invest in sh...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>What is the story of Kohinoor (Koh-i-Noor) Dia...</td>\n",
" <td>What would happen if the Indian government sto...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>6</td>\n",
" <td>How can I increase the speed of my internet co...</td>\n",
" <td>How can Internet speed be increased by hacking...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>8</td>\n",
" <td>Why am I mentally very lonely? How can I solve...</td>\n",
" <td>Find the remainder when [math]23^{24}[/math] i...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>9</td>\n",
" <td>10</td>\n",
" <td>Which one dissolve in water quikly sugar, salt...</td>\n",
" <td>Which fish would survive in salt water?</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id qid1 qid2 question1 \\\n",
"0 0 1 2 What is the step by step guide to invest in sh... \n",
"1 1 3 4 What is the story of Kohinoor (Koh-i-Noor) Dia... \n",
"2 2 5 6 How can I increase the speed of my internet co... \n",
"3 3 7 8 Why am I mentally very lonely? How can I solve... \n",
"4 4 9 10 Which one dissolve in water quikly sugar, salt... \n",
"\n",
" question2 is_duplicate \n",
"0 What is the step by step guide to invest in sh... 0 \n",
"1 What would happen if the Indian government sto... 0 \n",
"2 How can Internet speed be increased by hacking... 0 \n",
"3 Find the remainder when [math]23^{24}[/math] i... 0 \n",
"4 Which fish would survive in salt water? 0 "
]
},
"metadata": {
"tags": []
},
"execution_count": 3
}
]
},
{
"cell_type": "code",
"metadata": {
"collapsed": true,
"id": "8W9yl5B_S26f"
},
"source": [
"# merge the two question columns into a single corpus\n",
"questions = list(df['question1']) + list(df['question2'])\n",
"\n",
"tfidf = TfidfVectorizer(lowercase=False)\n",
"tfidf.fit_transform(questions)\n",
"\n",
"# dict with key: word and value: its idf weight (tfidf.idf_ stores idf, not the full tf-idf score)\n",
"word2tfidf = dict(zip(tfidf.get_feature_names(), tfidf.idf_))"
],
"execution_count": null,
"outputs": []
},
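{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (a sketch added for illustration, not part of the original pipeline): the values stored in `word2tfidf` are the IDF weights that sklearn computes with its default smoothed IDF, idf(t) = ln((1 + n) / (1 + df(t))) + 1, so very common words get small weights and rare words get large ones."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Sketch: peek at the smallest and largest idf weights in word2tfidf.\n",
"# Low idf = very common word, high idf = rare word.\n",
"sorted_by_idf = sorted(word2tfidf.items(), key=lambda kv: kv[1])\n",
"print('most common terms (lowest idf):', sorted_by_idf[:5])\n",
"print('rarest terms (highest idf)    :', sorted_by_idf[-5:])"
],
"execution_count": null,
"outputs": []
},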
{
"cell_type": "markdown",
"metadata": {
"collapsed": true,
"id": "HRb86OXfS26f"
},
"source": [
"- After computing the TF-IDF scores, we convert each question into a weighted average of its word2vec vectors, using these scores as weights (this computation is sketched in the cell below).\n",
"- Here we use pre-trained GloVe vectors, which come free with spaCy: https://spacy.io/usage/vectors-similarity\n",
"- They are trained on Wikipedia and are therefore strong in terms of word semantics."
]
},
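{
"cell_type": "markdown",
"metadata": {},
"source": [
"The helper below is a small sketch of the computation performed in the two loops that follow (the function name `tfidf_weighted_vector` is illustrative and not part of the original pipeline): each token vector is scaled by its idf weight and the scaled vectors are summed. Note that, because every row of the `np.zeros([len(doc), 384])` matrix in the loops receives the same update, the final `.mean(axis=0)` simply returns that idf-weighted sum."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"def tfidf_weighted_vector(question, nlp_model, word2weight, dim=384):\n",
"    # Sketch of the per-question feature built below: sum of idf-weighted\n",
"    # spaCy token vectors; out-of-vocabulary words contribute nothing.\n",
"    # dim should match the spaCy model's vector size (384 in this notebook).\n",
"    doc = nlp_model(question)\n",
"    vec = np.zeros(dim)\n",
"    for token in doc:\n",
"        vec += token.vector * word2weight.get(str(token), 0)\n",
"    return vec\n",
"\n",
"# example usage, once nlp is loaded in the next cell:\n",
"# tfidf_weighted_vector(df['question1'][0], nlp, word2tfidf)"
],
"execution_count": null,
"outputs": []
},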
{
"cell_type": "code",
"metadata": {
"id": "eVBNQy_QS26g",
"outputId": "8dfbbab2-abd1-4fc2-8623-c8684aeed7d9"
},
"source": [
"# (the larger en_vectors_web_lg model includes over 1 million unique vectors;\n",
"# here the small en_core_web_sm model is loaded)\n",
"import spacy\n",
"nlp = spacy.load('en_core_web_sm')\n",
"\n",
"vecs1 = []\n",
"# tqdm is used to print the progress bar\n",
"for qu1 in tqdm(list(df['question1'])):\n",
"    doc1 = nlp(qu1)\n",
"    # 384 is the dimensionality of the word vectors\n",
"    mean_vec1 = np.zeros([len(doc1), 384])\n",
"    for word1 in doc1:\n",
"        # spaCy word vector\n",
"        vec1 = word1.vector\n",
"        # fetch the idf weight (0 if the word is not in the tf-idf vocabulary)\n",
"        try:\n",
"            idf = word2tfidf[str(word1)]\n",
"        except KeyError:\n",
"            idf = 0\n",
"        # accumulate the idf-weighted vector\n",
"        mean_vec1 += vec1 * idf\n",
"    mean_vec1 = mean_vec1.mean(axis=0)\n",
"    vecs1.append(mean_vec1)\n",
"df['q1_feats_m'] = list(vecs1)\n"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████| 404290/404290 [2:13:51<00:00, 50.34it/s]\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "-OYMtpmuS26h",
"outputId": "55b15299-1870-42dc-f89b-78052210dd6d"
},
"source": [
"vecs2 = []\n",
"for qu2 in tqdm(list(df['question2'])):\n",
"    doc2 = nlp(qu2)\n",
"    mean_vec2 = np.zeros([len(doc2), 384])\n",
"    for word2 in doc2:\n",
"        # spaCy word vector\n",
"        vec2 = word2.vector\n",
"        # fetch the idf weight (0 if the word is not in the tf-idf vocabulary)\n",
"        try:\n",
"            idf = word2tfidf[str(word2)]\n",
"        except KeyError:\n",
"            idf = 0\n",
"        # accumulate the idf-weighted vector\n",
"        mean_vec2 += vec2 * idf\n",
"    mean_vec2 = mean_vec2.mean(axis=0)\n",
"    vecs2.append(mean_vec2)\n",
"df['q2_feats_m'] = list(vecs2)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████| 404290/404290 [1:47:52<00:00, 62.46it/s]\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"collapsed": true,
"id": "66CW0XN1S26i"
},
"source": [
"# prepro_features_train.csv (simple preprocessing features)\n",
"# nlp_features_train.csv (NLP features)\n",
"if os.path.isfile('nlp_features_train.csv'):\n",
"    dfnlp = pd.read_csv(\"nlp_features_train.csv\", encoding='latin-1')\n",
"else:\n",
"    print(\"download nlp_features_train.csv from drive or run the previous notebook\")\n",
"\n",
"if os.path.isfile('df_fe_without_preprocessing_train.csv'):\n",
"    dfppro = pd.read_csv(\"df_fe_without_preprocessing_train.csv\", encoding='latin-1')\n",
"else:\n",
"    print(\"download df_fe_without_preprocessing_train.csv from drive or run the previous notebook\")"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"collapsed": true,
"id": "w4Wc7VAjS26i"
},
"source": [
"df1 = dfnlp.drop(['qid1','qid2','question1','question2'], axis=1)\n",
"df2 = dfppro.drop(['qid1','qid2','question1','question2','is_duplicate'], axis=1)\n",
"df3 = df.drop(['qid1','qid2','question1','question2','is_duplicate'], axis=1)\n",
"df3_q1 = pd.DataFrame(df3.q1_feats_m.values.tolist(), index=df3.index)\n",
"df3_q2 = pd.DataFrame(df3.q2_feats_m.values.tolist(), index=df3.index)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "aBcKroT0S26j",
"outputId": "1728b17b-9e91-4cde-edab-76372ade97d1"
},
"source": [
"# dataframe of nlp features\n",
"df1.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>is_duplicate</th>\n",
" <th>cwc_min</th>\n",
" <th>cwc_max</th>\n",
" <th>csc_min</th>\n",
" <th>csc_max</th>\n",
" <th>ctc_min</th>\n",
" <th>ctc_max</th>\n",
" <th>last_word_eq</th>\n",
" <th>first_word_eq</th>\n",
" <th>abs_len_diff</th>\n",
" <th>mean_len</th>\n",
" <th>token_set_ratio</th>\n",
" <th>token_sort_ratio</th>\n",
" <th>fuzz_ratio</th>\n",
" <th>fuzz_partial_ratio</th>\n",
" <th>longest_substr_ratio</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.999980</td>\n",
" <td>0.833319</td>\n",
" <td>0.999983</td>\n",
" <td>0.999983</td>\n",
" <td>0.916659</td>\n",
" <td>0.785709</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>13.0</td>\n",
" <td>100</td>\n",
" <td>93</td>\n",
" <td>93</td>\n",
" <td>100</td>\n",
" <td>0.982759</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.799984</td>\n",
" <td>0.399996</td>\n",
" <td>0.749981</td>\n",
" <td>0.599988</td>\n",
" <td>0.699993</td>\n",
" <td>0.466664</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>5.0</td>\n",
" <td>12.5</td>\n",
" <td>86</td>\n",
" <td>63</td>\n",
" <td>66</td>\n",
" <td>75</td>\n",
" <td>0.596154</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0.399992</td>\n",
" <td>0.333328</td>\n",
" <td>0.399992</td>\n",
" <td>0.249997</td>\n",
" <td>0.399996</td>\n",
" <td>0.285712</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>4.0</td>\n",
" <td>12.0</td>\n",
" <td>66</td>\n",
" <td>66</td>\n",
" <td>54</td>\n",
" <td>54</td>\n",
" <td>0.166667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>12.0</td>\n",
" <td>36</td>\n",
" <td>36</td>\n",
" <td>35</td>\n",
" <td>40</td>\n",
" <td>0.039216</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0.399992</td>\n",
" <td>0.199998</td>\n",
" <td>0.999950</td>\n",
" <td>0.666644</td>\n",
" <td>0.571420</td>\n",
" <td>0.307690</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>6.0</td>\n",
" <td>10.0</td>\n",
" <td>67</td>\n",
" <td>47</td>\n",
" <td>46</td>\n",
" <td>56</td>\n",
" <td>0.175000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id is_duplicate cwc_min cwc_max csc_min csc_max ctc_min \\\n",
"0 0 0 0.999980 0.833319 0.999983 0.999983 0.916659 \n",
"1 1 0 0.799984 0.399996 0.749981 0.599988 0.699993 \n",
"2 2 0 0.399992 0.333328 0.399992 0.249997 0.399996 \n",
"3 3 0 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"4 4 0 0.399992 0.199998 0.999950 0.666644 0.571420 \n",
"\n",
" ctc_max last_word_eq first_word_eq abs_len_diff mean_len \\\n",
"0 0.785709 0.0 1.0 2.0 13.0 \n",
"1 0.466664 0.0 1.0 5.0 12.5 \n",
"2 0.285712 0.0 1.0 4.0 12.0 \n",
"3 0.000000 0.0 0.0 2.0 12.0 \n",
"4 0.307690 0.0 1.0 6.0 10.0 \n",
"\n",
" token_set_ratio token_sort_ratio fuzz_ratio fuzz_partial_ratio \\\n",
"0 100 93 93 100 \n",
"1 86 63 66 75 \n",
"2 66 66 54 54 \n",
"3 36 36 35 40 \n",
"4 67 47 46 56 \n",
"\n",
" longest_substr_ratio \n",
"0 0.982759 \n",
"1 0.596154 \n",
"2 0.166667 \n",
"3 0.039216 \n",
"4 0.175000 "
]
},
"metadata": {
"tags": []
},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "uiY5w5HfS26j",
"outputId": "342f7234-939a-4f4c-9639-d6ff5a7140f8"
},
"source": [
"# data before preprocessing \n",
"df2.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>freq_qid1</th>\n",
" <th>freq_qid2</th>\n",
" <th>q1len</th>\n",
" <th>q2len</th>\n",
" <th>q1_n_words</th>\n",
" <th>q2_n_words</th>\n",
" <th>word_Common</th>\n",
" <th>word_Total</th>\n",
" <th>word_share</th>\n",
" <th>freq_q1+q2</th>\n",
" <th>freq_q1-q2</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>66</td>\n",
" <td>57</td>\n",
" <td>14</td>\n",
" <td>12</td>\n",
" <td>10.0</td>\n",
" <td>23.0</td>\n",
" <td>0.434783</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>51</td>\n",
" <td>88</td>\n",
" <td>8</td>\n",
" <td>13</td>\n",
" <td>4.0</td>\n",
" <td>20.0</td>\n",
" <td>0.200000</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>73</td>\n",
" <td>59</td>\n",
" <td>14</td>\n",
" <td>10</td>\n",
" <td>4.0</td>\n",
" <td>24.0</td>\n",
" <td>0.166667</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>50</td>\n",
" <td>65</td>\n",
" <td>11</td>\n",
" <td>9</td>\n",
" <td>0.0</td>\n",
" <td>19.0</td>\n",
" <td>0.000000</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>76</td>\n",
" <td>39</td>\n",
" <td>13</td>\n",
" <td>7</td>\n",
" <td>2.0</td>\n",
" <td>20.0</td>\n",
" <td>0.100000</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" id freq_qid1 freq_qid2 q1len q2len q1_n_words q2_n_words \\\n",
"0 0 1 1 66 57 14 12 \n",
"1 1 4 1 51 88 8 13 \n",
"2 2 1 1 73 59 14 10 \n",
"3 3 1 1 50 65 11 9 \n",
"4 4 3 1 76 39 13 7 \n",
"\n",
" word_Common word_Total word_share freq_q1+q2 freq_q1-q2 \n",
"0 10.0 23.0 0.434783 2 0 \n",
"1 4.0 20.0 0.200000 5 3 \n",
"2 4.0 24.0 0.166667 2 0 \n",
"3 0.0 19.0 0.000000 2 0 \n",
"4 2.0 20.0 0.100000 4 2 "
]
},
"metadata": {
"tags": []
},
"execution_count": 10
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Y7dIx6trS26k",
"outputId": "178fc912-ac51-4fb9-bf36-301f7d798b97"
},
"source": [
"# Questions 1 tfidf weighted word2vec\n",
"df3_q1.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>...</th>\n",
" <th>374</th>\n",
" <th>375</th>\n",
" <th>376</th>\n",
" <th>377</th>\n",
" <th>378</th>\n",
" <th>379</th>\n",
" <th>380</th>\n",
" <th>381</th>\n",
" <th>382</th>\n",
" <th>383</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>121.929927</td>\n",
" <td>100.083900</td>\n",
" <td>72.497894</td>\n",
" <td>115.641800</td>\n",
" <td>-48.370870</td>\n",
" <td>34.619058</td>\n",
" <td>-172.057787</td>\n",
" <td>-92.502617</td>\n",
" <td>113.223315</td>\n",
" <td>50.562441</td>\n",
" <td>...</td>\n",
" <td>12.397642</td>\n",
" <td>40.909519</td>\n",
" <td>8.150261</td>\n",
" <td>-15.170692</td>\n",
" <td>18.007709</td>\n",
" <td>6.166999</td>\n",
" <td>-30.124163</td>\n",
" <td>3.700902</td>\n",
" <td>-1.757693</td>\n",
" <td>-1.818058</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-78.070939</td>\n",
" <td>54.843781</td>\n",
" <td>82.738482</td>\n",
" <td>98.191872</td>\n",
" <td>-51.234859</td>\n",
" <td>55.013510</td>\n",
" <td>-39.140730</td>\n",
" <td>-82.692352</td>\n",
" <td>45.161489</td>\n",
" <td>-9.556289</td>\n",
" <td>...</td>\n",
" <td>-21.987077</td>\n",
" <td>-12.389279</td>\n",
" <td>20.667979</td>\n",
" <td>2.202714</td>\n",
" <td>-17.142454</td>\n",
" <td>-5.880972</td>\n",
" <td>-10.123963</td>\n",
" <td>-4.890663</td>\n",
" <td>-13.018389</td>\n",
" <td>-5.219310</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-5.355015</td>\n",
" <td>73.671810</td>\n",
" <td>14.376365</td>\n",
" <td>104.130241</td>\n",
" <td>1.433537</td>\n",
" <td>35.229116</td>\n",
" <td>-148.519385</td>\n",
" <td>-97.124595</td>\n",
" <td>41.972195</td>\n",
" <td>50.948731</td>\n",
" <td>...</td>\n",
" <td>3.027700</td>\n",
" <td>14.025767</td>\n",
" <td>-2.960312</td>\n",
" <td>-3.206544</td>\n",
" <td>4.355141</td>\n",
" <td>2.936152</td>\n",
" <td>-20.199555</td>\n",
" <td>9.816351</td>\n",
" <td>11.894366</td>\n",
" <td>-8.798819</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>5.778359</td>\n",
" <td>-34.712038</td>\n",
" <td>48.999631</td>\n",
" <td>59.699204</td>\n",
" <td>40.661263</td>\n",
" <td>-41.658731</td>\n",
" <td>-36.808594</td>\n",
" <td>24.170655</td>\n",
" <td>0.235600</td>\n",
" <td>-29.407290</td>\n",
" <td>...</td>\n",
" <td>13.100007</td>\n",
" <td>1.405670</td>\n",
" <td>-1.891076</td>\n",
" <td>-7.882638</td>\n",
" <td>18.000561</td>\n",
" <td>12.106918</td>\n",
" <td>-10.507835</td>\n",
" <td>5.243834</td>\n",
" <td>10.158340</td>\n",
" <td>5.886351</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>51.138220</td>\n",
" <td>38.587312</td>\n",
" <td>123.639488</td>\n",
" <td>53.333041</td>\n",
" <td>-47.062739</td>\n",
" <td>37.356212</td>\n",
" <td>-298.722753</td>\n",
" <td>-106.421119</td>\n",
" <td>106.248914</td>\n",
" <td>65.880707</td>\n",
" <td>...</td>\n",
" <td>13.906532</td>\n",
" <td>43.461721</td>\n",
" <td>11.519207</td>\n",
" <td>-22.468284</td>\n",
" <td>45.431128</td>\n",
" <td>8.161224</td>\n",
" <td>-35.373910</td>\n",
" <td>7.728865</td>\n",
" <td>9.592849</td>\n",
" <td>5.447336</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 384 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 5 \\\n",
"0 121.929927 100.083900 72.497894 115.641800 -48.370870 34.619058 \n",
"1 -78.070939 54.843781 82.738482 98.191872 -51.234859 55.013510 \n",
"2 -5.355015 73.671810 14.376365 104.130241 1.433537 35.229116 \n",
"3 5.778359 -34.712038 48.999631 59.699204 40.661263 -41.658731 \n",
"4 51.138220 38.587312 123.639488 53.333041 -47.062739 37.356212 \n",
"\n",
" 6 7 8 9 ... 374 \\\n",
"0 -172.057787 -92.502617 113.223315 50.562441 ... 12.397642 \n",
"1 -39.140730 -82.692352 45.161489 -9.556289 ... -21.987077 \n",
"2 -148.519385 -97.124595 41.972195 50.948731 ... 3.027700 \n",
"3 -36.808594 24.170655 0.235600 -29.407290 ... 13.100007 \n",
"4 -298.722753 -106.421119 106.248914 65.880707 ... 13.906532 \n",
"\n",
" 375 376 377 378 379 380 381 \\\n",
"0 40.909519 8.150261 -15.170692 18.007709 6.166999 -30.124163 3.700902 \n",
"1 -12.389279 20.667979 2.202714 -17.142454 -5.880972 -10.123963 -4.890663 \n",
"2 14.025767 -2.960312 -3.206544 4.355141 2.936152 -20.199555 9.816351 \n",
"3 1.405670 -1.891076 -7.882638 18.000561 12.106918 -10.507835 5.243834 \n",
"4 43.461721 11.519207 -22.468284 45.431128 8.161224 -35.373910 7.728865 \n",
"\n",
" 382 383 \n",
"0 -1.757693 -1.818058 \n",
"1 -13.018389 -5.219310 \n",
"2 11.894366 -8.798819 \n",
"3 10.158340 5.886351 \n",
"4 9.592849 5.447336 \n",
"\n",
"[5 rows x 384 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 11
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "_NTh_yUgS26k",
"outputId": "511992d8-1acb-4df7-b616-09afdfb979c2"
},
"source": [
"# Questions 2 tfidf weighted word2vec\n",
"df3_q2.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>8</th>\n",
" <th>9</th>\n",
" <th>...</th>\n",
" <th>374</th>\n",
" <th>375</th>\n",
" <th>376</th>\n",
" <th>377</th>\n",
" <th>378</th>\n",
" <th>379</th>\n",
" <th>380</th>\n",
" <th>381</th>\n",
" <th>382</th>\n",
" <th>383</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>125.983301</td>\n",
" <td>95.636485</td>\n",
" <td>42.114702</td>\n",
" <td>95.449980</td>\n",
" <td>-37.386295</td>\n",
" <td>39.400078</td>\n",
" <td>-148.116070</td>\n",
" <td>-87.851475</td>\n",
" <td>110.371966</td>\n",
" <td>62.272814</td>\n",
" <td>...</td>\n",
" <td>16.165592</td>\n",
" <td>33.030668</td>\n",
" <td>7.019996</td>\n",
" <td>-14.793959</td>\n",
" <td>15.437511</td>\n",
" <td>8.199658</td>\n",
" <td>-25.070834</td>\n",
" <td>1.571619</td>\n",
" <td>1.603738</td>\n",
" <td>0.305645</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-106.871904</td>\n",
" <td>80.290331</td>\n",
" <td>79.066297</td>\n",
" <td>59.302092</td>\n",
" <td>-42.175328</td>\n",
" <td>117.616655</td>\n",
" <td>-144.364237</td>\n",
" <td>-127.131513</td>\n",
" <td>22.962533</td>\n",
" <td>25.397575</td>\n",
" <td>...</td>\n",
" <td>-4.901128</td>\n",
" <td>-4.565393</td>\n",
" <td>41.520751</td>\n",
" <td>-0.727564</td>\n",
" <td>-16.413776</td>\n",
" <td>-7.373778</td>\n",
" <td>2.638877</td>\n",
" <td>-7.403457</td>\n",
" <td>2.703070</td>\n",
" <td>0.408040</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>7.072875</td>\n",
" <td>15.513378</td>\n",
" <td>1.846914</td>\n",
" <td>85.937583</td>\n",
" <td>-33.808811</td>\n",
" <td>94.702337</td>\n",
" <td>-122.256856</td>\n",
" <td>-114.009530</td>\n",
" <td>53.922293</td>\n",
" <td>60.131814</td>\n",
" <td>...</td>\n",
" <td>8.359966</td>\n",
" <td>-2.165985</td>\n",
" <td>10.936580</td>\n",
" <td>-16.531660</td>\n",
" <td>14.681230</td>\n",
" <td>15.633759</td>\n",
" <td>-1.210901</td>\n",
" <td>14.183826</td>\n",
" <td>11.703135</td>\n",
" <td>10.148075</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>39.421531</td>\n",
" <td>44.136989</td>\n",
" <td>-24.010929</td>\n",
" <td>85.265863</td>\n",
" <td>-0.339022</td>\n",
" <td>-9.323137</td>\n",
" <td>-60.499651</td>\n",
" <td>-37.044763</td>\n",
" <td>49.407848</td>\n",
" <td>-23.350150</td>\n",
" <td>...</td>\n",
" <td>3.311411</td>\n",
" <td>3.788879</td>\n",
" <td>13.398598</td>\n",
" <td>-6.592596</td>\n",
" <td>6.437365</td>\n",
" <td>5.993293</td>\n",
" <td>2.732392</td>\n",
" <td>-3.727647</td>\n",
" <td>5.614115</td>\n",
" <td>6.023693</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>31.950101</td>\n",
" <td>62.854106</td>\n",
" <td>1.778164</td>\n",
" <td>36.218768</td>\n",
" <td>-45.130875</td>\n",
" <td>66.674880</td>\n",
" <td>-106.342341</td>\n",
" <td>-22.901008</td>\n",
" <td>59.835938</td>\n",
" <td>62.663961</td>\n",
" <td>...</td>\n",
" <td>-2.403870</td>\n",
" <td>11.991204</td>\n",
" <td>8.088483</td>\n",
" <td>-15.090201</td>\n",
" <td>8.375166</td>\n",
" <td>1.727225</td>\n",
" <td>-6.601129</td>\n",
" <td>11.317413</td>\n",
" <td>11.544603</td>\n",
" <td>2.478689</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 384 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3 4 5 \\\n",
"0 125.983301 95.636485 42.114702 95.449980 -37.386295 39.400078 \n",
"1 -106.871904 80.290331 79.066297 59.302092 -42.175328 117.616655 \n",
"2 7.072875 15.513378 1.846914 85.937583 -33.808811 94.702337 \n",
"3 39.421531 44.136989 -24.010929 85.265863 -0.339022 -9.323137 \n",
"4 31.950101 62.854106 1.778164 36.218768 -45.130875 66.674880 \n",
"\n",
" 6 7 8 9 ... 374 \\\n",
"0 -148.116070 -87.851475 110.371966 62.272814 ... 16.165592 \n",
"1 -144.364237 -127.131513 22.962533 25.397575 ... -4.901128 \n",
"2 -122.256856 -114.009530 53.922293 60.131814 ... 8.359966 \n",
"3 -60.499651 -37.044763 49.407848 -23.350150 ... 3.311411 \n",
"4 -106.342341 -22.901008 59.835938 62.663961 ... -2.403870 \n",
"\n",
" 375 376 377 378 379 380 \\\n",
"0 33.030668 7.019996 -14.793959 15.437511 8.199658 -25.070834 \n",
"1 -4.565393 41.520751 -0.727564 -16.413776 -7.373778 2.638877 \n",
"2 -2.165985 10.936580 -16.531660 14.681230 15.633759 -1.210901 \n",
"3 3.788879 13.398598 -6.592596 6.437365 5.993293 2.732392 \n",
"4 11.991204 8.088483 -15.090201 8.375166 1.727225 -6.601129 \n",
"\n",
" 381 382 383 \n",
"0 1.571619 1.603738 0.305645 \n",
"1 -7.403457 2.703070 0.408040 \n",
"2 14.183826 11.703135 10.148075 \n",
"3 -3.727647 5.614115 6.023693 \n",
"4 11.317413 11.544603 2.478689 \n",
"\n",
"[5 rows x 384 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 12
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "oTvS35ewS26k",
"outputId": "edddde5a-6130-4ba1-c91e-27677afc50dd"
},
"source": [
"print(\"Number of features in nlp dataframe :\", df1.shape[1])\n",
"print(\"Number of features in preprocessed dataframe :\", df2.shape[1])\n",
"print(\"Number of features in question1 w2v dataframe :\", df3_q1.shape[1])\n",
"print(\"Number of features in question2 w2v dataframe :\", df3_q2.shape[1])\n",
"print(\"Number of features in final dataframe :\", df1.shape[1]+df2.shape[1]+df3_q1.shape[1]+df3_q2.shape[1])"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Number of features in nlp dataframe : 17\n",
"Number of features in preprocessed dataframe : 12\n",
"Number of features in question1 w2v dataframe : 384\n",
"Number of features in question2 w2v dataframe : 384\n",
"Number of features in final dataframe : 794\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"collapsed": true,
"id": "MEWzJdI7S26l"
},
"source": [
"# storing the final features to a csv file\n",
"if not os.path.isfile('final_features.csv'):\n",
"    df3_q1['id'] = df1['id']\n",
"    df3_q2['id'] = df1['id']\n",
"    df1 = df1.merge(df2, on='id', how='left')\n",
"    df2 = df3_q1.merge(df3_q2, on='id', how='left')\n",
"    result = df1.merge(df2, on='id', how='left')\n",
"    result.to_csv('final_features.csv')"
],
"execution_count": null,
"outputs": []
}
]
}