{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"import pandas as pd\n",
"import numpy as np\n",
"import os\n",
"import re\n",
"import time\n",
"from gensim.models import KeyedVectors\n",
"import pandas as pd\n",
"import numpy as np\n",
"import time\n",
"from collections import OrderedDict\n",
"import spacy\n",
"from gensim.models import KeyedVectors\n",
"import re\n",
"import gensim\n",
"import operator\n",
"import importlib\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Creating a pre processing class for cleansing data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class pre_processing:\n",
" \n",
" #Class declared for step wise pre processing text data \n",
" #At each step we monitor our vocab coverage.\n",
" \n",
" def build_vocab(sentences):\n",
" voc={}\n",
" print(\"Let's get it started\")\n",
" start=time.time()\n",
" for sent in sentences:\n",
" sent=sent.lower().split(' ')\n",
" for word in sent:\n",
" try:\n",
" voc[word]+=1\n",
" except KeyError:\n",
" voc[word]=1\n",
" print(\"--- %s seconds ---\" % (time.time() - start))\n",
" return voc\n",
"\n",
" def check_coverage(vocab,embeddings_index):\n",
" a = {}\n",
" oov = {}\n",
" k = 0\n",
" i = 0\n",
" for word in (vocab):\n",
" try:\n",
" a[word] = embeddings_index[word]\n",
" k += vocab[word]\n",
" except:\n",
"\n",
" oov[word] = vocab[word]\n",
" i += vocab[word]\n",
" pass\n",
" print('Found embeddings for {:.2%} of vocab'.format(len(a) / len(vocab)))\n",
" print('Found embeddings for {:.2%} of all text'.format(k / (k + i)))\n",
" sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]\n",
" return sorted_x[:10]\n",
"\n",
" def eliminate_punct(sents):\n",
"\n",
" x = str(sents)\n",
" x=x.lower()\n",
" for punct in \"/-'\":\n",
" x = x.replace(punct, ' ')\n",
" for punct in '&':\n",
" x = x.replace(punct, f' {punct} ')\n",
" for punct in '?!.,\"#$%\\'()*+-/:;<=>@[\\\\]^_`{|}~' + '“”’':\n",
" x = x.replace(punct, '')\n",
" return x\n",
"\n",
" def clean_numbers(x):\n",
" import re\n",
" x = re.sub(r'\\d+', '', x)\n",
" return x\n",
"\n",
" def replace_typical_misspell(text):\n",
" for key,value in mispel_d.items():\n",
" text=text.lower().replace(\" +{}+ \".format(key),\" +{}+ \".format(value))\n",
" text=text.lower().replace(\" +{}\".format(key),\" +{}\".format(value))\n",
" text=text.lower().replace(\"{}+ \".format(key),\"{}+ \".format(value)) \n",
" return text \n",
"\n",
" def replace_typical_misspell_(text):\n",
" def replace(match):\n",
" return mispellings[match.group(0)]\n",
" return mispellings_re.sub(replace, text)\n",
"\n",
" def corrector(text):\n",
" mispell_dict={'whatsapp':'social medium','instagram':'social medium', 'snapchat':'social medium','quora':'social medium', 'bitcoin':'online currency','cryptocurrency':'online currency','mbbs':'education','bitsat':'education','srm':'college','cgpa':'exam score','btech':'education','aiims':'education', 'ece':'education','iim':'education','mtech':'education','sbi':'bank','iit':'college','iits':'college','iisc':'college','pilani':'college', 'bits':'college', 'vellore':'vellore', 'comedk':'education', 'nift':'college', 'llb':'education', 'kvpy':'education', 'flipkart':'e-commerce', 'paytm':'e-commerce', 'kiit':'college', 'gmat':'education', 'shopify':'e-commerce', 'fiitjee':'education', 'pgdm':'education', 'wbjee':'education', 'airbnb':'e-commerce', 'thapar':'college','viteee':'education', 'pinterest':'social medium', 'redmi':'mobile phone', 'xiaomi':'mobile phone', 'one plus':'mobile phone', 'nokia':'mobile phone', 'upsc':'education', 'banglore':'bangalore', 'colour':'color','centre':'center','favourite':'favorite','travelling':'traveling','counselling':'counseling','theatre':'theater','cancelled':'canceled','labour':'labor','organisation':'organization',\n",
" 'wwii':'world war 2',\"can't\" : \"cannot\", \"couldn't\" : \"could not\",\"didn't\" : \"did not\", \"doesn't\" : \"does not\",\"don't\" : \"do not\", \"hadn't\" : \"had not\", \"hasn't\" : \"has not\", \"haven't\" : \"have not\", \"he'd\" : \"he would\", \"he'll\" : \"he will\", \"he's\" : \"he is\", \"i'd\" : \"I would\", \"i'd\" : \"I had\", \"i'll\" : \"I will\", \"i'm\" : \"I am\", \"isn't\" : \"is not\", \"it's\" : \"it is\",\"it'll\":\"it will\",\"i've\" : \"I have\", \"let's\" : \"let us\", \"mightn't\" : \"might not\", \"mustn't\" : \"must not\", \"shan't\" : \"shall not\", \"she'd\" : \"she would\", \"she'll\" : \"she will\", \"she's\" : \"she is\", \"shouldn't\" : \"should not\", \"that's\" : \"that is\", \"there's\" : \"there is\", \"they'd\" : \"they would\", \"they'll\" : \"they will\", \"they're\" : \"they are\", \"they've\" : \"they have\", \"we'd\" : \"we would\", \"we're\" : \"we are\",\n",
" \"weren't\" : \"were not\",\n",
" \"we've\" : \"we have\", \"what'll\" : \"what will\", \"what're\" : \"what are\", \"what's\" : \"what is\", \"what've\" : \"what have\", \"where's\" : \"where is\", \"who'd\" : \"who would\", \"who'll\" : \"who will\", \"who're\" : \"who are\", \"who's\" : \"who is\", \"who've\" : \"who have\", \"won't\" : \"will not\", \"wouldn't\" : \"would not\", \"you'd\" : \"you would\", \"you'll\" : \"you will\", \"you're\" : \"you are\", \"you've\" : \"you have\", \"'re\": \" are\", \"wasn't\": \"was not\",\"we'll\":\" will\",\"didn't\": \"did not\",\"tryin'\":\"trying\",\"a\":'',\"to\":'',\"of\":'', \"and\":''}\n",
" sent_array=text.split(\" \")\n",
" new_array=[]\n",
" for word in sent_array:\n",
"\n",
" if word in mispell_dict.keys():\n",
" new_array.append(mispell_dict[word])\n",
" else:\n",
" new_array.append(word)\n",
"\n",
" new_sent=\" \".join(new_array)\n",
" return new_sent"
]
},
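{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick sanity check (an illustrative addition, not part of the modelling flow): the snippet below runs the cleaning helpers on a single made-up question so the effect of each step is visible. The sample sentence is invented purely for demonstration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative only: a made-up question showing what each helper does\n",
"sample = \"Isn't BITSAT tougher than JEE 2019? Please advise & help!\"\n",
"step1 = pre_processing.eliminate_punct(sample)   # lower-case, strip punctuation, pad '&'\n",
"step2 = pre_processing.clean_numbers(step1)      # drop digit runs\n",
"step3 = pre_processing.corrector(step2)          # map known OOV terms to common words\n",
"print(step1)\n",
"print(step2)\n",
"print(step3)"
]
},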
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"embeddings_index = KeyedVectors.load_word2vec_format('..../GoogleNews-vectors-negative300.bin', binary=True)"
]
},
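{
"cell_type": "markdown",
"metadata": {},
"source": [
"Side note (added for convenience): the full GoogleNews binary holds roughly 3M vectors and needs several GB of RAM. While prototyping, gensim's `limit` argument can load only the most frequent vectors; the quick check below just confirms the index answers lookups. The word `hello` and the limit of 500000 are arbitrary choices for illustration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional, for low-memory experiments: load only the most frequent vectors.\n",
"# embeddings_index = KeyedVectors.load_word2vec_format(\n",
"#     '..../GoogleNews-vectors-negative300.bin', binary=True, limit=500000)\n",
"\n",
"# Sanity check that the loaded index answers lookups as expected\n",
"print('hello' in embeddings_index, embeddings_index['hello'].shape)"
]
},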
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"##Pre process"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Baseline: Till now we are covering ~15% of words out of the vocab set, while out of the total vocabulary, our coverage is somewhere around 77%"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train=pd.read_csv('C:/Users/oyo/Downloads/train.csv')\n",
"train_x= train[['question_text']]\n",
"train_y=train['target'].values\n",
"\n",
"vocab= pre_processing.build_vocab(list(train_x['question_text']))\n",
"#Return top 10 words\n",
"pre_processing.check_coverage(vocab,embeddings_index)"
]
},
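{
"cell_type": "markdown",
"metadata": {},
"source": [
"To clarify the two numbers printed above: the first is the share of *unique* words found in the embeddings, the second weights each word by how often it occurs in the questions. The toy example below (a tiny hypothetical vocab and embedding dict, invented for illustration) shows why the two can differ so much."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy illustration (hypothetical data): vocab coverage counts unique words,\n",
"# text coverage weights them by corpus frequency.\n",
"toy_vocab = {'the': 100, 'quora': 5, 'xyzzy': 1}    # word -> frequency in the corpus\n",
"toy_embeddings = {'the': [0.1], 'cat': [0.2]}        # stand-in embedding index\n",
"pre_processing.check_coverage(toy_vocab, toy_embeddings)\n",
"# -> ~33% of vocab (1 of 3 unique words), ~94% of all text (100 of 106 tokens)"
]
},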
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1st step: After eliminating punctutations we are covering ~36% of words out of the vocab set, while out of the total vocabulary, our coverage is somewhere around 88%, which is a huge jump!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_x['question_text']=train_x['question_text'].apply(lambda x: pre_processing.eliminate_punct(x))\n",
"vocab= pre_processing.build_vocab(list(train_x['question_text']))\n",
"pre_processing.check_coverage(vocab,embeddings_index)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2nd step: After eliminating numbers we are covering ~39% of words out of the vocab set which is just mostly due to decrease in quantity of our vocabulary , while out of the total vocabulary coverage has remained constant."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_x['question_text']=train_x['question_text'].apply(lambda x: pre_processing.clean_numbers(x))\n",
"vocab= pre_processing.build_vocab(list(train_x['question_text']))\n",
"pre_processing.check_coverage(vocab,embeddings_index)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3rd step: We have tried to group all similar context words into one unknown word here. Our Coverage from all vocab has witnessed a jump"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_x['question_text']=train_x['question_text'].apply(lambda x: pre_processing.corrector(x))\n",
"vocab= pre_processing.build_vocab(list(train_x['question_text']))\n",
"pre_processing.check_coverage(vocab,embeddings_index)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}