@AMR-KELEG
Created February 26, 2021 14:35
Remove Arabizi stopwords using transliteration
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Remove Arabizi stopwords\n",
"- This notebook is based on my repository https://github.com/AMR-KELEG/Franco-Arabic-Transliterator\n",
" - The repository is still experimental so your feedback is much appreciated\n",
"- The idea of this notebook is to transliterate Arabizi tokens into Arabic, match them with an Arabic stopwords list and drop the Arabizi tokens whose transliterations are found in the Arabic stopwords list\n",
"- The effect of fitlering stopwords using this technique wasn't tested so I would really appreciate if you can let me know if it's useful or not and how to improve it"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Install the package\n",
"! pip install -U https://github.com/AMR-KELEG/Franco-Arabic-Transliterator/archive/support-tunisian.zip"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import logging\n",
"\n",
"logging.basicConfig()\n",
"logging.getLogger().setLevel(logging.ERROR)\n",
"from franco_arabic_transliterator.franco_arabic_transliterator import FrancoArabicTransliterator\n",
"\n",
"def preprocess_token(s):\n",
" # Shorten repeated characters to avoid slow transliteration speed\n",
" s = re.sub(r\"(\\w)(\\1){2,}\", r\"\\1\\1\", s)\n",
" s = re.sub(r\"(?:(\\w)(\\w))(\\1\\2){2,}\", r\"\\1\\2\\1\\2\\1\\2\", s)\n",
" s = re.sub(r\"(?:(\\w)(\\w)(\\w))(\\1\\2\\3){2,}\", r\"\\1\\2\\3\\1\\2\\3\", s)\n",
" # Normalize laughing token\n",
" s = re.sub(r\"h[ha]{3,}\", \"hahaha\", s)\n",
" # Truncate the token to max length of 10 characters\n",
" return s[:10]\n",
"\n",
"def remove_stopwords_from_sentence(arabizi_sentence):\n",
" # Perform whitespace tokenization\n",
" tokens = arabizi_sentence.split()\n",
" \n",
" # Preprocess token to avoid slow transliteration\n",
" preprocessed_tokens = [preprocess_token(t) for t in tokens]\n",
" \n",
" # Transliterate Arabizi tokens into Arabic\n",
" transliterated_tokens = [\n",
" transliterator.transliterate(t, method=\"lexicon\") for t in preprocessed_tokens]\n",
" # Drop stopwords\n",
" non_stopword_tokens = [token if transliterated_token not in arabic_stopwords else \"\" for token, transliterated_token in zip(tokens, transliterated_tokens)]\n",
" return \" \".join([token for token in non_stopword_tokens if token])\n",
"\n",
"transliterator = FrancoArabicTransliterator()"
]
},
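{
"cell_type": "markdown",
"metadata": {},
"source": [
"Transliteration is the slow part of this pipeline. The next cell is a minimal optional sketch (not part of the original pipeline) that memoizes the transliteration call with `functools.lru_cache`; `cached_transliterate` is a hypothetical helper, and the speed-up depends on how often tokens repeat in your data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch (assumption, not in the original notebook): memoize\n",
"# transliteration so that repeated tokens are only transliterated once.\n",
"from functools import lru_cache\n",
"\n",
"@lru_cache(maxsize=None)\n",
"def cached_transliterate(token):\n",
"    # Same call used inside remove_stopwords_from_sentence, wrapped in a cache\n",
"    return transliterator.transliterate(token, method=\"lexicon\")\n",
"\n",
"# To use it, replace transliterator.transliterate(t, method=\"lexicon\")\n",
"# with cached_transliterate(t) inside remove_stopwords_from_sentence."
]
},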
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Add the stopwords that you would like to use here\n",
"# Add both normalized and unnormalized versions of each token to ensure that they are dropped\n",
"# i.e: أنت and انت should both be included in the list\n",
"arabic_stopwords = ['و', 'في', 'يا', 'انت', 'أنت']"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'shnoa ragol ?'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"remove_stopwords_from_sentence(\"shnoa ente ya ragol ?\")"
]
},
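{
"cell_type": "markdown",
"metadata": {},
"source": [
"The optional cell below makes the intermediate step from the introduction visible (Arabizi token → Arabic transliteration → stopword check) by printing the transliteration of each token of the example sentence; it is only an inspection sketch and its output is not shown here."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: inspect the intermediate transliterations for the example sentence\n",
"example = \"shnoa ente ya ragol ?\"\n",
"for token in example.split():\n",
"    arabic = transliterator.transliterate(preprocess_token(token), method=\"lexicon\")\n",
"    print(token, \"->\", arabic, \"| stopword:\", arabic in arabic_stopwords)"
]
},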
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Apply the function to the training dataset\n",
"- Doing so takes from 20 to 40 minutes so I advise you to save a preprocessed version of the data on disk to avoid rerunning the function each time you are training a new model"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from tqdm import tqdm\n",
"tqdm.pandas()\n",
"\n",
"train_df = pd.read_csv(\"data/Train.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 70000/70000 [23:05<00:00, 50.52it/s] \n"
]
}
],
"source": [
"train_df[\"preprocessed_text\"] = train_df[\"text\"].progress_apply(remove_stopwords_from_sentence)"
]
},
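{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch for persisting the preprocessed data so the long run above does not have to be repeated; the output path `data/Train_preprocessed.csv` is an arbitrary choice."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Save the preprocessed data to disk (the path is an arbitrary choice)\n",
"train_df.to_csv(\"data/Train_preprocessed.csv\", index=False)\n",
"\n",
"# Later runs can reload it directly instead of recomputing:\n",
"# train_df = pd.read_csv(\"data/Train_preprocessed.csv\")"
]
},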
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>preprocessed_text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>12633</th>\n",
" <td>allah 3lik ya melik</td>\n",
" <td>allah 3lik melik</td>\n",
" </tr>\n",
" <tr>\n",
" <th>69998</th>\n",
" <td>fachel enta w houwa</td>\n",
" <td>fachel houwa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24329</th>\n",
" <td>8 min london</td>\n",
" <td>8 min london</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text preprocessed_text\n",
"12633 allah 3lik ya melik allah 3lik melik\n",
"69998 fachel enta w houwa fachel houwa\n",
"24329 8 min london 8 min london"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df[[\"text\", \"preprocessed_text\"]].loc[[12633, 69998, 24329]]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@Adeyinka-hub

Great work. I will try my hand at this with my classification model. I will definitely give you feedback on it.

@kingabzpro

I was craving this for a long time.
