kiarashvosough1999/ir-hw1.ipynb

## ir-hw1.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "collapsed_sections": [],
      "toc_visible": true,
      "authorship_tag": "ABX9TyP1YlQCdeLaWsyB2rpgQQqQ",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU",
    "gpuClass": "standard"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/kiarashvosough1999/bdfbe37af56428009b58d15d0ce32a97/ir-hw1.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Download Corpus File From Git"
      ],
      "metadata": {
        "id": "TaP-OrkukqDJ"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Shallow clone: only the latest snapshot of the corpus repository is needed.\n",
        "!git clone --depth 1 https://github.com/Text-Mining/Useful-Corpora-for-Text-Mining-in-Persian-Language.git\n",
        "# -o+ overwrites files from an earlier extraction instead of asking what to do on a re-run.\n",
        "!unrar x -o+ '/content/Useful-Corpora-for-Text-Mining-in-Persian-Language/News/FarsNews 97/farsnews.part01.rar'"
      ],
      "metadata": {
        "id": "e2FgsbR2kxfd"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Loading Required Libs"
      ],
      "metadata": {
        "id": "4nyHr7mrlN8k"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# %pip installs into the environment of the running kernel.\n",
        "%pip install hazm"
      ],
      "metadata": {
        "id": "NeXr0R3slRhd"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from __future__ import unicode_literals\n",
        "from unicodedata import normalize\n",
        "from hazm import word_tokenize\n",
        "import pandas as pd\n",
        "import csv\n",
        "import numpy as np\n",
        "from hazm import stopwords_list\n",
        "import json\n",
        "import codecs\n",
        "import gzip\n",
        "import re\n",
        "import string"
      ],
      "metadata": {
        "id": "BJU80mcfmHNK"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Loading Corpus"
      ],
      "metadata": {
        "id": "pPvn7v9_lSK6"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Load the extracted FarsNews dump; lines=True reads the file as one JSON object per line.\n",
        "raw_df = pd.read_json(\n",
        "    \"/content/farsnews.json\",\n",
        "    lines=True,\n",
        "    encoding='utf-8-sig',\n",
        ")\n",
        "# raw_df.head()"
      ],
      "metadata": {
        "id": "RacGiX4MlU1t"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Fill missing values before the string conversion: astype(str) turns NaN into the text 'nan',\n",
        "# after which fillna('') has nothing left to fill.\n",
        "# CategoryPanel is excluded so it keeps its list values for the category extraction below.\n",
        "text_columns = [name for name in raw_df.columns if name != 'CategoryPanel']\n",
        "raw_df[text_columns] = raw_df[text_columns].astype(object).fillna('').astype(str)"
      ],
      "metadata": {
        "id": "Dp9WD51iDD_s"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Extracting Category From One Column to 4 Columns"
      ],
      "metadata": {
        "id": "S7sLdJKSQ8sw"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def extract_en_cat(row, index=1):\n",
        "  \"\"\"Return the 'CategoryEn' value of entry `index` in row['CategoryPanel'].\n",
        "\n",
        "  Returns '' when CategoryPanel is not a list or has no entry at `index`.\n",
        "  The default index of 1 is the entry this function read before it took an index.\n",
        "  \"\"\"\n",
        "  panel = row['CategoryPanel']\n",
        "  if not isinstance(panel, list) or not 0 <= index < len(panel): return ''\n",
        "  return panel[index]['CategoryEn']"
      ],
      "metadata": {
        "id": "ZBk6ktIv7lTs"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def extract_farsi_cat(row, index=1):\n",
        "  \"\"\"Return the 'CategoryFa' value of entry `index` in row['CategoryPanel'].\n",
        "\n",
        "  Returns '' when CategoryPanel is not a list or has no entry at `index`.\n",
        "  The default index of 1 is the entry this function read before it took an index.\n",
        "  \"\"\"\n",
        "  panel = row['CategoryPanel']\n",
        "  if not isinstance(panel, list) or not 0 <= index < len(panel): return ''\n",
        "  return panel[index]['CategoryFa']"
      ],
      "metadata": {
        "id": "CgkuftIA5Poh"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# NOTE(review): assumes the first category is entry 0 of CategoryPanel - confirm against the data.\n",
        "raw_df[\"first category fa\"] = raw_df.apply(extract_farsi_cat, axis=1, args=(0,))\n",
        "raw_df[\"first category en\"] = raw_df.apply(extract_en_cat, axis=1, args=(0,))"
      ],
      "metadata": {
        "id": "1MG8BlTj8Nqg"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# The second category is entry 1 of CategoryPanel.\n",
        "raw_df[\"second category fa\"] = raw_df.apply(extract_farsi_cat, axis=1, args=(1,))\n",
        "raw_df[\"second category en\"] = raw_df.apply(extract_en_cat, axis=1, args=(1,))"
      ],
      "metadata": {
        "id": "V43JBUsf-9i-"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Cleaning Data"
      ],
      "metadata": {
        "id": "TyngyAnWQ5Mx"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Drop the raw category/comment columns in a single call instead of mutating the frame in place per column.\n",
        "columns_to_drop = ['CategoryPanel', 'GetComments', 'CategoryEn', 'CategoryFa', 'CommentsJsonArray']\n",
        "raw_df = raw_df.drop(columns=columns_to_drop)"
      ],
      "metadata": {
        "id": "Pf3nRXW82hD_"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "new_df = pd.DataFrame()  # define a new DataFrame for the cleaned data"
      ],
      "metadata": {
        "id": "-Fd8BLjvFHE0"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Characters that become a space: line breaks, the zero-width non-joiner and the listed punctuation marks.\n",
        "SPACE_CHARS = ['\\r', '\\n', '\\u200c', '،', '؛', ':', '(', ')', '«', '»', '…', '!', '\"', '؟', ',', '+', '–', '*', '-']\n",
        "# Single translation table: ASCII punctuation is deleted, except the characters above, which map to a space.\n",
        "CLEAN_TABLE = dict.fromkeys(map(ord, string.punctuation))\n",
        "CLEAN_TABLE.update(dict.fromkeys(map(ord, SPACE_CHARS), ' '))\n",
        "\n",
        "def english_cleaner(row):\n",
        "  \"\"\"Clean one text value; despite its name, `row` is a single string, not a DataFrame row.\n",
        "\n",
        "  Line breaks, the zero-width non-joiner and the marks in SPACE_CHARS become spaces,\n",
        "  every other ASCII punctuation character is removed and the text is lower-cased.\n",
        "  \"\"\"\n",
        "  return row.translate(CLEAN_TABLE).lower()\n",
        "\n",
        "# NOTE(review): NewsDate goes through the same cleaner, so punctuation inside dates is stripped too - confirm this is intended.\n",
        "columns_to_clean = ['NewsSummary', 'NewsTitle', 'NewsDate', 'NewsBody']\n",
        "for col in columns_to_clean:\n",
        "  new_df[col] = raw_df[col].apply(english_cleaner)"
      ],
      "metadata": {
        "id": "5TDgBqpaBLBc"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Make sure every cleaned column holds strings before sampling and tokenising.\n",
        "new_df = new_df.astype(str)"
      ],
      "metadata": {
        "id": "nXD3FBZXYeAu"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Fixed seed so the same 5000 articles are drawn on every run.\n",
        "sample_df_5000 = new_df.sample(n=5000, random_state=42)"
      ],
      "metadata": {
        "id": "OQzILPlsYoci"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Load hazm's stop-word list once instead of on every call; a set makes each lookup a hash test.\n",
        "STOPWORDS = frozenset(stopwords_list())\n",
        "\n",
        "def remove_stopwords(text): # hazm\n",
        "  \"\"\"Tokenise `text` with hazm's word_tokenize and drop stop words and tokens shorter than two characters.\n",
        "\n",
        "  Returns the remaining tokens joined by single spaces.\n",
        "  \"\"\"\n",
        "  kept = (token for token in word_tokenize(text) if len(token) > 1 and token not in STOPWORDS)\n",
        "  return ' '.join(kept)"
      ],
      "metadata": {
        "id": "mUbuJtI0Rj0_"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Strip stop words from the free-text columns of the 5000-row sample.\n",
        "columns_to_clean = ['NewsSummary', 'NewsTitle', 'NewsBody']\n",
        "for text_column in columns_to_clean:\n",
        "  stripped = sample_df_5000[text_column].apply(remove_stopwords)\n",
        "  sample_df_5000[text_column] = stripped"
      ],
      "metadata": {
        "id": "_Vbpi0EyTG89"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"provenance": [],
	"collapsed_sections": [],
	"toc_visible": true,
	"authorship_tag": "ABX9TyP1YlQCdeLaWsyB2rpgQQqQ",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	},
	"accelerator": "GPU",
	"gpuClass": "standard"
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/kiarashvosough1999/bdfbe37af56428009b58d15d0ce32a97/ir-hw1.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"# Download Corpus File From Git"
	],
	"metadata": {
	"id": "TaP-OrkukqDJ"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"# Shallow clone: only the latest snapshot of the corpus repository is needed.\n",
	"!git clone --depth 1 https://github.com/Text-Mining/Useful-Corpora-for-Text-Mining-in-Persian-Language.git\n",
	"# -o+ overwrites files from an earlier extraction instead of asking what to do on a re-run.\n",
	"!unrar x -o+ '/content/Useful-Corpora-for-Text-Mining-in-Persian-Language/News/FarsNews 97/farsnews.part01.rar'"
	],
	"metadata": {
	"id": "e2FgsbR2kxfd"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"# Loading Required Libs"
	],
	"metadata": {
	"id": "4nyHr7mrlN8k"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"# %pip installs into the environment of the running kernel.\n",
	"%pip install hazm"
	],
	"metadata": {
	"id": "NeXr0R3slRhd"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"from __future__ import unicode_literals\n",
	"from unicodedata import normalize\n",
	"from hazm import word_tokenize\n",
	"import pandas as pd\n",
	"import csv\n",
	"import numpy as np\n",
	"from hazm import stopwords_list\n",
	"import json\n",
	"import codecs\n",
	"import gzip\n",
	"import re\n",
	"import string"
	],
	"metadata": {
	"id": "BJU80mcfmHNK"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"# Loading Corpus"
	],
	"metadata": {
	"id": "pPvn7v9_lSK6"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"# Load the extracted FarsNews dump; lines=True reads the file as one JSON object per line.\n",
	"raw_df = pd.read_json(\n",
	"    \"/content/farsnews.json\",\n",
	"    lines=True,\n",
	"    encoding='utf-8-sig',\n",
	")\n",
	"# raw_df.head()"
	],
	"metadata": {
	"id": "RacGiX4MlU1t"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Fill missing values before the string conversion: astype(str) turns NaN into the text 'nan',\n",
	"# after which fillna('') has nothing left to fill.\n",
	"# CategoryPanel is excluded so it keeps its list values for the category extraction below.\n",
	"text_columns = [name for name in raw_df.columns if name != 'CategoryPanel']\n",
	"raw_df[text_columns] = raw_df[text_columns].astype(object).fillna('').astype(str)"
	],
	"metadata": {
	"id": "Dp9WD51iDD_s"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"# Extracting Category From One Column to 4 Columns"
	],
	"metadata": {
	"id": "S7sLdJKSQ8sw"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"def extract_en_cat(row, index=1):\n",
	"  \"\"\"Return the 'CategoryEn' value of entry `index` in row['CategoryPanel'].\n",
	"\n",
	"  Returns '' when CategoryPanel is not a list or has no entry at `index`.\n",
	"  The default index of 1 is the entry this function read before it took an index.\n",
	"  \"\"\"\n",
	"  panel = row['CategoryPanel']\n",
	"  if not isinstance(panel, list) or not 0 <= index < len(panel): return ''\n",
	"  return panel[index]['CategoryEn']"
	],
	"metadata": {
	"id": "ZBk6ktIv7lTs"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"def extract_farsi_cat(row, index=1):\n",
	"  \"\"\"Return the 'CategoryFa' value of entry `index` in row['CategoryPanel'].\n",
	"\n",
	"  Returns '' when CategoryPanel is not a list or has no entry at `index`.\n",
	"  The default index of 1 is the entry this function read before it took an index.\n",
	"  \"\"\"\n",
	"  panel = row['CategoryPanel']\n",
	"  if not isinstance(panel, list) or not 0 <= index < len(panel): return ''\n",
	"  return panel[index]['CategoryFa']"
	],
	"metadata": {
	"id": "CgkuftIA5Poh"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# NOTE(review): assumes the first category is entry 0 of CategoryPanel - confirm against the data.\n",
	"raw_df[\"first category fa\"] = raw_df.apply(extract_farsi_cat, axis=1, args=(0,))\n",
	"raw_df[\"first category en\"] = raw_df.apply(extract_en_cat, axis=1, args=(0,))"
	],
	"metadata": {
	"id": "1MG8BlTj8Nqg"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# The second category is entry 1 of CategoryPanel.\n",
	"raw_df[\"second category fa\"] = raw_df.apply(extract_farsi_cat, axis=1, args=(1,))\n",
	"raw_df[\"second category en\"] = raw_df.apply(extract_en_cat, axis=1, args=(1,))"
	],
	"metadata": {
	"id": "V43JBUsf-9i-"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"# Cleaning Data"
	],
	"metadata": {
	"id": "TyngyAnWQ5Mx"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"# Drop the raw category/comment columns in a single call instead of mutating the frame in place per column.\n",
	"columns_to_drop = ['CategoryPanel', 'GetComments', 'CategoryEn', 'CategoryFa', 'CommentsJsonArray']\n",
	"raw_df = raw_df.drop(columns=columns_to_drop)"
	],
	"metadata": {
	"id": "Pf3nRXW82hD_"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"new_df = pd.DataFrame()  # define a new DataFrame for the cleaned data"
	],
	"metadata": {
	"id": "-Fd8BLjvFHE0"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Characters that become a space: line breaks, the zero-width non-joiner and the listed punctuation marks.\n",
	"SPACE_CHARS = ['\\r', '\\n', '\\u200c', '،', '؛', ':', '(', ')', '«', '»', '…', '!', '\"', '؟', ',', '+', '–', '*', '-']\n",
	"# Single translation table: ASCII punctuation is deleted, except the characters above, which map to a space.\n",
	"CLEAN_TABLE = dict.fromkeys(map(ord, string.punctuation))\n",
	"CLEAN_TABLE.update(dict.fromkeys(map(ord, SPACE_CHARS), ' '))\n",
	"\n",
	"def english_cleaner(row):\n",
	"  \"\"\"Clean one text value; despite its name, `row` is a single string, not a DataFrame row.\n",
	"\n",
	"  Line breaks, the zero-width non-joiner and the marks in SPACE_CHARS become spaces,\n",
	"  every other ASCII punctuation character is removed and the text is lower-cased.\n",
	"  \"\"\"\n",
	"  return row.translate(CLEAN_TABLE).lower()\n",
	"\n",
	"# NOTE(review): NewsDate goes through the same cleaner, so punctuation inside dates is stripped too - confirm this is intended.\n",
	"columns_to_clean = ['NewsSummary', 'NewsTitle', 'NewsDate', 'NewsBody']\n",
	"for col in columns_to_clean:\n",
	"  new_df[col] = raw_df[col].apply(english_cleaner)"
	],
	"metadata": {
	"id": "5TDgBqpaBLBc"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Make sure every cleaned column holds strings before sampling and tokenising.\n",
	"new_df = new_df.astype(str)"
	],
	"metadata": {
	"id": "nXD3FBZXYeAu"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Fixed seed so the same 5000 articles are drawn on every run.\n",
	"sample_df_5000 = new_df.sample(n=5000, random_state=42)"
	],
	"metadata": {
	"id": "OQzILPlsYoci"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Load hazm's stop-word list once instead of on every call; a set makes each lookup a hash test.\n",
	"STOPWORDS = frozenset(stopwords_list())\n",
	"\n",
	"def remove_stopwords(text): # hazm\n",
	"  \"\"\"Tokenise `text` with hazm's word_tokenize and drop stop words and tokens shorter than two characters.\n",
	"\n",
	"  Returns the remaining tokens joined by single spaces.\n",
	"  \"\"\"\n",
	"  kept = (token for token in word_tokenize(text) if len(token) > 1 and token not in STOPWORDS)\n",
	"  return ' '.join(kept)"
	],
	"metadata": {
	"id": "mUbuJtI0Rj0_"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Strip stop words from the free-text columns of the 5000-row sample.\n",
	"columns_to_clean = ['NewsSummary', 'NewsTitle', 'NewsBody']\n",
	"for text_column in columns_to_clean:\n",
	"  stripped = sample_df_5000[text_column].apply(remove_stopwords)\n",
	"  sample_df_5000[text_column] = stripped"
	],
	"metadata": {
	"id": "_Vbpi0EyTG89"
	},
	"execution_count": null,
	"outputs": []
	}
	]
	}