{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"collapsed_sections": [],
"toc_visible": true,
"authorship_tag": "ABX9TyP1YlQCdeLaWsyB2rpgQQqQ",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU",
"gpuClass": "standard"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/kiarashvosough1999/bdfbe37af56428009b58d15d0ce32a97/ir-hw1.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"# Download the Corpus File from GitHub"
],
"metadata": {
"id": "TaP-OrkukqDJ"
}
},
{
"cell_type": "code",
"source": [
"!git clone https://github.com/Text-Mining/Useful-Corpora-for-Text-Mining-in-Persian-Language.git\n",
"!unrar x '/content/Useful-Corpora-for-Text-Mining-in-Persian-Language/News/FarsNews 97/farsnews.part01.rar'"
],
"metadata": {
"id": "e2FgsbR2kxfd"
},
"execution_count": null,
"outputs": []
},
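{
"cell_type": "code",
"source": [
"# quick sanity check: confirm the archive extracted farsnews.json into /content,\n",
"# which is the path the loading step below expects\n",
"!ls -lh /content/farsnews.json"
],
"metadata": {},
"execution_count": null,
"outputs": []
},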
{
"cell_type": "markdown",
"source": [
"# Loading Required Libs"
],
"metadata": {
"id": "4nyHr7mrlN8k"
}
},
{
"cell_type": "code",
"source": [
"!pip install hazm"
],
"metadata": {
"id": "NeXr0R3slRhd"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from __future__ import unicode_literals\n",
"from unicodedata import normalize\n",
"from hazm import word_tokenize\n",
"import pandas as pd\n",
"import csv\n",
"import numpy as np\n",
"from hazm import stopwords_list\n",
"import json\n",
"import codecs\n",
"import gzip\n",
"import re\n",
"import string"
],
"metadata": {
"id": "BJU80mcfmHNK"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Loading Corpus"
],
"metadata": {
"id": "pPvn7v9_lSK6"
}
},
{
"cell_type": "code",
"source": [
"raw_df = pd.read_json(\"/content/farsnews.json\", encoding = 'utf-8-sig', lines=True)\n",
"# raw_df.head()"
],
"metadata": {
"id": "RacGiX4MlU1t"
},
"execution_count": null,
"outputs": []
},
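{
"cell_type": "code",
"source": [
"# quick sanity check: shape and column names of the corpus before any cleaning\n",
"print(raw_df.shape)\n",
"print(raw_df.columns.tolist())"
],
"metadata": {},
"execution_count": null,
"outputs": []
},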
{
"cell_type": "code",
"source": [
"# fill missing values before any string casting: astype(str) at this point would\n",
"# turn NaN into the literal string 'nan' (making fillna a no-op) and would turn\n",
"# the CategoryPanel lists into plain strings, breaking the extractors below\n",
"raw_df = raw_df.fillna('')"
],
"metadata": {
"id": "Dp9WD51iDD_s"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# Extracting Categories from One Column into Four Columns"
],
"metadata": {
"id": "S7sLdJKSQ8sw"
}
},
{
"cell_type": "code",
"source": [
"def extract_en_cat(row, idx=1):\n",
"    # guard against rows whose CategoryPanel is missing or shorter than expected\n",
"    panel = row['CategoryPanel']\n",
"    if type(panel) is not list or len(panel) <= idx: return ''\n",
"    return panel[idx]['CategoryEn']"
],
"metadata": {
"id": "ZBk6ktIv7lTs"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def extract_farsi_cat(row, idx=1):\n",
"    # guard against rows whose CategoryPanel is missing or shorter than expected\n",
"    panel = row['CategoryPanel']\n",
"    if type(panel) is not list or len(panel) <= idx: return ''\n",
"    return panel[idx]['CategoryFa']"
],
"metadata": {
"id": "CgkuftIA5Poh"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"raw_df[\"first category fa\"] = raw_df.apply(extract_farsi_cat, axis=1, idx=1)\n",
"raw_df[\"first category en\"] = raw_df.apply(extract_en_cat, axis=1, idx=1)"
],
"metadata": {
"id": "1MG8BlTj8Nqg"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# assumption: the first category level sits at CategoryPanel index 1 and the\n",
"# second at index 2; adjust idx if the panel is structured differently\n",
"raw_df[\"second category fa\"] = raw_df.apply(extract_farsi_cat, axis=1, idx=2)\n",
"raw_df[\"second category en\"] = raw_df.apply(extract_en_cat, axis=1, idx=2)"
],
"metadata": {
"id": "V43JBUsf-9i-"
},
"execution_count": null,
"outputs": []
},
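{
"cell_type": "code",
"source": [
"# quick sanity check: the most frequent extracted categories; if either column is\n",
"# mostly empty strings, the assumed CategoryPanel indices need adjusting\n",
"print(raw_df['first category en'].value_counts().head(10))\n",
"print(raw_df['second category en'].value_counts().head(10))"
],
"metadata": {},
"execution_count": null,
"outputs": []
},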
{
"cell_type": "markdown",
"source": [
"# Cleaning Data"
],
"metadata": {
"id": "TyngyAnWQ5Mx"
}
},
{
"cell_type": "code",
"source": [
"columns_to_drop = ['CategoryPanel', 'GetComments', 'CategoryEn', 'CategoryFa', 'CommentsJsonArray']\n",
"raw_df.drop(columns=columns_to_drop, inplace=True)"
],
"metadata": {
"id": "Pf3nRXW82hD_"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"new_df = pd.DataFrame() # define a new DataFrame for the cleaned data"
],
"metadata": {
"id": "-Fd8BLjvFHE0"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def english_cleaner(row):\n",
"    # punctuation to strip: Persian marks plus a few symbols not covered by string.punctuation\n",
"    specific_chars = ['،', '؛', ':', '(', ')', '«', '»', '…', '!', '\"', '؟', ',', '+', '–', '*', '-']\n",
"    text = re.sub('\\r', ' ', str(row))  # str() guards against non-string values in the raw columns\n",
"    text = re.sub('\\n', ' ', text)\n",
"    # re.escape keeps metacharacters such as ( ) * + - literal inside the character class\n",
"    text = re.sub('[' + re.escape(''.join(specific_chars)) + ']', ' ', text)\n",
"    text = text.lower()\n",
"    text = text.translate(str.maketrans('', '', string.punctuation))\n",
"    text = re.sub('\\u200c', ' ', text)  # replace the zero-width non-joiner with a space\n",
"    return text\n",
"\n",
"columns_to_clean = ['NewsSummary', 'NewsTitle', 'NewsDate', 'NewsBody']\n",
"for col in columns_to_clean:\n",
"    new_df[col] = raw_df[col].apply(english_cleaner)"
],
"metadata": {
"id": "5TDgBqpaBLBc"
},
"execution_count": null,
"outputs": []
},
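{
"cell_type": "code",
"source": [
"# minimal demonstration of english_cleaner on a made-up mixed string: it should\n",
"# lowercase the Latin text, strip punctuation, and replace the zero-width non-joiner\n",
"print(english_cleaner('Hello, World! این یک «آزمایش» می\\u200cباشد'))"
],
"metadata": {},
"execution_count": null,
"outputs": []
},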
{
"cell_type": "code",
"source": [
"new_df = new_df.astype(str)"
],
"metadata": {
"id": "nXD3FBZXYeAu"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"sample_df_5000 = new_df.sample(n=5000)"
],
"metadata": {
"id": "OQzILPlsYoci"
},
"execution_count": null,
"outputs": []
},
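{
"cell_type": "code",
"source": [
"# note: .sample() draws a different subset on every run; pass a fixed random_state\n",
"# (e.g. new_df.sample(n=5000, random_state=42)) when reproducibility matters\n",
"print(sample_df_5000.shape)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},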
{
"cell_type": "code",
"source": [
"# build the hazm stopword set once; set membership tests are O(1)\n",
"sw = set(stopwords_list())\n",
"\n",
"def remove_stopwords(text):\n",
"    tokenized_text = word_tokenize(text)\n",
"    # keep tokens that are not stopwords and are longer than one character\n",
"    filtered_words = [token for token in tokenized_text if token not in sw and len(token) > 1]\n",
"    return ' '.join(filtered_words)"
],
"metadata": {
"id": "mUbuJtI0Rj0_"
},
"execution_count": null,
"outputs": []
},
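{
"cell_type": "code",
"source": [
"# minimal demonstration on a made-up sentence: common Persian stopwords such as\n",
"# 'این' and 'است' should be removed, along with single-character tokens\n",
"print(remove_stopwords('این یک متن نمونه برای آزمایش است'))"
],
"metadata": {},
"execution_count": null,
"outputs": []
},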
{
"cell_type": "code",
"source": [
"columns_to_clean = ['NewsSummary', 'NewsTitle', 'NewsBody']\n",
"for col in columns_to_clean:\n",
"    sample_df_5000[col] = sample_df_5000[col].apply(remove_stopwords)"
],
"metadata": {
"id": "_Vbpi0EyTG89"
},
"execution_count": null,
"outputs": []
}
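,
{
"cell_type": "code",
"source": [
"# optional final step, a minimal sketch: persist the cleaned 5000-row sample so\n",
"# later steps can reload it without re-cleaning; the output path is an assumption\n",
"sample_df_5000.to_csv('/content/farsnews_cleaned_5000.csv', index=False)"
],
"metadata": {},
"execution_count": null,
"outputs": []
}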
]
}