Created
November 1, 2022 09:37
-
-
Save kiarashvosough1999/bdfbe37af56428009b58d15d0ce32a97 to your computer and use it in GitHub Desktop.
IR-HW1.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"collapsed_sections": [], | |
"toc_visible": true, | |
"authorship_tag": "ABX9TyP1YlQCdeLaWsyB2rpgQQqQ", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
}, | |
"accelerator": "GPU", | |
"gpuClass": "standard" | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/kiarashvosough1999/bdfbe37af56428009b58d15d0ce32a97/ir-hw1.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Download Corpus File From Git" | |
], | |
"metadata": { | |
"id": "TaP-OrkukqDJ" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!git clone https://github.com/Text-Mining/Useful-Corpora-for-Text-Mining-in-Persian-Language.git\n", | |
"!unrar x '/content/Useful-Corpora-for-Text-Mining-in-Persian-Language/News/FarsNews 97/farsnews.part01.rar'" | |
], | |
"metadata": { | |
"id": "e2FgsbR2kxfd" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Loading Required Libs" | |
], | |
"metadata": { | |
"id": "4nyHr7mrlN8k" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!pip install hazm" | |
], | |
"metadata": { | |
"id": "NeXr0R3slRhd" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from __future__ import unicode_literals\n", | |
"from unicodedata import normalize\n", | |
"from hazm import word_tokenize\n", | |
"import pandas as pd\n", | |
"import csv\n", | |
"import numpy as np\n", | |
"from hazm import stopwords_list\n", | |
"import json\n", | |
"import codecs\n", | |
"import gzip\n", | |
"import re\n", | |
"import string" | |
], | |
"metadata": { | |
"id": "BJU80mcfmHNK" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Loading Corpus" | |
], | |
"metadata": { | |
"id": "pPvn7v9_lSK6" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"raw_df = pd.read_json(\"/content/farsnews.json\", encoding = 'utf-8-sig', lines=True)\n", | |
"# raw_df.head()" | |
], | |
"metadata": { | |
"id": "RacGiX4MlU1t" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"raw_df = raw_df.astype(str)\n", | |
"raw_df = raw_df.fillna('')" | |
], | |
"metadata": { | |
"id": "Dp9WD51iDD_s" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Extracting Category From One Column to 4 Columns" | |
], | |
"metadata": { | |
"id": "S7sLdJKSQ8sw" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"def extract_en_cat(row):\n", | |
" if type(row['CategoryPanel']) is not list: return ''\n", | |
" return row['CategoryPanel'][1]['CategoryEn']" | |
], | |
"metadata": { | |
"id": "ZBk6ktIv7lTs" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"def extract_farsi_cat(row):\n", | |
" if type(row['CategoryPanel']) is not list: return ''\n", | |
" return row['CategoryPanel'][1]['CategoryFa']" | |
], | |
"metadata": { | |
"id": "CgkuftIA5Poh" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"raw_df[\"first category fa\"] = raw_df.apply(extract_farsi_cat, axis=1)\n", | |
"raw_df[\"first category en\"] = raw_df.apply(extract_en_cat, axis=1)" | |
], | |
"metadata": { | |
"id": "1MG8BlTj8Nqg" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"raw_df[\"second category fa\"] = raw_df.apply(extract_farsi_cat, axis=1)\n", | |
"raw_df[\"second category en\"] = raw_df.apply(extract_en_cat, axis=1)" | |
], | |
"metadata": { | |
"id": "V43JBUsf-9i-" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Cleaning Data" | |
], | |
"metadata": { | |
"id": "TyngyAnWQ5Mx" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"columns_to_drop = ['CategoryPanel', 'GetComments', 'CategoryEn', 'CategoryFa', 'CommentsJsonArray']\n", | |
"for dr in columns_to_drop:\n", | |
" raw_df.drop(dr, axis=1, inplace=True)" | |
], | |
"metadata": { | |
"id": "Pf3nRXW82hD_" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"new_df = pd.DataFrame() # defin enew df for cleaned data" | |
], | |
"metadata": { | |
"id": "-Fd8BLjvFHE0" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"def english_cleaner(row):\n", | |
" speceficChars = ['،', '؛', ':', '(', ')', '«', '»', '…' , '!' , '!!' , '!؟' , '!!!' , '\"' , '؟' , ',' , '+' , '–' , '؟!' , '*' , '-']\n", | |
" text = re.sub('\\r', ' ', row)\n", | |
" text = re.sub('\\n', ' ', text)\n", | |
" text = re.sub('[' + ''.join(speceficChars) + ']', ' ', text)\n", | |
" text = text.lower()\n", | |
" text = text.translate(str.maketrans('', '', string.punctuation))\n", | |
" text = re.sub('\\u200c', ' ', text)\n", | |
" return text\n", | |
"\n", | |
"columns_to_clean = ['NewsSummary', 'NewsTitle', 'NewsDate', 'NewsBody']\n", | |
"for col in columns_to_clean:\n", | |
" new_df[col] = raw_df[col].apply(english_cleaner)" | |
], | |
"metadata": { | |
"id": "5TDgBqpaBLBc" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
" new_df = new_df.astype(str)" | |
], | |
"metadata": { | |
"id": "nXD3FBZXYeAu" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"sample_df_5000 = new_df.sample(n=5000)" | |
], | |
"metadata": { | |
"id": "OQzILPlsYoci" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"def remove_stopwords(text): # hazm\n", | |
" sw = stopwords_list()\n", | |
" tokenized_text = word_tokenize(text)\n", | |
" filtered_words = list(\n", | |
" filter(\n", | |
" lambda token: True if token not in sw and len(token) > 1 and token else False,\n", | |
" tokenized_text\n", | |
" )\n", | |
" )\n", | |
" return ' '.join(filtered_words) " | |
], | |
"metadata": { | |
"id": "mUbuJtI0Rj0_" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"columns_to_clean = ['NewsSummary', 'NewsTitle', 'NewsBody']\n", | |
"for col in columns_to_clean:\n", | |
" sample_df_5000[col] = sample_df_5000[col].apply(remove_stopwords)" | |
], | |
"metadata": { | |
"id": "_Vbpi0EyTG89" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment