Last active
January 28, 2022 13:21
-
-
Save Hegghammer/f6b10677a03416642caae6426912eed9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "4ea1b8d9-a658-4281-902f-0015b9b1012e", | |
"metadata": {}, | |
"source": [ | |
"# Arabic text preprocessing with Camel Tools and parallelization\n", | |
"\n", | |
"A basic NLP preprocessing pipeline for Arabic text that:\n", | |
"\n", | |
"- removes punctuation and symbols\n", | |
"- removes newline characters\n", | |
"- removes latin characters\n", | |
"- removes numbers\n", | |
"- removes extra whitespaces\n", | |
"- normalizes unicode characters\n", | |
"- converts extended Arabic letters into basic ones\n", | |
"- normalizes alif maksuras and ta marbutas\n", | |
"- removes diacritics\n", | |
"- removes stopwords\n", | |
"- lemmatizes" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "4be96887-fb3b-4fe8-855f-f0546bfd9b74", | |
"metadata": {}, | |
"source": [ | |
"## Load libraries" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"id": "3860c01d-6248-49d5-b32a-473f458ca946", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import regex as re\n", | |
"import string\n", | |
"import nltk\n", | |
"import requests\n", | |
"from camel_tools.utils.normalize import normalize_unicode\n", | |
"from camel_tools.utils.charmap import CharMapper\n", | |
"from camel_tools.utils.charsets import UNICODE_PUNCT_SYMBOL_CHARSET\n", | |
"from camel_tools.utils.dediac import dediac_ar\n", | |
"from camel_tools.utils.normalize import normalize_alef_maksura_ar\n", | |
"from camel_tools.utils.normalize import normalize_alef_ar\n", | |
"from camel_tools.utils.normalize import normalize_teh_marbuta_ar\n", | |
"from camel_tools.disambig.mle import MLEDisambiguator" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "bd8c0b2e-239a-434f-9b12-d3d8b8802088", | |
"metadata": {}, | |
"source": [ | |
"## Load dataframe\n", | |
"\n", | |
"A dataframe with text items as rows, various metadata columns, and a column called 'text' where the values are strings." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"id": "9058ed8b-89db-4c53-8faf-2800dfe25162", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df = pd.read_csv('unprocessed.csv')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "cdb3f185-c845-46cc-98fb-a2b910949a9d", | |
"metadata": {}, | |
"source": [ | |
"## Load tools" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "aa996bc0-6de6-4a1d-bccd-5b85187bd7b3", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Mohamed Taher's stopword list (https://github.com/mohataher/arabic-stop-words)\n", | |
"response = requests.get(\"https://raw.githubusercontent.com/mohataher/arabic-stop-words/master/list.txt\", timeout=30)\n", | |
"response.raise_for_status()  # fail loudly if the download did not succeed\n", | |
"# Use a set: rem_stop tests membership once per token, and set lookup is O(1)\n", | |
"# (a list would be O(n) per token). `response` also avoids shadowing builtin `file`.\n", | |
"stopwords = set(response.text.split())" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"id": "c9d72114-bc21-40bf-8f50-67358e6763af", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Camel tools arclean utility and disambiguator model\n", | |
"arclean = CharMapper.builtin_mapper('arclean')\n", | |
"mle = MLEDisambiguator.pretrained()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "f5fd2201-1d9f-472a-9dea-2ba252eed41c", | |
"metadata": {}, | |
"source": [ | |
"## Create custom functions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"id": "6501ab83-5514-4274-b484-2bbaefe82fb3", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def rem_punct(text):\n", | |
"    \"\"\"Remove punctuation and symbol characters.\"\"\"\n", | |
"    return ''.join(c for c in text if c not in UNICODE_PUNCT_SYMBOL_CHARSET)\n", | |
"\n", | |
"def rem_newline(text):\n", | |
"    \"\"\"Replace newline characters with spaces.\"\"\"\n", | |
"    return re.sub('\\n', ' ', text)\n", | |
"\n", | |
"def rem_latin(text):\n", | |
"    \"\"\"Remove Latin-script characters (needs the `regex` module for \\p).\"\"\"\n", | |
"    return re.sub('[\\p{Latin}]', u'', text)\n", | |
"\n", | |
"def rem_num(text):\n", | |
"    \"\"\"Remove digit characters.\"\"\"\n", | |
"    return ''.join(c for c in text if not c.isdigit())\n", | |
"\n", | |
"def rem_spaces(text):\n", | |
"    \"\"\"Collapse runs of whitespace into single spaces and strip the ends.\"\"\"\n", | |
"    return ' '.join(text.split())\n", | |
"\n", | |
"def rem_stop(text):\n", | |
"    \"\"\"Drop whitespace-delimited tokens that appear in `stopwords`.\"\"\"\n", | |
"    return ' '.join(c for c in text.split() if c not in stopwords)\n", | |
"\n", | |
"def get_lemmas(tokenlist):\n", | |
"    \"\"\"Lemmatize a list of tokens with the MLE disambiguator.\n", | |
"\n", | |
"    Falls back to the surface token when the model returns no analyses\n", | |
"    for a word, which would otherwise raise an IndexError.\n", | |
"    \"\"\"\n", | |
"    disamb = mle.disambiguate(tokenlist)\n", | |
"    return [d.analyses[0].analysis['lex'] if d.analyses else d.word\n", | |
"            for d in disamb]\n", | |
"\n", | |
"def make_string(tokenlist):\n", | |
"    \"\"\"Join a token list back into a single space-separated string.\"\"\"\n", | |
"    return \" \".join(tokenlist)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "1f4d9187-f45d-47f9-a320-021f586bc562", | |
"metadata": {}, | |
"source": [ | |
"## Merge with existing functions" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"id": "939621f8-15df-4fea-8b00-e7389d4a2a3e", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def preprocess(text):\n", | |
"    \"\"\"Run the full Arabic preprocessing pipeline on one string.\n", | |
"\n", | |
"    Cleans, normalizes, removes stopwords, dediacritizes, tokenizes,\n", | |
"    lemmatizes, and returns a single space-separated string.\n", | |
"    \"\"\"\n", | |
"    # Each step takes a value and returns the next one; order matters.\n", | |
"    pipeline = [\n", | |
"        rem_punct,\n", | |
"        rem_newline,\n", | |
"        rem_latin,\n", | |
"        rem_num,\n", | |
"        rem_spaces,\n", | |
"        normalize_unicode,\n", | |
"        arclean,\n", | |
"        normalize_alef_maksura_ar,\n", | |
"        normalize_alef_ar,\n", | |
"        normalize_teh_marbuta_ar,\n", | |
"        rem_stop,\n", | |
"        dediac_ar,\n", | |
"        nltk.word_tokenize,\n", | |
"        get_lemmas,\n", | |
"        make_string,\n", | |
"    ]\n", | |
"    for step in pipeline:\n", | |
"        text = step(text)\n", | |
"    return text" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "b6f6496c-c926-48d7-b013-e24eda67bb7e", | |
"metadata": {}, | |
"source": [ | |
"## Apply to dataframe" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "0f97876c-8c2c-4739-b0db-bfc436cabcd6", | |
"metadata": {}, | |
"source": [ | |
"### Option 1: The regular way\n", | |
"Uses a single core. Can be slow on large datasets (~15 mins per million words on my 4.3 GHz CPU with 64 GB RAM)." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "3828b3f4-31fe-488d-b272-f611c9253507", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df.text_cleaned = df.text.apply(preprocess)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "1038fb5c-246f-4414-be0d-d7fa0cb882ee", | |
"metadata": {}, | |
"source": [ | |
"### Option 2: With parallelization\n", | |
"The faster the more cores you have." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "8c669370-c518-4c11-b084-83788a36994f", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"from pandarallel import pandarallel\n", | |
"pandarallel.initialize(progress_bar = True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "91d15935-57bf-4e8b-b338-017da6b1d134", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df.text = df.text.parallel_apply(preprocess)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "433a0146-266f-4d93-b25d-58547bf455b5", | |
"metadata": {}, | |
"source": [ | |
"## Save" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"id": "2cf99d15-92ac-4150-b849-b0ebd96ee750", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df.to_csv('processed.csv')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "b8932ccf-6f11-4c4b-ba58-84e4e35a5c0e", | |
"metadata": {}, | |
"source": [ | |
"## References\n", | |
"\n", | |
"- Camel Tools homepage: https://github.com/CAMeL-Lab/camel_tools\n", | |
"- Documentation: https://camel-tools.readthedocs.io/en/latest/index.html\n", | |
"- Official vignette: https://colab.research.google.com/drive/1Y3qCbD6Gw1KEw-lixQx1rI6WlyWnrnDS?usp=sharing\n", | |
"- Other examples: \n", | |
" - (2021) https://towardsdatascience.com/arabic-nlp-unique-challenges-and-their-solutions-d99e8a87893d?s=09\n", | |
" - (2020) https://hajar-iba.medium.com/camel-tools-a-python-toolkit-for-arabic-nlp-ba9f1d2e8cb7\n", | |
" - (2020) https://eng-shamsan.medium.com/step-by-step-how-to-use-camel-tools-for-arabic-language-processing-tokenization-86c0de5b2817\n", | |
"- Mohamed Taher's stopword list: https://github.com/mohataher/arabic-stop-words\n", | |
"- Pandarallel homepage: https://github.com/nalepae/pandarallel" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment