Skip to content

Instantly share code, notes, and snippets.

@linuxscout
Created December 15, 2022 21:34
Show Gist options
  • Save linuxscout/bb1eff5f553c2167fc6e6bbf15a48b9a to your computer and use it in GitHub Desktop.
Save linuxscout/bb1eff5f553c2167fc6e6bbf15a48b9a to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# -*- coding=utf-8 -*-
"""
Example of Arabic text cleaning using the PyArabic library
Requirements: pip install pyarabic
Data: csv text file contains (tweet, emotion) separated by a ','
Output: text file (cleaned)
"""
import sys
import pyarabic.araby as araby
SEPARATOR = ","
def clean_file(file_name, outfile_name):
    """Clean the tweets of a separated text file and append them to a file.

    Every input line is split on ``SEPARATOR``. Lines with fewer than two
    fields are skipped. The first field is taken as the tweet and the second
    as the emotion label. The tweet is passed to ``araby.tokenize`` with
    ``araby.is_arabicrange`` as the token condition and with tashkeel,
    diacritics and tatweel stripping as token transformations; the resulting
    tokens are joined with single spaces.

    For each accepted line, one record of the form
    ``tweet<SEPARATOR>cleaned_tweet<SEPARATOR>emotion`` followed by a newline
    is appended to the output file.

    Args:
        file_name: Path of the UTF-8 input file.
        outfile_name: Path of the UTF-8 output file; opened in append mode,
            so existing content is preserved.

    Raises:
        SystemExit: With status 1 if either file cannot be opened.
    """
    # Open the source file as UTF-8 text; only the open() call is guarded so
    # that unrelated errors are not mistaken for "can't open file".
    try:
        myfile = open(file_name, encoding="utf8")
    except OSError:
        print("Can't open file ", file_name)
        print("Exit from program ", file_name)
        sys.exit(1)
    # The context manager guarantees the input handle is closed.
    with myfile:
        lines = myfile.readlines()

    # Token filter: keep only tokens accepted by araby.is_arabicrange.
    conditions = [araby.is_arabicrange]
    # Token transformations, built once because they do not depend on the line:
    #   - remove tashkeel
    #   - remove other diacritics (small alif, Quranic marks, ...)
    #   - remove tatweel
    morphs = [araby.strip_tashkeel, araby.strip_diacritics, araby.strip_tatweel]

    cleaned_lines = []
    for text_line in lines:
        # Drop the line terminator first so it does not end up inside the
        # last field (which previously produced a broken extra output line).
        fields = text_line.rstrip("\r\n").split(SEPARATOR)
        if len(fields) < 2:
            # Not a (tweet, emotion) record; ignore it.
            continue
        tweet, emotion = fields[0], fields[1]
        tokens = araby.tokenize(tweet, conditions=conditions, morphs=morphs)
        # tokens is a list; join it back into one cleaned text line.
        cleaned_tweet = " ".join(tokens)
        # Terminate the record with a newline instead of joining "\n" as a
        # field, which left a dangling separator at the end of each record.
        cleaned_lines.append(
            SEPARATOR.join([tweet, cleaned_tweet, emotion]) + "\n"
        )

    # Open the destination file in append mode.
    try:
        myoutfile = open(outfile_name, "a+", encoding="utf8")
    except OSError:
        print("Can't open file to write outputs", outfile_name)
        print("Exit from program ", outfile_name)
        sys.exit(1)
    # Leaving the with-block flushes and closes the file.
    with myoutfile:
        myoutfile.writelines(cleaned_lines)
# Script entry point: clean the default input file into the default output file.
if __name__ == '__main__':
    file_name, outfile_name = 'text_file.txt', "output.txt"
    clean_file(file_name, outfile_name)
أذكر مشكلة كهذه اعترضتني في مشروع تعريب رسمي، وأظن عندي فكرة عن السبب لذلك يمكن أن أعلق بخصوص الموضوع., True
ايش بدهم من الأطفال؟ شركة ملابس عالمية فارهة تعمل حملات إعلانية تتضمن إيحاءات جنسية وتقديم قرابين من الأطفال للشيطان, True
ستُغلق خدمة Revue، وهي خدمة النشرات البريدية التي استحوذ عليها تويتر ودمجها في منصته، ولن يعود بالإمكان استخدامها.,Positive
يبدو أن إيلون ماسك يركّز على أمور محددة فقط بعد استحواذه على تويتر ولا يريد أن يتابع كل شيء كما كان في السابق., negative
https://getrevue.co, positive
في هذه الحروف فعل رباعي جميع حروفه أصلية.. ما هو؟, negative
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment