Skip to content

Instantly share code, notes, and snippets.

@msoutopico
Last active February 8, 2020 18:16
Show Gist options
  • Save msoutopico/9c3c8fab6a99db103dc98580254bc6a6 to your computer and use it in GitHub Desktop.
Save msoutopico/9c3c8fab6a99db103dc98580254bc6a6 to your computer and use it in GitHub Desktop.
Arabic text to numeric representation
# -*- coding: utf-8 -*-
from lang_trans.arabic import buckwalter
import re
from odf import text, teletype
from odf.opendocument import load
# Consonant classes, written as regex character classes over the Buckwalter
# transliteration alphabet.
solar = r"[tvd*rzs$SDTZln]"
lunar = r"['bjHxEgqfkmh]"
hamza = r"[<>]"

# Ordered rewrite rules used by harf2bin(). Each rule operates on the output
# of the previous one, so the order is significant (e.g. long vowels must be
# consumed before the generic short-vowel rule sees their first letter).
_HARF2BIN_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # pre-processing of the definite article,
        # e.g. اللَّهِ => ألْلَاْهِيْ
        # "Al" followed by a solar consonant (the consonant is consumed too)
        (r"Al" + solar, "1"),
        # "Al" (optionally with sukun) followed by a lunar consonant
        (r"Alo?" + lunar, "10"),
        # tanween
        (r"[NFK]", "10"),
        # long vowels: short vowel followed by w / y / Y / A
        (r"[uia][wyYA]", "10"),
        # ashba3: a short vowel at the very end of the line
        (r"[uia]$", "10"),
        # remaining short vowels
        (r"[uia]", "1"),
        # sukun
        (r"o", "0"),
        # shadda
        (r"~", "10"),
        # remove consonants
        (r"(" + lunar + "|" + solar + "|" + hamza + ")", ""),
        # remove long-vowel letters used as consonants.
        # NOTE(review): this also deletes the "1" that follows the letter,
        # so e.g. "wa" contributes nothing to the output -- confirm intended.
        (r"[wyYA]1", ""),
    )
)


def harf2bin(line):
    """Rewrite a Buckwalter-transliterated line as a string of 1s and 0s.

    The line is stripped of surrounding whitespace and then passed through
    the ordered substitutions in ``_HARF2BIN_RULES``: vowel marks, tanween,
    sukun and shadda are replaced by ``1``/``0`` digits and the listed
    consonants are removed.

    Presumably ``1`` stands for a vowelled letter and ``0`` for an
    unvowelled one (prosodic scansion) -- TODO confirm with the author.

    Characters that match none of the rules (spaces, punctuation, letters
    outside the three consonant classes, ...) are left in the result as-is.

    :param line: text in Buckwalter transliteration.
    :return: the rewritten string.

    >>> harf2bin("kataba")
    '1110'
    >>> harf2bin("Aloqamaru")
    '101110'
    """
    line = line.strip()
    for pattern, replacement in _HARF2BIN_RULES:
        line = pattern.sub(replacement, line)
    return line
# Read the sample ODT document and, for every paragraph, print its Buckwalter
# transliteration followed by the harf2bin() rewrite of that transliteration.
# TODO (from the original note): take the text from a parameter, or fetch it
# remotely, instead of relying on this hard-coded file name.
textdoc = load("Sample Text.odt")
paras = textdoc.getElementsByType(text.P)
for p in paras:
    t = teletype.extractText(p)  # plain text of the paragraph
    b = buckwalter.transliterate(t)
    print(b)
    print(harf2bin(b))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment