Last active
December 19, 2015 09:45
-
-
Save mz1988/963479a188cfab7b7cc6 to your computer and use it in GitHub Desktop.
Persian (Farsi) Text Normalizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import os | |
from collections import Counter | |
class Farsi:
    """Persian (Farsi) text normalization, tokenization and light stemming.

    Normalization unifies Arabic/Persian character variants, strips
    diacritics, and converts Persian digits and punctuation to ASCII
    equivalents in a single ``str.translate`` pass over ``CHARMAP``.
    """

    # ASCII punctuation stripped when splitting text into words.
    PUNCTS = r"""'".,/?\!@#$%^&*()_\-+~`:;{}\[\]"""
    # Persian / typographic punctuation stripped when splitting into words.
    LOCAL_PUNCTS = '،!٪:؛“”‘ـ٬؟'
    # One-pass translation table: char -> replacement, None deletes the char.
    CHARMAP = str.maketrans(
        {'\u064a': '\u06cc',  # Arabic yeh -> Persian yeh
         '\u0649': '\u06cc',  # alef maksura -> Persian yeh
         '\u0643': '\u06a9',  # Arabic kaf -> Persian keh
         '\u0651': None,      # tashdid (shadda)
         '\u0652': None,      # sukun
         '\u064b': None,      # fathatan
         '\u064f': None,      # damma ("oh")
         '\u064e': None,      # fatha
         '\u0650': None,      # kasra
         '\u0640': None,      # kashida / tatweel
         '\u0623': '\u0627',  # alef with hamza above -> plain alef
         # zero-width joiner / directional marks become plain spaces
         '\u200c': ' ',
         '\u200e': ' ',
         '\u200f': ' ',
         # bad spaces
         '\xa0': ' ',
         '\r': None,
         # Persian digits -> ASCII digits
         '۱': '1', '۲': '2', '۳': '3', '۴': '4', '۵': '5', '۶': '6',
         '۷': '7', '۸': '8', '۹': '9', '۰': '0',
         # Persian / typographic punctuation -> ASCII punctuation
         '،': ',', '!': '!', '؟': '?', '؛': ';',
         '٪': '%', '٬': '\"', 'ـ': '_', '»': '\"',
         '«': '\"', '”': '\"', '“': '\"', '‘': '\"',
         '’': '\"', '|': None})
    # Suffixes the stemmer tries to strip (plural/ordinal/superlative endings).
    WORD_ENDS = ['انه', 'ها', 'ان', 'ات', 'مین', 'ین', 'یت', 'ترین']
    # HTML/XML tag matcher. Raw string: '\<' / '\>' are invalid escape
    # sequences and raise SyntaxWarning on modern Python.
    TAGS_RE = re.compile(r'<.*?>')

    def __init__(self):
        # word -> (found, stem) memoization for stem_word(), plus counters.
        self._stem_cache = {}
        self._stem_cache_hit = 0
        self._stem_cache_miss = 0
        # Pre-compile the punctuation stripper used by words().
        self.punct_re = re.compile('[{0}]'.format(self.PUNCTS +
                                                  self.LOCAL_PUNCTS))
        # Load the known-word dictionaries (one word per line), normalized
        # so lookups agree with normalize()'s output.
        words_set = set()
        words_path = ['dicts/fa-spell-checking.dic', 'dicts/fa-additional.dic']
        for path in words_path:
            path = os.path.join(os.path.dirname(__file__), path)
            # Dictionaries are Persian text: read as UTF-8 explicitly rather
            # than relying on the locale-dependent default encoding.
            with open(path, encoding='utf-8') as f:
                words = self.normalize(f.read())
                words_set.update(words.split('\n'))
        self.words_set = words_set

    @classmethod
    def normalize(cls, text):
        """Return *text* normalized via a single pass over ``CHARMAP``.

        Unifies character variants, removes diacritics, and maps Persian
        digits/punctuation to ASCII.
        """
        return text.translate(cls.CHARMAP)

    def sentences(self, text):
        # TODO: sentence segmentation is not implemented yet.
        pass

    def words(self, text):
        """Split *text* into words.

        Punctuation is replaced by spaces, then tokens are stripped of
        surrounding whitespace and half-spaces. Returns a lazy iterator;
        empty tokens are dropped.
        """
        text = self.punct_re.sub(' ', text)
        words = map(lambda t: t.strip().strip('\u200c'), text.split())
        return filter(lambda t: t not in ('', '\n'), words)

    def stem_word(self, word):
        """Return ``(found, stem)`` for *word*, memoized per instance."""
        if word not in self._stem_cache:
            self._stem_cache_miss += 1
            self._stem_cache[word] = self._stem_word_uncached(word)
        else:
            self._stem_cache_hit += 1
        return self._stem_cache[word]

    def _stem_word_uncached(self, word):
        """Heuristic stemming against ``words_set``.

        Tries, in order: the word itself; the word minus its last char
        (trailing heh/yeh/she); the word with half-spaces removed; minus a
        trailing yeh; minus each WORD_ENDS suffix; and the 'گان' -> 'ه'
        rewrite (e.g. "namayandegan" -> "namayande").

        Returns ``(True, stem)`` on a dictionary hit, ``(False, word)``
        otherwise.
        """
        # todo mehvar(iat), baghat, baghati
        if word in self.words_set:
            return True, word
        # check without trailing char like heh, yeh, she
        if word[0:-1] in self.words_set:
            return True, word[0:-1]
        nhs_word = word.replace('\u200c', '')
        # check without halfspace
        if nhs_word in self.words_set:
            return True, nhs_word
        # remove trailing yeh
        if nhs_word.endswith('ی'):
            nhs_word = nhs_word[0:-1]
            if nhs_word in self.words_set:
                return True, nhs_word
        # check after removing WORD_ENDS suffixes
        for end in self.WORD_ENDS:
            if (nhs_word.endswith(end) and
                    nhs_word[0:-len(end)] in self.words_set):
                return True, nhs_word[0:-len(end)]
        # check after replacing "gan" with heh: "namayandegan" -> "namayande"
        if nhs_word.endswith('گان'):
            nhs_word = nhs_word[0:-3]
            if nhs_word in self.words_set:
                return True, nhs_word
            nhs_word += 'ه'
            if nhs_word in self.words_set:
                return True, nhs_word
        return False, word

    def remove_tags(self, text):
        """Replace every ``<...>`` tag in *text* with a space.

        Returns ``''`` for non-string input (e.g. ``None``) instead of
        propagating the ``TypeError`` from ``re.sub``.
        """
        try:
            return self.TAGS_RE.sub(' ', text)
        except TypeError:
            # Narrowed from a bare except: only non-string input is expected
            # here, and a bare except would also swallow KeyboardInterrupt.
            print('error in remove tags', type(text))
            return ''
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment