Skip to content

Instantly share code, notes, and snippets.

@mz1988
Last active December 19, 2015 09:45
Show Gist options
  • Save mz1988/963479a188cfab7b7cc6 to your computer and use it in GitHub Desktop.
Save mz1988/963479a188cfab7b7cc6 to your computer and use it in GitHub Desktop.
Persian (Farsi) Text Normalizer
import re
import os
from collections import Counter
class Farsi:
    """Persian (Farsi) text normalizer, tokenizer and dictionary-based stemmer.

    ``normalize`` and the class-level tables work without an instance.
    Creating an instance loads the word lists from the ``dicts`` directory
    next to this module; ``stem_word`` looks candidates up in that set.
    """

    # ASCII punctuation, already escaped for use inside a regex character
    # class (see ``punct_re`` in ``__init__``).
    PUNCTS = r"""'".,/?\!@#$%^&*()_\-+~`:;{}\[\]"""
    # Persian/Arabic and typographic punctuation handled like PUNCTS.
    LOCAL_PUNCTS = '،!٪:؛“”‘ـ٬؟'
    # Translation table for ``normalize``: a value of None deletes the
    # character, any other value replaces it.
    CHARMAP = str.maketrans({
        '\u064a': '\u06cc',  # Arabic yeh -> Farsi yeh
        '\u0649': '\u06cc',  # alef maksura -> Farsi yeh
        '\u0643': '\u06a9',  # Arabic kaf -> keheh
        '\u0651': None,  # tashdid (shadda)
        '\u0652': None,  # sukun
        '\u064b': None,  # fathatan
        '\u064f': None,  # damma
        '\u064e': None,  # fatha
        '\u0650': None,  # kasra
        '\u0640': None,  # kashida (tatweel)
        '\u0623': '\u0627',  # alef with hamza above -> alef
        # zero-width non-joiner ("half space") and direction marks
        '\u200c': ' ',
        '\u200e': ' ',
        '\u200f': ' ',
        # bad spaces
        '\xa0': ' ',  # no-break space
        '\r': None,
        # Persian digits -> ASCII digits
        '۱': '1', '۲': '2', '۳': '3', '۴': '4', '۵': '5', '۶': '6',
        '۷': '7', '۸': '8', '۹': '9', '۰': '0',
        # punctuation -> ASCII equivalents
        # NOTE(review): 'ـ' below appears to be the same code point as the
        # '\u0640' kashida entry above; if so, this later entry wins and
        # kashida becomes '_' instead of being removed -- confirm intent.
        '،': ',', '!': '!', '؟': '?', '؛': ';',
        '٪': '%', '٬': '\"', 'ـ': '_', '»': '\"',
        '«': '\"', '”': '\"', '“': '\"', '‘': '\"',
        '’': '\"', '|': None})
    # Suffixes tried, in this order, by ``_stem_word_uncached``.
    WORD_ENDS = ['انه', 'ها', 'ان', 'ات', 'مین', 'ین', 'یت', 'ترین']
    # Non-greedy match of anything between '<' and '>'.
    TAGS_RE = re.compile(r'<.*?>')

    def __init__(self):
        """Compile the punctuation regex and load the dictionary word set.

        Raises:
            OSError: if a dictionary file cannot be opened.
            UnicodeDecodeError: if a dictionary file is not valid UTF-8.
        """
        # Per-instance memo for ``stem_word`` plus hit/miss counters.
        self._stem_cache = {}
        self._stem_cache_hit = 0
        self._stem_cache_miss = 0
        # One character class covering ASCII and local punctuation.
        self.punct_re = re.compile(
            '[{0}]'.format(self.PUNCTS + self.LOCAL_PUNCTS))
        # Load every dictionary into a single set for O(1) membership tests.
        words_set = set()
        base_dir = os.path.dirname(__file__)
        for rel_path in ('dicts/fa-spell-checking.dic',
                         'dicts/fa-additional.dic'):
            # Explicit encoding so loading does not depend on the platform
            # locale. NOTE(review): assumes the .dic files are UTF-8.
            with open(os.path.join(base_dir, rel_path),
                      encoding='utf-8') as f:
                words = self.normalize(f.read())
            # Skip blank lines: an empty entry would make the empty string
            # look like a valid stem.
            words_set.update(w for w in words.split('\n') if w)
        self.words_set = words_set

    @classmethod
    def normalize(cls, text):
        """Return ``text`` with every character translated through CHARMAP.

        Characters mapped to None are removed, mapped characters are
        replaced, and everything else is left unchanged.
        """
        return text.translate(cls.CHARMAP)

    def sentences(self, text):
        """Placeholder: sentence splitting is not implemented; returns None."""
        pass

    def words(self, text):
        """Lazily yield the words of ``text``.

        Punctuation matched by ``punct_re`` is replaced with spaces, the text
        is split on whitespace, and zero-width non-joiners are stripped from
        both ends of each token. Tokens that end up empty are dropped.

        Returns:
            An iterator of non-empty strings.
        """
        tokens = self.punct_re.sub(' ', text).split()
        # A token made only of ZWNJ characters strips to '' and is dropped.
        return (word for word in (t.strip('\u200c') for t in tokens) if word)

    def stem_word(self, word):
        """Return the memoized result of ``_stem_word_uncached(word)``.

        Returns:
            A ``(found, stem)`` tuple; see ``_stem_word_uncached``.
        """
        if word not in self._stem_cache:
            self._stem_cache_miss += 1
            self._stem_cache[word] = self._stem_word_uncached(word)
        else:
            self._stem_cache_hit += 1
        return self._stem_cache[word]

    def _in_dict(self, candidate):
        """Return True if ``candidate`` is a non-empty dictionary word."""
        return bool(candidate) and candidate in self.words_set

    def _stem_word_uncached(self, word):
        """Look for a dictionary form of ``word``.

        Candidates are tried in a fixed order and the first non-empty one
        found in ``words_set`` wins.

        Returns:
            ``(True, stem)`` when a candidate is found, otherwise
            ``(False, word)`` with the input unchanged.
        """
        # todo mehvar(iat), baghat, baghati
        if self._in_dict(word):
            return True, word
        # check without trailing char like heh, yeh, she
        if self._in_dict(word[0:-1]):
            return True, word[0:-1]
        # check without halfspace
        nhs_word = word.replace('\u200c', '')
        if self._in_dict(nhs_word):
            return True, nhs_word
        # remove trailing yeh (it stays removed for the checks below)
        if nhs_word.endswith('ی'):
            nhs_word = nhs_word[0:-1]
            if self._in_dict(nhs_word):
                return True, nhs_word
        # check after removing WORD_ENDS
        for end in self.WORD_ENDS:
            if (nhs_word.endswith(end) and
                    self._in_dict(nhs_word[0:-len(end)])):
                return True, nhs_word[0:-len(end)]
        # check after replacing "gan" with heh: "namayandegan" -> "namayande"
        if nhs_word.endswith('گان'):
            nhs_word = nhs_word[0:-3]
            if self._in_dict(nhs_word):
                return True, nhs_word
            # Only re-attach heh to a real stem, never to the empty string.
            if nhs_word and self._in_dict(nhs_word + 'ه'):
                return True, nhs_word + 'ه'
        return False, word

    def remove_tags(self, text):
        """Replace every ``<...>`` span in ``text`` with a single space.

        Best effort: if ``text`` is not a string, a message is printed and
        an empty string is returned instead of raising.
        """
        try:
            return self.TAGS_RE.sub(' ', text)
        except TypeError:
            # re raises TypeError for None, bytes and other non-str input.
            print('error in remove tags', type(text))
            return ''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment