Last active
December 19, 2015 09:45
-
-
Save mz1988/963479a188cfab7b7cc6 to your computer and use it in GitHub Desktop.
Persian (Farsi) Text Normalizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import os | |
from collections import Counter | |
class Farsi:
    """Persian (Farsi) text normalization, tokenization and light stemming.

    Normalization unifies Arabic/Persian character variants, strips
    diacritics, and converts Persian digits and punctuation to ASCII
    equivalents in a single ``str.translate`` pass over ``CHARMAP``.
    """

    # ASCII punctuation stripped when splitting text into words.
    PUNCTS = r"""'".,/?\!@#$%^&*()_\-+~`:;{}\[\]"""
    # Persian / typographic punctuation stripped when splitting into words.
    LOCAL_PUNCTS = '،!٪:؛“”‘ـ٬؟'
    # One-pass translation table: char -> replacement, None deletes the char.
    CHARMAP = str.maketrans(
        {'\u064a': '\u06cc',  # Arabic yeh -> Persian yeh
         '\u0649': '\u06cc',  # alef maksura -> Persian yeh
         '\u0643': '\u06a9',  # Arabic kaf -> Persian keh
         '\u0651': None,      # tashdid (shadda)
         '\u0652': None,      # sukun
         '\u064b': None,      # fathatan
         '\u064f': None,      # damma ("oh")
         '\u064e': None,      # fatha
         '\u0650': None,      # kasra
         '\u0640': None,      # kashida / tatweel
         '\u0623': '\u0627',  # alef with hamza above -> plain alef
         # zero-width joiner / directional marks become plain spaces
         '\u200c': ' ',
         '\u200e': ' ',
         '\u200f': ' ',
         # bad spaces
         '\xa0': ' ',
         '\r': None,
         # Persian digits -> ASCII digits
         '۱': '1', '۲': '2', '۳': '3', '۴': '4', '۵': '5', '۶': '6',
         '۷': '7', '۸': '8', '۹': '9', '۰': '0',
         # Persian / typographic punctuation -> ASCII punctuation
         '،': ',', '!': '!', '؟': '?', '؛': ';',
         '٪': '%', '٬': '\"', 'ـ': '_', '»': '\"',
         '«': '\"', '”': '\"', '“': '\"', '‘': '\"',
         '’': '\"', '|': None})
    # Suffixes the stemmer tries to strip (plural/ordinal/superlative endings).
    WORD_ENDS = ['انه', 'ها', 'ان', 'ات', 'مین', 'ین', 'یت', 'ترین']
    # HTML/XML tag matcher. Raw string: '\<' / '\>' are invalid escape
    # sequences and raise SyntaxWarning on modern Python.
    TAGS_RE = re.compile(r'<.*?>')

    def __init__(self):
        # word -> (found, stem) memoization for stem_word(), plus counters.
        self._stem_cache = {}
        self._stem_cache_hit = 0
        self._stem_cache_miss = 0
        # Pre-compile the punctuation stripper used by words().
        self.punct_re = re.compile('[{0}]'.format(self.PUNCTS +
                                                  self.LOCAL_PUNCTS))
        # Load the known-word dictionaries (one word per line), normalized
        # so lookups agree with normalize()'s output.
        words_set = set()
        words_path = ['dicts/fa-spell-checking.dic', 'dicts/fa-additional.dic']
        for path in words_path:
            path = os.path.join(os.path.dirname(__file__), path)
            # Dictionaries are Persian text: read as UTF-8 explicitly rather
            # than relying on the locale-dependent default encoding.
            with open(path, encoding='utf-8') as f:
                words = self.normalize(f.read())
                words_set.update(words.split('\n'))
        self.words_set = words_set

    @classmethod
    def normalize(cls, text):
        """Return *text* normalized via a single pass over ``CHARMAP``.

        Unifies character variants, removes diacritics, and maps Persian
        digits/punctuation to ASCII.
        """
        return text.translate(cls.CHARMAP)

    def sentences(self, text):
        # TODO: sentence segmentation is not implemented yet.
        pass

    def words(self, text):
        """Split *text* into words.

        Punctuation is replaced by spaces, then tokens are stripped of
        surrounding whitespace and half-spaces. Returns a lazy iterator;
        empty tokens are dropped.
        """
        text = self.punct_re.sub(' ', text)
        words = map(lambda t: t.strip().strip('\u200c'), text.split())
        return filter(lambda t: t not in ('', '\n'), words)

    def stem_word(self, word):
        """Return ``(found, stem)`` for *word*, memoized per instance."""
        if word not in self._stem_cache:
            self._stem_cache_miss += 1
            self._stem_cache[word] = self._stem_word_uncached(word)
        else:
            self._stem_cache_hit += 1
        return self._stem_cache[word]

    def _stem_word_uncached(self, word):
        """Heuristic stemming against ``words_set``.

        Tries, in order: the word itself; the word minus its last char
        (trailing heh/yeh/she); the word with half-spaces removed; minus a
        trailing yeh; minus each WORD_ENDS suffix; and the 'گان' -> 'ه'
        rewrite (e.g. "namayandegan" -> "namayande").

        Returns ``(True, stem)`` on a dictionary hit, ``(False, word)``
        otherwise.
        """
        # todo mehvar(iat), baghat, baghati
        if word in self.words_set:
            return True, word
        # check without trailing char like heh, yeh, she
        if word[0:-1] in self.words_set:
            return True, word[0:-1]
        nhs_word = word.replace('\u200c', '')
        # check without halfspace
        if nhs_word in self.words_set:
            return True, nhs_word
        # remove trailing yeh
        if nhs_word.endswith('ی'):
            nhs_word = nhs_word[0:-1]
            if nhs_word in self.words_set:
                return True, nhs_word
        # check after removing WORD_ENDS suffixes
        for end in self.WORD_ENDS:
            if (nhs_word.endswith(end) and
                    nhs_word[0:-len(end)] in self.words_set):
                return True, nhs_word[0:-len(end)]
        # check after replacing "gan" with heh: "namayandegan" -> "namayande"
        if nhs_word.endswith('گان'):
            nhs_word = nhs_word[0:-3]
            if nhs_word in self.words_set:
                return True, nhs_word
            nhs_word += 'ه'
            if nhs_word in self.words_set:
                return True, nhs_word
        return False, word

    def remove_tags(self, text):
        """Replace every ``<...>`` tag in *text* with a space.

        Returns ``''`` for non-string input (e.g. ``None``) instead of
        propagating the ``TypeError`` from ``re.sub``.
        """
        try:
            return self.TAGS_RE.sub(' ', text)
        except TypeError:
            # Narrowed from a bare except: only non-string input is expected
            # here, and a bare except would also swallow KeyboardInterrupt.
            print('error in remove tags', type(text))
            return ''
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment