Skip to content

Instantly share code, notes, and snippets.

@stavinsky
Created April 9, 2018 12:46
Show Gist options
  • Save stavinsky/8e21f5010e122abfbdd61ecea3d05b55 to your computer and use it in GitHub Desktop.
Save stavinsky/8e21f5010e122abfbdd61ecea3d05b55 to your computer and use it in GitHub Desktop.
Simple character n-gram encoder for Russian texts
import re
class NGram():
    """Encode Russian text as a sequence of integer character n-gram codes.

    Each distinct n-gram (a tuple of characters) seen by an instance is
    assigned the next free integer code, starting at 0. The mapping is kept
    per instance, so repeated calls to ``encode`` on the same object yield
    consistent codes, while separate objects keep independent vocabularies.
    """

    # Everything that is not a Cyrillic letter or a space is stripped.
    # 'ё'/'Ё' sit outside the contiguous а-я / А-Я code point ranges, so they
    # must be listed explicitly or they would be removed from the text.
    pattern = re.compile('[^а-яА-ЯёЁ ]+')

    def __init__(self):
        self.words = dict()  # unused here; kept for backward compatibility
        # n-gram -> code table. Created per instance: a class-level dict
        # would be shared by all instances while the counter below restarts
        # at 0 for each one, producing colliding codes.
        self._ngrams = dict()
        # Next code to hand out.
        self._it = 0

    def clean_text(self, s: str) -> str:
        """Return ``s`` with every non-Cyrillic, non-space character removed."""
        return self.pattern.sub('', s)

    def find_ngrams(self, input_list, n: int):
        """Return an iterator over the consecutive ``n``-item windows.

        Each window is a tuple. A sequence shorter than ``n`` (or ``n == 0``)
        yields no windows.
        """
        # Zipping n progressively shifted views produces the sliding windows.
        return zip(*[input_list[i:] for i in range(n)])

    def encode(self, string: str, n: int = 3) -> list:
        """Return the list of integer codes for the n-grams of ``string``.

        The text is lower-cased and cleaned with ``clean_text`` first.
        N-grams not seen before by this instance receive a new code.
        """
        clean_string = self.clean_text(string.lower())
        codes = []
        for ngram in self.find_ngrams(clean_string, n):
            code = self._ngrams.get(ngram)
            if code is None:
                code = self._ngrams[ngram] = self._it
                self._it += 1
            codes.append(code)
        return codes
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment