Skip to content

Instantly share code, notes, and snippets.

@flutesa
Created February 6, 2014 13:10
Show Gist options
  • Save flutesa/8843841 to your computer and use it in GitHub Desktop.
Save flutesa/8843841 to your computer and use it in GitHub Desktop.
Lingua
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
import codecs
import re
def read_file(fileName):
    """Read a UTF-8 text file and return its content as cp1251 bytes.

    Args:
        fileName: Path of the UTF-8 encoded file to read.

    Returns:
        The whole file content re-encoded as a cp1251 byte string.

    Raises:
        UnicodeDecodeError: If the file content is not valid UTF-8.
        UnicodeEncodeError: If the text contains a character that cp1251
            cannot represent.
    """
    # The context manager closes the file even when read() raises, which the
    # manual open()/close() pair did not guarantee.
    with codecs.open(fileName, 'r', 'utf8') as f:
        source_text = f.read()
    return source_text.encode('cp1251')
# Matches every character that is not a lowercase Russian letter, an
# apostrophe or a space.  '.' and '-' are intentionally part of the match:
# both are turned into word separators, as in the original implementation.
_NOT_WORD_CHAR_RE = re.compile(u"[^а-яё' ]")


def get_only_words_from_text(text_string):
    """Return the set of unique lowercase Russian words found in a text.

    The text is lowercased, every character other than а-я, ё, the
    apostrophe and the space is replaced by a space, and the result is split
    on whitespace.

    Args:
        text_string: The text to scan, either as a cp1251-encoded byte
            string (what read_file() returns) or as already-decoded text.

    Returns:
        A set of unicode words; empty when the text has no Russian letters
        or apostrophes.
    """
    # Decode BEFORE lowercasing: lowercasing the raw cp1251 bytes only
    # affects ASCII letters, so capitalised Cyrillic letters used to survive
    # as "unwished" symbols and were cut out of their words.
    if isinstance(text_string, bytes):
        text_string = text_string.decode('cp1251')
    cleaned = _NOT_WORD_CHAR_RE.sub(u' ', text_string.lower())
    return set(cleaned.split())
def count_syllables_in_word(word):
    """Count the Russian vowel letters in a word.

    The number of vowel letters is what this function reports as the
    syllable count.

    Args:
        word: The word to inspect, as decoded (unicode) text.

    Returns:
        How many characters of the word are one of а, я, о, ё, у, ю, е, ы,
        и, э, in either case; 0 for an empty word.
    """
    vowels = u'аяоёуюеыиэ'
    # Fold case first so that capitalised vowels are counted too; the vowel
    # list itself is lowercase only.
    return sum(1 for letter in word.lower() if letter in vowels)
def check_vowels(word):
    """Tell whether a word has the six-letter vowel layout being searched.

    A word qualifies when it is exactly six characters long, ends with 'а',
    and has 'а' or 'о' at both index 1 and index 3.

    Args:
        word: The word to test.

    Returns:
        True when every condition holds, False otherwise.
    """
    # Length is tested first so the index lookups below are always in range.
    if len(word) != 6:
        return False
    if word[-1] != u'а':
        return False
    # Both positions must match; a looser variant that accepted a match at
    # either position used to be kept here as disabled code.
    accepted = (u'а', u'о')
    return word[1] in accepted and word[3] in accepted
def check_unwished_consonants(word):
    """Tell whether a word is free of the consonants й, л, р, м and н.

    Args:
        word: The word to test.

    Returns:
        True when none of those five letters occurs in the word (including
        for an empty word), False as soon as at least one does.
    """
    unwished = u'йлрмн'
    unwished_re = re.compile(r"[" + unwished + "]")
    # One search is enough: the word is acceptable only if nothing matches.
    return unwished_re.search(word) is None
def find_words(file_name):
    """Collect the words of a text file that pass both word filters.

    Args:
        file_name: Path of the text file to scan; it is handed to
            read_file() unchanged.

    Returns:
        A sorted list of the unique words for which both check_vowels() and
        check_unwished_consonants() return True.
    """
    unique_words = get_only_words_from_text(read_file(file_name))
    # Sorting makes the output order reproducible; iterating the set
    # directly yields the words in an arbitrary order.
    return sorted(
        word for word in unique_words
        if check_vowels(word) and check_unwished_consonants(word)
    )


if __name__ == '__main__':
    # Only 'polus.txt' is scanned.  The script used to read 'dict.txt' first
    # and overwrite the result on the very next line, so that read was pure
    # overhead (and a needless failure when the file was absent).
    for current_word in find_words('polus.txt'):
        # Words are written out as cp1251 bytes, as the script always did.
        print(current_word.encode('cp1251'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment