flutesa/st2.py

## st2.py
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
import codecs
import re


def read_file(fileName):
    """Read a UTF-8 text file and return its contents encoded as cp1251.

    fileName -- path of the file to read.

    Returns the whole file as a cp1251-encoded byte string.
    Raises UnicodeEncodeError if the text contains a character that cp1251
    cannot represent.
    """
    # The with-block closes the file even when read() raises.
    with codecs.open(fileName, 'r', 'utf8') as source_file:
        source_text = source_file.read()
    return source_text.encode('cp1251')


def get_only_words_from_text(text_string):
    """Return the set of unique lowercase Russian words found in the text.

    text_string -- normally a cp1251-encoded byte string (the form read_file
    returns); already-decoded text is accepted as well.

    Every character other than the letters а-я and ё, the apostrophe and the
    space is treated as a word separator, which includes '.', '-', digits,
    Latin letters and line breaks.
    """
    # Decode before lowercasing: lower() on a byte string does not reliably
    # lowercase Cyrillic (ASCII-only in Python 3, locale-dependent in
    # Python 2), so capitalised words used to be torn apart by the filter.
    if isinstance(text_string, bytes):
        text_string = text_string.decode('cp1251')
    lowered = text_string.lower()
    # A single substitution replaces the per-symbol chain of replace() calls.
    separators_re = re.compile(u"[^а-яё' ]")
    return set(separators_re.sub(u' ', lowered).split())


def count_syllables_in_word(word):
    """Count the syllables of word as the number of vowel letters in it.

    Only the lowercase Russian vowels а, я, о, ё, у, ю, е, ы, и, э are
    counted.
    """
    vowel_re = re.compile(u'[аяоёуюеыиэ]')
    return sum(1 for _ in vowel_re.finditer(word))


def check_vowels(word):
    """Tell whether word fits the six-letter vowel pattern.

    Returns True when word has exactly six characters, its last character is
    'а', and its second and fourth characters are each 'а' or 'о'; otherwise
    returns False.
    """
    # Guard first so the index lookups below are always in range.
    if len(word) != 6:
        return False
    second, fourth, last = word[1], word[3], word[-1]
    return (
        last == u'а'
        and second in (u'а', u'о')
        and fourth in (u'а', u'о')
    )


def check_unwished_consonants(word):
    """Return True when word contains none of the letters й, л, р, м, н."""
    return re.search(u'[йлрмн]', word) is None


# Input word list. 'statements2.txt' and 'dict.txt' were alternative inputs;
# the earlier read of 'dict.txt' is dropped because its result was overwritten
# by this read before it was ever used.
source = read_file('polus.txt')
uniq_words_array = get_only_words_from_text(source)

# Print, encoded as cp1251, every unique word that fits the vowel pattern and
# contains none of the unwanted consonants.
for current_word in uniq_words_array:
    if check_vowels(current_word) and check_unwished_consonants(current_word):
        print(current_word.encode('cp1251'))
	#!/usr/local/bin/python
	# -- coding: utf-8 --
	import codecs
	import re


	def read_file(fileName):
	    """Read a UTF-8 text file and return its contents encoded as cp1251.

	    fileName -- path of the file to read.

	    Returns the whole file as a cp1251-encoded byte string.
	    Raises UnicodeEncodeError if the text contains a character that cp1251
	    cannot represent.
	    """
	    # The with-block closes the file even when read() raises.
	    with codecs.open(fileName, 'r', 'utf8') as source_file:
	        source_text = source_file.read()
	    return source_text.encode('cp1251')


	def get_only_words_from_text(text_string):
	    """Return the set of unique lowercase Russian words found in the text.

	    text_string -- normally a cp1251-encoded byte string (the form read_file
	    returns); already-decoded text is accepted as well.

	    Every character other than the letters а-я and ё, the apostrophe and the
	    space is treated as a word separator, which includes '.', '-', digits,
	    Latin letters and line breaks.
	    """
	    # Decode before lowercasing: lower() on a byte string does not reliably
	    # lowercase Cyrillic (ASCII-only in Python 3, locale-dependent in
	    # Python 2), so capitalised words used to be torn apart by the filter.
	    if isinstance(text_string, bytes):
	        text_string = text_string.decode('cp1251')
	    lowered = text_string.lower()
	    # A single substitution replaces the per-symbol chain of replace() calls.
	    separators_re = re.compile(u"[^а-яё' ]")
	    return set(separators_re.sub(u' ', lowered).split())


	def count_syllables_in_word(word):
	    """Count the syllables of word as the number of vowel letters in it.

	    Only the lowercase Russian vowels а, я, о, ё, у, ю, е, ы, и, э are
	    counted.
	    """
	    vowel_re = re.compile(u'[аяоёуюеыиэ]')
	    return sum(1 for _ in vowel_re.finditer(word))


	def check_vowels(word):
	    """Tell whether word fits the six-letter vowel pattern.

	    Returns True when word has exactly six characters, its last character is
	    'а', and its second and fourth characters are each 'а' or 'о'; otherwise
	    returns False.
	    """
	    # Guard first so the index lookups below are always in range.
	    if len(word) != 6:
	        return False
	    second, fourth, last = word[1], word[3], word[-1]
	    return (
	        last == u'а'
	        and second in (u'а', u'о')
	        and fourth in (u'а', u'о')
	    )


	def check_unwished_consonants(word):
	    """Return True when word contains none of the letters й, л, р, м, н."""
	    return re.search(u'[йлрмн]', word) is None


	# Input word list. 'statements2.txt' and 'dict.txt' were alternative inputs;
	# the earlier read of 'dict.txt' is dropped because its result was overwritten
	# by this read before it was ever used.
	source = read_file('polus.txt')
	uniq_words_array = get_only_words_from_text(source)

	# Print, encoded as cp1251, every unique word that fits the vowel pattern and
	# contains none of the unwanted consonants.
	for current_word in uniq_words_array:
	    if check_vowels(current_word) and check_unwished_consonants(current_word):
	        print(current_word.encode('cp1251'))