# FloydanTheBeast/linguistics.py

## linguistics.py
from nltk import sent_tokenize, word_tokenize
from string import punctuation
from collections import defaultdict
import operator

def count_one_symbol_words(word_list):
    """Count the single-character tokens in *word_list* that are words.

    A token is counted when it is exactly one character long and is not a
    punctuation mark.  Besides the ASCII marks listed in
    ``string.punctuation``, any character whose Unicode general category is
    punctuation (``P*`` -- e.g. the dash "—", the guillemets "«" / "»", the
    ellipsis "…") is excluded as well.

    Args:
        word_list: Iterable of string tokens.

    Returns:
        The number of one-character, non-punctuation tokens.
    """
    # Local import: keeps the function usable without touching the
    # module-level import block.
    from unicodedata import category

    return sum(
        1
        for word in word_list
        if len(word) == 1
        and word not in punctuation
        and not category(word).startswith('P')
    )

# Read the whole input text.  Plain read mode is sufficient (the previous
# 'r+' additionally required write permission) and the context manager
# guarantees the file is closed.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm it matches the encoding of the input file.
with open('text_for_test.txt', 'r') as text_file:
    text = text_file.read()
sents = sent_tokenize(text)

word_counter = 0       # tokens not found in string.punctuation
raw_word_counter = 0   # every token, punctuation included
most_one_symbols = 0   # sentence with the most one-symbol words; 0 = none yet
most_one_symbols_counter = 0

# Maps a lower-cased token to its count (later: its relative frequency).
# NOTE(review): punctuation tokens are counted here too, so they can show up
# among the "most frequent words" printed below -- confirm this is intended.
frequency_dict = defaultdict(int)

for sent in sents:
    word_list = word_tokenize(sent)
    raw_word_counter += len(word_list)
    word_counter += sum(1 for token in word_list if token not in punctuation)
    one_symbol_counter = count_one_symbol_words(word_list)
    # Strict '>' keeps the earliest sentence when several share the maximum.
    if one_symbol_counter > most_one_symbols_counter:
        most_one_symbols_counter = one_symbol_counter
        most_one_symbols = sent
    for word in word_list:
        frequency_dict[word.lower()] += 1

# Convert absolute counts into frequencies relative to all tokens.  Keys are
# already lower-cased, so they are used as-is.
for word in frequency_dict:
    frequency_dict[word] /= raw_word_counter

# Average number of non-punctuation tokens per sentence.  An empty text has
# no sentences; report 0.0 instead of dividing by zero.
average_word_frequency = round(word_counter / len(sents), 3) if sents else 0.0

if most_one_symbols:
    print('Наибольшее количество односимвольных слов в предложении:\n {}\n Их количество - {}'.format(most_one_symbols, most_one_symbols_counter))
else:
    print('В тексте нет ни одного предложения с односимвольным словом')

# Typo in the message fixed ('Средляя' -> 'Средняя').
print('Средняя длина предложения - {}'.format(average_word_frequency))

# Re-build the mapping ordered by descending frequency.
frequency_dict = dict(sorted(frequency_dict.items(), key=operator.itemgetter(1), reverse=True))

print('10 самых частоиспользуемых слов:\n')
for word, frequency in list(frequency_dict.items())[:10]:
    print('{}: {}'.format(word, frequency))
	from nltk import sent_tokenize, word_tokenize
	from string import punctuation
	from collections import defaultdict
	import operator

	def count_one_symbol_words(word_list):
		"""Count the single-character tokens in *word_list* that are words.

		A token is counted when it is exactly one character long and is not a
		punctuation mark.  Besides the ASCII marks listed in
		``string.punctuation``, any character whose Unicode general category
		is punctuation (``P*`` -- e.g. the dash "—", the guillemets "«" / "»",
		the ellipsis "…") is excluded as well.

		Args:
			word_list: Iterable of string tokens.

		Returns:
			The number of one-character, non-punctuation tokens.
		"""
		# Local import: keeps the function usable without touching the
		# module-level import block.
		from unicodedata import category

		return sum(
			1
			for word in word_list
			if len(word) == 1
			and word not in punctuation
			and not category(word).startswith('P')
		)

	# Read the whole input text.  Plain read mode is sufficient (the previous
	# 'r+' additionally required write permission) and the context manager
	# guarantees the file is closed.
	# NOTE(review): no encoding is passed, so the platform default is used --
	# confirm it matches the encoding of the input file.
	with open('text_for_test.txt', 'r') as text_file:
		text = text_file.read()
	sents = sent_tokenize(text)

	word_counter = 0       # tokens not found in string.punctuation
	raw_word_counter = 0   # every token, punctuation included
	most_one_symbols = 0   # sentence with the most one-symbol words; 0 = none yet
	most_one_symbols_counter = 0

	# Maps a lower-cased token to its count (later: its relative frequency).
	# NOTE(review): punctuation tokens are counted here too, so they can show
	# up among the "most frequent words" printed below -- confirm intended.
	frequency_dict = defaultdict(int)

	for sent in sents:
		word_list = word_tokenize(sent)
		raw_word_counter += len(word_list)
		word_counter += sum(1 for token in word_list if token not in punctuation)
		one_symbol_counter = count_one_symbol_words(word_list)
		# Strict '>' keeps the earliest sentence when several share the maximum.
		if one_symbol_counter > most_one_symbols_counter:
			most_one_symbols_counter = one_symbol_counter
			most_one_symbols = sent
		for word in word_list:
			frequency_dict[word.lower()] += 1

	# Convert absolute counts into frequencies relative to all tokens.  Keys
	# are already lower-cased, so they are used as-is.
	for word in frequency_dict:
		frequency_dict[word] /= raw_word_counter

	# Average number of non-punctuation tokens per sentence.  An empty text
	# has no sentences; report 0.0 instead of dividing by zero.
	average_word_frequency = round(word_counter / len(sents), 3) if sents else 0.0

	if most_one_symbols:
		print('Наибольшее количество односимвольных слов в предложении:\n {}\n Их количество - {}'.format(most_one_symbols, most_one_symbols_counter))
	else:
		print('В тексте нет ни одного предложения с односимвольным словом')

	# Typo in the message fixed ('Средляя' -> 'Средняя').
	print('Средняя длина предложения - {}'.format(average_word_frequency))

	# Re-build the mapping ordered by descending frequency.
	frequency_dict = dict(sorted(frequency_dict.items(), key=operator.itemgetter(1), reverse=True))

	print('10 самых частоиспользуемых слов:\n')
	for word, frequency in list(frequency_dict.items())[:10]:
		print('{}: {}'.format(word, frequency))