Created
December 21, 2018 14:38
-
-
Save FloydanTheBeast/d78033ce1fd359bfde6ea1d7047cdf5d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk import sent_tokenize, word_tokenize | |
from string import punctuation | |
from collections import defaultdict | |
import operator | |
def count_one_symbol_words(word_list):
    """Return the number of one-character words in *word_list*.

    A token counts as a one-symbol word when it is exactly one character
    long and that character is alphanumeric. Testing ``isalnum()`` rather
    than membership in ``string.punctuation`` also rejects non-ASCII
    punctuation tokens (dashes, guillemets, ellipsis and the like), which
    ``string.punctuation`` does not contain.

    Args:
        word_list: Iterable of string tokens.

    Returns:
        The number of tokens that consist of a single alphanumeric character.
    """
    return sum(1 for word in word_list if len(word) == 1 and word.isalnum())
def _is_word(token):
    """Return True if *token* contains at least one alphanumeric character.

    Used to tell word tokens apart from punctuation tokens, including
    multi-character and non-ASCII ones that are not substrings of
    ``string.punctuation``.
    """
    return any(char.isalnum() for char in token)


# Read-only access is sufficient, and the context manager guarantees the
# file handle is closed. The platform default encoding is used; pass
# ``encoding=`` explicitly if the encoding of the input file is known.
with open('text_for_test.txt', 'r') as text_file:
    text = text_file.read()

sents = sent_tokenize(text)
word_counter = 0              # tokens that are words (punctuation excluded)
raw_word_counter = 0          # all tokens, punctuation included
most_one_symbols = 0          # sentence with the most one-symbol words; 0 until one is found
most_one_symbols_counter = 0  # number of one-symbol words in that sentence
frequency_dict = defaultdict(int)

for sent in sents:
    word_list = word_tokenize(sent)
    words = [token for token in word_list if _is_word(token)]
    raw_word_counter += len(word_list)
    word_counter += len(words)

    one_symbol_counter = count_one_symbol_words(word_list)
    # Strict comparison: on a tie the earliest sentence is kept.
    if one_symbol_counter > most_one_symbols_counter:
        most_one_symbols_counter = one_symbol_counter
        most_one_symbols = sent

    # Only word tokens are counted, so punctuation cannot appear in the
    # list of most used words printed below.
    for word in words:
        frequency_dict[word.lower()] += 1

# Turn absolute counts into shares of all tokens. Keys are already
# lower-cased; only values change, so iterating the dict here is safe.
for word in frequency_dict:
    frequency_dict[word] /= raw_word_counter

# Despite its name, this holds the average sentence length in words.
# An empty text yields no sentences; report 0.0 instead of dividing by zero.
average_word_frequency = round(word_counter / len(sents), 3) if sents else 0.0

if most_one_symbols:
    print('Наибольшее количество односимвольных слов в предложении:\n {}\n Их количество - {}'.format(most_one_symbols, most_one_symbols_counter))
else:
    print('В тексте нет ни одного предложения с односимвольным словом')
print('Средняя длина предложения - {}'.format(average_word_frequency))

# Sort by share, highest first; the sort is stable, so equally frequent
# words keep their order of first appearance.
frequency_dict = dict(sorted(frequency_dict.items(), key=operator.itemgetter(1), reverse=True))
print('10 самых частоиспользуемых слов:\n')
for word, frequency in list(frequency_dict.items())[:10]:
    print('{}: {}'.format(word, frequency))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment