bunyk/book_stats.py

## book_stats.py
#! /usr/bin/python3

from glob import glob
from functools import reduce
from collections import Counter
import re
import sys

# A token is a maximal run of word characters (\w), backticks, or hyphens.
TOKENIZE_REGEX = re.compile(r'[\w`-]+')

def tokenize(text):
    """Return a lazy iterator over the tokens found in *text*.

    Tokens are the non-overlapping matches of ``TOKENIZE_REGEX``, produced
    in the order they appear in *text*.
    """
    matches = TOKENIZE_REGEX.finditer(text)
    return (match.group() for match in matches)

def most_common_set(d, count):
    """Return the set of the *count* most frequent keys of counter *d*.

    *d* must provide ``most_common(count)`` yielding ``(key, frequency)``
    pairs; only the keys are kept.
    """
    return {word for word, _frequency in d.most_common(count)}

def main():
    """Print word-frequency statistics for every file matched by ``books/*``.

    First prints the tokens that are among the 1000 most frequent in every
    file. Then, for each file, prints the total and distinct token counts,
    how many of the most frequent tokens cover 50% and 90% of the text, and
    which of its 100 most frequent tokens are missing from the shared list.

    Exits with an error message when the glob matches no files.
    """
    dicts = {f: count(f) for f in glob('books/*')}
    if not dicts:
        # Nothing to intersect or report; fail with a readable message
        # instead of a TypeError from intersecting zero sets.
        sys.exit('Не знайдено жодного файлу в books/')

    stats = {
        f: {
            1000: most_common_set(d, 1000),
            100: most_common_set(d, 100),
            'total': sum(d.values()),
            'unique': len(d),
            0.5: count_for_percentile(d, 0.5),
            0.9: count_for_percentile(d, 0.9),
        }
        for f, d in dicts.items()
    }
    most_common = set.intersection(*(v[1000] for v in stats.values()))
    print('Слова які входять в 1000 найчастіше вживаних '
        'в КОЖНІЙ книжці (загалом %s):' % len(most_common)
    )
    # Sets iterate in arbitrary order; sort so repeated runs print the same.
    print(', '.join(sorted(most_common)))

    # The loop variable is deliberately not called ``stats``: that name is
    # the dict being iterated and must not be rebound.
    for book, book_stats in stats.items():
        print()
        print(book)
        print('=' * len(book))

        print('Загалом слів: %s' % book_stats['total'])
        print('Різних слів: %s' % book_stats['unique'])
        print('Слів для розуміння половини тексту: %s' % book_stats[0.5])
        print('Слів для розуміння 90%% тексту: %s' % book_stats[0.9])

        unique = book_stats[100].difference(most_common)
        print('\nСлова з 100 найчастіше вживаних в кожній книжці, '
            'що не входять до спільного списку (загалом %s):' % len(unique))
        print(', '.join(sorted(unique)))


def count(filename):
    """Count how often each token occurs in the text file *filename*.

    The file is read line by line and every line is split with
    ``tokenize``. Returns a ``Counter`` mapping token -> occurrences.
    """
    dictionary = Counter()

    # Decode explicitly instead of relying on the locale default, which is
    # not UTF-8 on every platform.
    # NOTE(review): assumes the books are stored as UTF-8 - confirm.
    with open(filename, encoding='utf-8') as stream:
        for line in stream:
            dictionary.update(tokenize(line))

    return dictionary

def count_for_percentile(d, percentile):
    """Return how many of the most frequent words cover a share of the text.

    Words are taken from counter *d* in descending frequency order until
    their cumulative frequency reaches at least *percentile* of the total.

    *percentile* is a fraction (0.5 means half of all occurrences), not a
    value on a 0-100 scale.

    Returns 0 when *d* has no occurrences at all, and the number of distinct
    words when even all of them do not reach *percentile* (i.e. it is > 1).
    """
    total = sum(d.values())
    if not total:
        # No text: nothing is needed, and dividing by zero must be avoided.
        return 0

    res = 0
    sum_frequency = 0
    for res, (_word, frequency) in enumerate(d.most_common(), start=1):
        sum_frequency += frequency
        # ">=" so that hitting the target exactly counts as covered; a
        # strict ">" would demand one extra word and never succeed at 1.0.
        if sum_frequency / total >= percentile:
            return res
    return res

# Run the report only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
	#! /usr/bin/python3

	from glob import glob
	from functools import reduce
	from collections import Counter
	import re
	import sys

	# A token is a maximal run of word characters (\w), backticks, or hyphens.
	TOKENIZE_REGEX = re.compile(r'[\w`-]+')

	def tokenize(text):
	    """Return a lazy iterator over the tokens found in *text*.

	    Tokens are the non-overlapping matches of ``TOKENIZE_REGEX``, produced
	    in the order they appear in *text*.
	    """
	    return (m.group(0) for m in TOKENIZE_REGEX.finditer(text))

	def most_common_set(d, count):
	    """Return the set of the *count* most frequent keys of counter *d*.

	    *d* must provide ``most_common(count)`` yielding ``(key, frequency)``
	    pairs; only the keys are kept.
	    """
	    return {word for word, _frequency in d.most_common(count)}

	def main():
	    """Print word-frequency statistics for every file matched by ``books/*``.

	    First prints the tokens that are among the 1000 most frequent in every
	    file. Then, for each file, prints the total and distinct token counts,
	    how many of the most frequent tokens cover 50% and 90% of the text, and
	    which of its 100 most frequent tokens are missing from the shared list.

	    Exits with an error message when the glob matches no files.
	    """
	    dicts = {f: count(f) for f in glob('books/*')}
	    if not dicts:
	        # Nothing to intersect or report; fail with a readable message
	        # instead of a TypeError from intersecting zero sets.
	        sys.exit('Не знайдено жодного файлу в books/')

	    stats = {
	        f: {
	            1000: most_common_set(d, 1000),
	            100: most_common_set(d, 100),
	            'total': sum(d.values()),
	            'unique': len(d),
	            0.5: count_for_percentile(d, 0.5),
	            0.9: count_for_percentile(d, 0.9),
	        }
	        for f, d in dicts.items()
	    }
	    most_common = set.intersection(*(v[1000] for v in stats.values()))
	    print('Слова які входять в 1000 найчастіше вживаних '
	        'в КОЖНІЙ книжці (загалом %s):' % len(most_common)
	    )
	    # Sets iterate in arbitrary order; sort so repeated runs print the same.
	    print(', '.join(sorted(most_common)))

	    # The loop variable is deliberately not called ``stats``: that name is
	    # the dict being iterated and must not be rebound.
	    for book, book_stats in stats.items():
	        print()
	        print(book)
	        print('=' * len(book))

	        print('Загалом слів: %s' % book_stats['total'])
	        print('Різних слів: %s' % book_stats['unique'])
	        print('Слів для розуміння половини тексту: %s' % book_stats[0.5])
	        print('Слів для розуміння 90%% тексту: %s' % book_stats[0.9])

	        unique = book_stats[100].difference(most_common)
	        print('\nСлова з 100 найчастіше вживаних в кожній книжці, '
	            'що не входять до спільного списку (загалом %s):' % len(unique))
	        print(', '.join(sorted(unique)))


	def count(filename):
	    """Count how often each token occurs in the text file *filename*.

	    The file is read line by line and every line is split with
	    ``tokenize``. Returns a ``Counter`` mapping token -> occurrences.
	    """
	    dictionary = Counter()

	    # Decode explicitly instead of relying on the locale default, which is
	    # not UTF-8 on every platform.
	    # NOTE(review): assumes the books are stored as UTF-8 - confirm.
	    with open(filename, encoding='utf-8') as stream:
	        for line in stream:
	            dictionary.update(tokenize(line))

	    return dictionary

	def count_for_percentile(d, percentile):
	    """Return how many of the most frequent words cover a share of the text.

	    Words are taken from counter *d* in descending frequency order until
	    their cumulative frequency reaches at least *percentile* of the total.

	    *percentile* is a fraction (0.5 means half of all occurrences), not a
	    value on a 0-100 scale.

	    Returns 0 when *d* has no occurrences at all, and the number of distinct
	    words when even all of them do not reach *percentile* (i.e. it is > 1).
	    """
	    total = sum(d.values())
	    if not total:
	        # No text: nothing is needed, and dividing by zero must be avoided.
	        return 0

	    res = 0
	    sum_frequency = 0
	    for res, (_word, frequency) in enumerate(d.most_common(), start=1):
	        sum_frequency += frequency
	        # ">=" so that hitting the target exactly counts as covered; a
	        # strict ">" would demand one extra word and never succeed at 1.0.
	        if sum_frequency / total >= percentile:
	            return res
	    return res

	# Run the report only when executed as a script, not when imported.
	if __name__ == '__main__':
	    main()