Skip to content

Instantly share code, notes, and snippets.

@bunyk
Created August 11, 2013 16:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bunyk/6205483 to your computer and use it in GitHub Desktop.
Save bunyk/6205483 to your computer and use it in GitHub Desktop.
Статистика слів в книжках.
#! /usr/bin/python3
from glob import glob
from functools import reduce
from collections import Counter
import re
import sys
# One token is a maximal run of word characters, backticks and hyphens.
# NOTE(review): the backtick is presumably accepted as an apostrophe
# substitute inside words -- confirm against the input texts.
TOKENIZE_REGEX = re.compile(r'[\w`-]+')


def tokenize(text):
    """Lazily yield the tokens of *text* in order of appearance.

    A token is any substring matched by ``TOKENIZE_REGEX``.  Tokens are
    returned exactly as they appear in the text (no case folding).

    Args:
        text: The string to split into tokens.

    Returns:
        A generator of token strings; empty when nothing matches.
    """
    return (match.group(0) for match in TOKENIZE_REGEX.finditer(text))
def most_common_set(d, count):
    """Return the set of the *count* most frequent keys of counter *d*.

    Args:
        d: A ``collections.Counter`` (anything providing ``most_common``).
        count: Maximum number of keys to take from the top of the ranking.

    Returns:
        A set with at most *count* keys; fewer when *d* has fewer entries.
    """
    return {key for key, _ in d.most_common(count)}
def main():
    """Print word-frequency statistics for every file matching ``books/*``.

    First prints the words that are among the 1000 most frequent tokens of
    every book.  Then, for each book, prints the total and distinct token
    counts, how many of the most frequent words are needed to exceed 50%
    and 90% of all tokens, and which of the book's 100 most frequent words
    are missing from the shared list.
    """
    dicts = {f: count(f) for f in glob('books/*')}
    stats = {
        f: {
            1000: most_common_set(d, 1000),
            100: most_common_set(d, 100),
            'total': sum(d.values()),
            'unique': len(d),
            0.5: count_for_percentile(d, 0.5),
            0.9: count_for_percentile(d, 0.9),
        }
        for f, d in dicts.items()
    }

    top_sets = [book_stats[1000] for book_stats in stats.values()]
    # With no books there is nothing to intersect; fall back to an empty
    # set instead of failing on an empty sequence.
    most_common = set.intersection(*top_sets) if top_sets else set()

    print('Слова які входять в 1000 найчастіше вживаних '
          'в КОЖНІЙ книжці (загалом %s):' % len(most_common))
    # Sets have no defined order; sort so the report is reproducible.
    print(', '.join(sorted(most_common)))

    # Iterate books in sorted order for a stable report, and use a distinct
    # name for the per-book record so the ``stats`` dict is not shadowed.
    for book in sorted(stats):
        book_stats = stats[book]
        print()
        print(book)
        print('=' * len(book))
        print('Загалом слів: %s' % book_stats['total'])
        print('Різних слів: %s' % book_stats['unique'])
        print('Слів для розуміння половини тексту: %s' % book_stats[0.5])
        print('Слів для розуміння 90%% тексту: %s' % book_stats[0.9])
        unique = book_stats[100].difference(most_common)
        print('\nСлова з 100 найчастіше вживаних в кожній книжці, '
              'що не входять до спільного списку (загалом %s):' % len(unique))
        print(', '.join(sorted(unique)))
def count(filename):
    """Count token occurrences in the text file *filename*.

    The file is read line by line and every line is split with
    ``tokenize``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``collections.Counter`` mapping each token to its frequency.
    """
    dictionary = Counter()
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, encoding='utf-8') as stream:
        for line in stream:
            dictionary.update(tokenize(line))
    return dictionary
def count_for_percentile(d, percentile):
    """Return how many top-ranked words are needed to exceed *percentile*.

    Words are taken in descending order of frequency until their combined
    share of all occurrences is strictly greater than *percentile*.

    Args:
        d: A ``collections.Counter`` of word frequencies.
        percentile: Target share of all occurrences, as a fraction
            (for example ``0.5`` for one half).

    Returns:
        The number of words taken.  When the share is never exceeded
        (for example ``percentile >= 1.0``) this is the number of entries
        in *d*; for an empty or all-zero counter it is ``0``.
    """
    total = sum(d.values())
    if not total:
        # Nothing was counted, so no words are needed (and the division
        # below would be by zero).
        return 0

    needed = 0
    covered = 0
    for needed, (_, frequency) in enumerate(d.most_common(), start=1):
        covered += frequency
        if covered / total > percentile:
            break
    # Falling out of the loop without a break means every word was needed.
    return needed
# Run the report only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment