Last active
November 16, 2015 22:57
-
-
Save otykhonruk/f431a5fd390d59dbba59 to your computer and use it in GitHub Desktop.
Simple SEO-related text metrics.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import sys | |
from collections import Counter | |
from pymorphy2 import MorphAnalyzer | |
# Shared morphological analyzer, created once at import time; norm() uses it.
morph = MorphAnalyzer()
def words(text):
    """Split *text* into lower-cased word tokens.

    A token is a run of Russian or Latin letters, optionally joined by
    single inner hyphens (e.g. ``кто-то``).

    Args:
        text: Arbitrary input string.

    Returns:
        List of lower-cased tokens in order of appearance.
    """
    # 'ё' (U+0451) lies outside the contiguous а-я range, so it has to be
    # listed explicitly; otherwise words such as "ёлка" are cut apart.
    # Hyphens are accepted only between letters so that a free-standing
    # dash used as punctuation is not counted as a word.
    return re.findall(r'[а-яёa-z]+(?:-[а-яёa-z]+)*', text.lower())
def norm(word):
    """Return the normal form reported by the first parse of *word*.

    The lookup goes through the module-level ``morph`` analyzer.
    """
    first_parse = morph.parse(word)[0]
    return first_parse.normal_form
def stats(text, stopwords):
    """Print word-frequency metrics for *text* to stdout.

    Words are extracted with ``words`` and reduced to their normal form
    with ``norm`` before counting.

    Args:
        text: The text to analyse.
        stopwords: Container of normalized words to leave out of the
            keyword ranking (membership is tested with ``in``).

    Prints the total and unique word counts, the number of distinct stop
    words found, the 20 most common non-stop words with their share of
    all words, and finally the combined count and share of the top 4.
    """
    words_ = [norm(w) for w in words(text)]
    uniq = set(words_)
    total = len(words_)

    def share(count):
        # A text without any words would otherwise divide by zero.
        return count / total if total else 0.0

    print('Total words: ', total)
    print('Unique words: ', len(uniq))
    print('Stop words: ', sum(1 for k in uniq if k in stopwords))

    cnt = Counter(w for w in words_ if w not in stopwords)
    top = cnt.most_common(20)

    print('Keyword\tCount\tFreq')
    for k, v in top:
        print('{:<20}{:>5}{:>10.4f}'.format(k, v, share(v)))

    # most_common() is ordered, so the top 4 are the head of the top 20.
    s = sum(v for _, v in top[:4])
    print(s, share(s))
if __name__ == '__main__':
    # Checking the argument count inside the guard keeps the usage message
    # from being printed when this file is merely imported as a module.
    if len(sys.argv) > 1:
        # Optional second argument overrides the default stop-word file.
        sw_path = sys.argv[2] if len(sys.argv) > 2 else '/home/alt/tmp/sw.txt'
        # Explicit encoding so reading does not depend on the platform's
        # locale default (assumes both files are UTF-8 -- confirm).
        with open(sw_path, 'r', encoding='utf-8') as f:
            stopwords = set(f.read().splitlines())
        with open(sys.argv[1], 'r', encoding='utf-8') as f:
            text = f.read()
        stats(text, stopwords)
    else:
        print('Usage: <text.txt> [stopwords.txt]')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
алтухов | |
большой | |
бы | |
быть | |
в | |
весь | |
вот | |
все | |
всей | |
вы | |
говорить | |
год | |
да | |
для | |
до | |
еще | |
же | |
знать | |
и | |
из | |
к | |
как | |
который | |
мочь | |
мы | |
на | |
наш | |
не | |
него | |
нее | |
нет | |
них | |
но | |
о | |
один | |
она | |
они | |
оно | |
оный | |
от | |
ото | |
по | |
с | |
свой | |
себя | |
сказать | |
та | |
такой | |
только | |
тот | |
ты | |
у | |
что | |
это | |
этот | |
я |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment