-
-
Save beautyfree/01e3dd705add4cd5e2c7 to your computer and use it in GitHub Desktop.
Simple SEO-related text metrics.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import sys | |
from collections import Counter | |
from pymorphy2 import MorphAnalyzer | |
morph = MorphAnalyzer() | |
def words(text):
    """Split *text* into lowercase word tokens.

    A token is a run of Latin or Russian Cyrillic letters; single hyphens
    are kept only between letters (e.g. ``кто-то``), so stand-alone
    hyphens/dashes and leading/trailing hyphens are not counted as words.

    :param text: arbitrary input string.
    :return: list of lowercase tokens in order of appearance.
    """
    # 'ё' (U+0451) lies outside the contiguous 'а'-'я' range and has to be
    # listed explicitly, otherwise words such as "ёлка" get split apart.
    # The text is lowercased first, so upper-case ranges are unnecessary.
    return re.findall(r'[a-zа-яё]+(?:-[a-zа-яё]+)*', text.lower())
def norm(word):
    """Return the normal form of *word*.

    The word is analysed with the module-level ``morph`` analyzer and the
    ``normal_form`` attribute of the first parse in the returned list is
    used.
    """
    first_parse = morph.parse(word)[0]
    return first_parse.normal_form
def stats(text, stopwords):
    """Print simple keyword statistics for *text* to stdout.

    The text is tokenised with ``words`` and every token is normalised with
    ``norm``.  The report contains the total number of words, the number of
    unique normalised words, how many of the unique words are stop words,
    the 20 most common non-stop words with their counts and frequencies
    (relative to the total word count), and finally the combined count and
    frequency of the 4 most common non-stop words.

    :param text: text to analyse.
    :param stopwords: container of normalised stop words (membership is
        tested with ``in``, so a ``set`` is the efficient choice).
    :return: None; the report is printed.
    """
    words_ = [norm(w) for w in words(text)]
    uniq = set(words_)
    total = len(words_)

    def share(count):
        # A text without any words would otherwise raise ZeroDivisionError.
        return count / total if total else 0.0

    print('Total words: ', total)
    print('Unique words: ', len(uniq))
    print('Stop words: ', sum(1 for k in uniq if k in stopwords))

    nostop = [w for w in words_ if w not in stopwords]
    cnt = Counter(nostop)
    print('Keyword\tCount\tFreq')
    for k, v in cnt.most_common(20):
        print('{:<20}{:>5}{:>10.4f}'.format(k, v, share(v)))

    # Combined weight of the four most frequent keywords.
    s = sum(v for _, v in cnt.most_common(4))
    print(s, share(s))
# Script entry point: ``<script> <text.txt>`` prints statistics for the file.
# The argument check is nested inside the ``__main__`` guard so that merely
# importing this module no longer prints the usage message.
if __name__ == '__main__':
    if len(sys.argv) > 1:
        # Both files hold Cyrillic text; read them as UTF-8 explicitly rather
        # than relying on the platform's default locale encoding.
        # NOTE(review): the stop-word list path is hard-coded.
        with open('/home/alt/tmp/sw.txt', 'r', encoding='utf-8') as f:
            stopwords = set(f.read().splitlines())
        with open(sys.argv[1], 'r', encoding='utf-8') as f:
            text = f.read()
        stats(text, stopwords)
    else:
        print('Usage: <text.txt>')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
алтухов | |
большой | |
бы | |
быть | |
в | |
весь | |
вот | |
все | |
всей | |
вы | |
говорить | |
год | |
да | |
для | |
до | |
еще | |
же | |
знать | |
и | |
из | |
к | |
как | |
который | |
мочь | |
мы | |
на | |
наш | |
не | |
него | |
нее | |
нет | |
них | |
но | |
о | |
один | |
она | |
они | |
оно | |
оный | |
от | |
ото | |
по | |
с | |
свой | |
себя | |
сказать | |
та | |
такой | |
только | |
тот | |
ты | |
у | |
что | |
это | |
этот | |
я |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment