This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import nltk | |
import codecs | |
from urllib import urlopen | |
def print_list(mylist): | |
'''Print a list containing unicode characters.''' | |
print '[' + ', '.join( |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
139942 | |
31179 | |
[Мастер, и, Маргарита, Мастер, и, Маргарита, Михаил, Булгаков, , Москва, 1984, г., Текст, печатается, в, последней, прижизнен-, ной, редакции, (, рукописи, хранятся, в, руко-, писном, отделе, Государственной, библио-, теки, СССР, имени, В., И., Ленина, ), ,, а, также, с, исправлениями, и, дополнениями, ,, сделан-, ными, под, диктовку, писателя, его, женой, ,, Е., С., Булгаковой., Содержание, глава, I., Никогда, не, разговаривайте, с, неизвестными, /, 9, глава, II., Понтий, Пилат, /, 23, глава, III., седьмое, доказательство, /, 49, глава, IV., Погоня, /, 55, глава, V., Было, дело, в, грибоедове, /, 63, глава, VI., Шизофрения, ,, как, и, было, сказано, /, 77, глава, VII., Нехорошая, квартирка, /, 87, глава, VIII., Поединок, между, профессором, и, поэтом, /, 99, глава, IX., Коровьевские, штуки, /, 109, глава, X., Вести, из, Ялты, /, 119, глава, XI., раздвоение, ивана, /, 131, глава, XII., Черная, магия, и, ее, разоблачение, /, 135, глава, XIII., Явление, героя, /, 151, глава, XIV., слава, петуху, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# possible endings of diminutive stems | |
diminutive_endings = ( | |
# first degree of expressiveness | |
#u"к", | |
u"ик", | |
u"чик", | |
u"ок", #u"ек", | |
u"ец", u"иц", | |
u"енок", u"онок", u"еныш", | |
u"инк", u"инок", |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
лиц 202 | |
буфетчик 72 | |
наконец 66 | |
ник 66 | |
лестниц 44 | |
аннушк 41 | |
улиц 34 | |
милиц 33 | |
пок 33 | |
маленьк 31 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.stem import SnowballStemmer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Stem every token with NLTK's Russian Snowball stemmer.
# `tokens` is produced earlier in the script (not shown in this fragment).
stemmer = SnowballStemmer("russian")
# Each token is coerced with unicode() (Python 2) before it is stemmed.
stemlist = [stemmer.stem(unicode(item)) for item in tokens]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# possible endings of diminutive stems | |
diminutive_endings = ( | |
# first degree of expressiveness | |
#u"к", | |
u"ик", | |
u"чик", | |
u"ок", #u"ек", | |
u"ец", u"иц", | |
u"енок", u"онок", u"еныш", | |
u"инк", u"инок", |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Keep only the stems that end in one of the candidate diminutive endings.
# `diminutive_endings` is a tuple, and str.endswith accepts a tuple of
# suffixes, so a single call tests every ending.
diminutives = [
    word for word in stemlist
    if word.endswith(diminutive_endings)
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk import FreqDist |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build an NLTK frequency distribution over the collected diminutive stems.
dim_fd = FreqDist(diminutives)
OlderNewer