View gist:1588878
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division, print_function  # must precede every other import

import nltk

# Print the lexical diversity (distinct tokens / total tokens) of every
# Brown corpus genre as a percentage.
for genre in nltk.corpus.brown.categories():
    words = nltk.corpus.brown.words(categories=genre)
    # Scale to a percentage before rounding so no float noise trails the digits.
    diversity = round(len(set(words)) / len(words) * 100, 4)
    print(genre + ' - ' + str(diversity) + '%')
View gist:1589239
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def freq_non_stopwords(text):
    """Return the 50 most frequent words of *text* that are not English stopwords.

    text: an iterable of word tokens (strings).
    Returns a list of at most 50 distinct tokens, most frequent first.
    """
    # A set gives constant-time membership tests; a list is rescanned per token.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    clean_list = [w for w in text if w.lower() not in stopwords]  # drop stopwords
    freqdist = nltk.probability.FreqDist(clean_list)
    # Rank explicitly instead of slicing keys(): key order is only
    # frequency-sorted in NLTK 2, and keys() is not sliceable on Python 3.
    ranked = sorted(freqdist, key=freqdist.get, reverse=True)
    return ranked[:50]
View gist:1589515
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def top_bigrams(text):
    """Return the bigrams of *text* whose two members are alphabetic non-stopwords.

    text: a sequence of word tokens (strings).
    Returns a list of (first, second) tuples, most frequent bigram first.
    """
    fdist = nltk.probability.FreqDist(nltk.bigrams(text))  # count every bigram
    stopwords = set(nltk.corpus.stopwords.words('english'))  # set: O(1) lookups
    # Rank explicitly; keys() is only frequency-ordered in NLTK 2.
    ranked = sorted(fdist, key=fdist.get, reverse=True)
    # Lower-case before the stoplist test so capitalised stopwords such as
    # "The" are filtered out as well.
    return [
        (x, y)
        for x, y in ranked
        if x.isalpha() and y.isalpha()
        and x.lower() not in stopwords and y.lower() not in stopwords
    ]
View gist:1593588
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def word_freq(word, section):
    """Return how many times *word* occurs in the Brown corpus category *section*."""
    section_words = nltk.corpus.brown.words(categories=section)
    counts = nltk.probability.FreqDist(section_words)
    return counts[word]
View gist:1593818
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
d = nltk.corpus.cmudict.dict() # load the cmudict corpus as a dictionary for convenient lookup by word
def count_syllables(text, pronunciations=None):
    """Count the syllables in *text*, given as a list of words.

    text: iterable of words; each must be a key of the pronunciation dictionary.
    pronunciations: optional mapping of word -> list of pronunciations, each a
        list of phoneme strings; defaults to the module-level ``d``.
    Returns the total syllable count, using the first pronunciation of each word.
    Raises KeyError for a word that is missing from the dictionary.
    """
    if pronunciations is None:
        pronunciations = d
    total = 0
    for word in text:
        first = pronunciations[word][0]  # first listed pronunciation only
        # Vowel phonemes end in a stress digit (e.g. 'AH0'), one per syllable;
        # counting every phoneme would overcount.
        total += sum(1 for phoneme in first if phoneme[-1:].isdigit())
    return total
View gist:1616022
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
import matplotlib.pyplot as plt | |
words = nltk.corpus.brown.words() # take every word of the corpus
def zipf_law(words): | |
freq_dist = nltk.FreqDist(words)#считаем кол-во вхождений | |
xaxis = [] | |
yaxis = [] | |
View gist:1616220
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
import matplotlib.pyplot as plt | |
import random | |
# Build a string of random characters drawn from the full lowercase alphabet
# plus the space character.
# Characters are collected in a list and joined once, so the growing text is
# not re-copied on every append.
random_chars = []
while len(random_chars) < 9964284:  # number of letters in the Brown corpus
    random_chars.append(random.choice("abcdefghijklmnopqrstuvwxyz "))
random_string = ''.join(random_chars)
View gist:1627060
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The idea of the algorithm: from the text we build a collection of bigrams - tuples of the form (x, y).
# Then we take a starting word and pick a random word that follows it in a bigram.
# To that random word we append the next random word from its bigrams, and so on.
# Choosing at random helps to avoid creating "loops".
import nltk | |
import random | |
def generate_model(cfdist, word, num=15): | |
for i in range(num): |
View gist:1627107
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The idea of the algorithm: from the text we build a collection of bigrams - tuples of the form (x, y).
# Then we take a starting word and pick a random word that follows it in a bigram.
# To that random word we append the next random word from its bigrams, and so on.
# Choosing at random helps to avoid creating "loops".
# The words are taken from a corpus mixed from two genres.
import nltk | |
import random | |
def generate_model(cfdist, word, num=15): |
View video_cont.php
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<? function video_preview() | |
{ | |
$this->load->model('text/text_model'); | |
$params = array( | |
'node_tree_id' => 22,#заменить на требуемое | |
'from_subfolders' => 1, | |
'full_text' => 1, | |
'limit' => 4 | |
); |
OlderNewer