This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division  # must come before any other statement in the module

import nltk

# Report the lexical diversity (distinct tokens / total tokens) of every
# Brown corpus category as a percentage, one line per category.
for genre in nltk.corpus.brown.categories():
    words = nltk.corpus.brown.words(categories=genre)
    # Scale to percent *before* rounding: multiplying an already rounded
    # ratio by 100 re-introduces float noise into the printed value.
    diversity = round(100 * len(set(words)) / len(words), 4)
    # A single parenthesised argument prints identically on Python 2 and 3.
    print(genre + ' - ' + str(diversity) + '%')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def freq_non_stopwords(text):
    """Return the 50 most frequent tokens of *text* that are not stopwords.

    Args:
        text: iterable of word tokens (strings).

    Returns:
        A list of at most 50 tokens, most frequent first. Tokens keep their
        original case; only the stopword test is case-insensitive.
    """
    # A set gives O(1) membership tests inside the comprehension below.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    clean_list = [w for w in text if w.lower() not in stopwords]  # drop stopwords
    freqdist = nltk.probability.FreqDist(clean_list)
    # keys() cannot be sliced on Python 3 and is not frequency-ordered in
    # NLTK 3; most_common() makes the intended ordering explicit.
    return [word for word, _count in freqdist.most_common(50)]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def top_bigrams(text):
    """Return the bigrams of *text* whose two tokens are both content words.

    Args:
        text: sequence of word tokens (strings).

    Returns:
        A list of ``(first, second)`` tuples, most frequent bigram first.
        A bigram is kept only when both tokens are purely alphabetic and
        neither is an English stopword (compared case-insensitively).
    """
    fdist = nltk.probability.FreqDist(nltk.bigrams(text))  # count every bigram
    # A set gives O(1) membership tests in the filter below.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    # most_common() guarantees frequency order; plain keys() does not in
    # NLTK 3. Lower-casing keeps capitalised stopwords ("The") out as well.
    return [
        (x, y)
        for (x, y), _count in fdist.most_common()
        if x.isalpha() and y.isalpha()
        and x.lower() not in stopwords and y.lower() not in stopwords
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def word_freq(word, section):
    """Count the occurrences of *word* in one part of the Brown corpus.

    Args:
        word: the token to look up; the frequency table is indexed with it
            exactly as given.
        section: value passed as ``categories`` to ``brown.words``.

    Returns:
        The count stored for *word* in the frequency distribution built
        from the selected words.
    """
    section_words = nltk.corpus.brown.words(categories=section)
    return nltk.probability.FreqDist(section_words)[word]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
d = nltk.corpus.cmudict.dict()  # word -> list of pronunciations, for direct lookup


def count_syllables(text):
    """Return the total number of syllables in *text*.

    Syllables are counted from the CMU Pronouncing Dictionary: each vowel
    phoneme carries a trailing stress digit (0, 1 or 2), and every vowel
    phoneme corresponds to one syllable. Consonant phonemes have no digit
    and are not counted.

    Args:
        text: iterable of word tokens (strings).

    Returns:
        The summed syllable count, using the first listed pronunciation of
        each word. Tokens absent from the dictionary (punctuation, numbers,
        unknown words) contribute nothing.
    """
    total = 0
    for word in text:
        # Dictionary keys are lower-case, so normalise before the lookup.
        pronunciations = d.get(word.lower())
        if not pronunciations:
            continue  # not in the dictionary: skip instead of raising KeyError
        # With several pronunciations available, use the first one.
        total += sum(1 for phoneme in pronunciations[0] if phoneme[-1].isdigit())
    return total
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
import matplotlib.pyplot as plt | |
words = nltk.corpus.brown.words() #выбираем все слова из корпуса | |
def zipf_law(words): | |
freq_dist = nltk.FreqDist(words)#считаем кол-во вхождений | |
xaxis = [] | |
yaxis = [] | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
import matplotlib.pyplot as plt | |
import random | |
# Target length: the number of letters in the Brown corpus.
RANDOM_TEXT_LENGTH = 9964284
# NOTE(review): the alphabet lacks 'h', 'i' and 'j' - confirm whether that
# is intentional before relying on the letter distribution.
RANDOM_ALPHABET = "abcdefgklmnopqrstuvwxyz "

# Build the random text with one join instead of millions of "+="
# concatenations, whose speed depends on an interpreter-specific optimisation.
random_string = ''.join(
    random.choice(RANDOM_ALPHABET) for _ in range(RANDOM_TEXT_LENGTH)
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The gist of the algorithm: we turn the text into a set of bigrams - tuples of the form (x, y).
# Then we take a starting word and pick a random word that occurs after it in some bigram.
# To that random word we append the next random word taken from its bigrams, and so on.
# Choosing at random helps to avoid getting stuck in "loops".
import nltk | |
import random | |
def generate_model(cfdist, word, num=15): | |
for i in range(num): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The gist of the algorithm: we turn the text into a set of bigrams - tuples of the form (x, y).
# Then we take a starting word and pick a random word that occurs after it in some bigram.
# To that random word we append the next random word taken from its bigrams, and so on.
# Choosing at random helps to avoid getting stuck in "loops".
# The words are drawn from a corpus mixed from two genres.
import nltk | |
import random | |
def generate_model(cfdist, word, num=15): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<? function video_preview() | |
{ | |
$this->load->model('text/text_model'); | |
$params = array( | |
'node_tree_id' => 22,#заменить на требуемое | |
'from_subfolders' => 1, | |
'full_text' => 1, | |
'limit' => 4 | |
); |
OlderNewer