This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<link rel="stylesheet" href="/static/js/fancybox-1.3.1/jquery.fancybox-1.3.1.css" media="screen" type="text/css" /> | |
<script type='text/javascript' src='/static/js/jquery.js'></script> | |
<script type='text/javascript' src='/static/js/fancybox-1.3.1/jquery.fancybox-1.3.1.js'></script> | |
<script type='text/javascript'> | |
$(document).ready(function(){ | |
$('#main-video-block #preview_texts a').mouseenter(function () { | |
if (!$(this).hasClass('current')) { |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
def find_language(word): | |
#загружаем декларацию | |
from nltk.corpus import udhr | |
#заготавливаем пустой массив | |
result_lang = [] | |
#перебираем тексты с латинской транскрипцией |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
#импортируем wordnet
from nltk.corpus import wordnet as wn | |
#инициализируем переменные для агрегации
hyponyms_number = 0 | |
synsets_with_hyponyms = 0 | |
for synset in list(wn.all_synsets('n')): # перебираем все синсеты с существительными |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
def stand_dev(mylist, random_selection = 0): | |
mean = sum(mylist)/len(mylist) | |
#если стоит флаг случайной выборки, уменьшаем знаменатель на 1 | |
if random_selection != 0: | |
denominator = len(mylist)-1 | |
else: | |
denominator = len(mylist) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#NLTK упр 21. Выводим слова из сайта, которых нет в корпусе слов (типа неизвестные) | |
def unknown(url): | |
#загружаем фигню | |
import nltk, urllib | |
#читаем url | |
html = urllib.urlopen(url).read() | |
raw = nltk.clean_html(html) | |
tokens = set(nltk.word_tokenize(raw)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#NLTK упр 24. Функция кодирует английский текст, заменяя буквы на всякую фигню
# с помощью регулярок | |
import nltk,re | |
def hackerize(text): | |
#делаем буквы строчными | |
text = text.lower() | |
#список кортежей "регулярка-замена" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
from math import sqrt | |
def heron(a, b, c):
    """Return the area of a triangle given its three side lengths.

    Uses Heron's formula: sqrt(s * (s - a) * (s - b) * (s - c)), where s is
    half of the perimeter.

    Args:
        a, b, c: Side lengths (non-negative numbers).

    Returns:
        The area as a float. A degenerate triangle (one side equal to the
        sum of the other two) has area 0.0.

    Raises:
        ValueError: If any side is negative or the sides violate the
            triangle inequality.
    """
    # With negative sides an even number of factors can be negative, which
    # would make the formula silently return a meaningless positive "area".
    if a < 0 or b < 0 or c < 0:
        raise ValueError(
            "side lengths must be non-negative, got %r, %r, %r" % (a, b, c)
        )
    if a + b < c or a + c < b or b + c < a:
        raise ValueError(
            "sides %r, %r, %r violate the triangle inequality" % (a, b, c)
        )
    # Divide by a float so the result does not depend on a file-level
    # `from __future__ import division` under Python 2.
    halfp = (a + b + c) / 2.0
    product = halfp * (halfp - a) * (halfp - b) * (halfp - c)
    # After validation a negative product can only come from rounding on a
    # nearly degenerate triangle; clamp it instead of failing in sqrt().
    return sqrt(max(product, 0.0))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk, re | |
def piglatinize(word): | |
plword = re.findall(r'^(y|qu|[bcdfghgklmprstvwzx]*)(.*)', word) | |
if plword != []: | |
plword = plword[0][1] + plword[0][0] | |
else: | |
plword = word | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#этот скрипт удаляет тэги и нормализует пробелы | |
import nltk,re | |
def normalize(text): | |
#список кортежей "регулярка-замена" | |
replaces = [('<[\/\!]*?[^<>]*?>',''),('\s+',' ')] | |
normtext = [] | |
#цикл пробегает весь список, применяя каждый кортеж замен к тексту | |
for target,replace in replaces: | |
normtext = re.sub(target, replace, text) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf8 -*- | |
import lxml, lxml.html, re, random | |
from lxml import etree | |
text = lxml.html.parse('C:/Users/user/Desktop/темп/Новая папка/Билеты1.html').getroot() | |
ques = [] | |
for child in text[0]: | |
html_text = re.sub("^\s+|\n|\r|\t|\s+$", '', lxml.html.tostring(child)) | |
if child.tag == 'h1': |