Skip to content

Instantly share code, notes, and snippets.

@menzenski
Created October 18, 2013 05:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save menzenski/7036795 to your computer and use it in GitHub Desktop.
Save menzenski/7036795 to your computer and use it in GitHub Desktop.
Automate a search of slovari.yandex.ru
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import nltk
from nltk.stem import SnowballStemmer
from nltk import FreqDist
import codecs
import glob
from diminutivefinder_04_asclass import DiminutiveFinder
import matplotlib.pyplot as plt
from collections import OrderedDict
import urllib
import sgmllib
import lxml.html
def print_list(mylist):
    """Print a list containing unicode characters.

    The items are written as ``[item1, item2, ...]`` using the characters
    themselves instead of the escaped form that printing the list object
    would produce.

    Args:
        mylist: iterable of unicode strings.
    """
    text = u'[' + u', '.join(mylist) + u']'
    if str is bytes:
        # Python 2: encode explicitly to UTF-8 so the output does not depend
        # on the encoding of the terminal or pipe.
        text = text.encode('utf8')
    # With a single parenthesised argument this line behaves the same as a
    # Python 2 print statement and as the Python 3 print function.
    print(text)
# Search terms whose results page contained the diminutive marker.
# Appended to by yandex_search().
verified_dims = []


def yandex_search(search_term):
    """Look up ``search_term`` on slovari.yandex.ru and record a match.

    The results page is fetched and scanned for the UTF-8 bytes of the
    Russian prefix meaning "diminutive". When the prefix is present,
    ``search_term`` is appended to the module-level ``verified_dims`` list.

    Args:
        search_term: unicode string to look up.

    Returns:
        True if the marker was found (and the term recorded), else False.

    Raises:
        IOError: propagated from ``urllib.urlopen`` if the request fails.
    """
    search_url_begin = 'http://slovari.yandex.ru/'
    search_url_end = '/en/#lingvo/'
    # Percent-encode the UTF-8 bytes of the term: raw non-ASCII bytes (and
    # characters such as '/', '?' or '#') are not valid inside a path segment.
    quoted_term = urllib.quote(search_term.encode('utf8'), safe='')
    search_url_whole = search_url_begin + quoted_term + search_url_end
    results_page = urllib.urlopen(search_url_whole)
    try:
        results_html = results_page.read()
    finally:
        # Release the connection even when reading the body fails.
        results_page.close()
    # UTF-8 bytes of the prefix "уменьш" (start of "уменьшительная форма",
    # i.e. "diminutive form"). A plain byte-substring test is used instead of
    # parsing the HTML; it assumes the page is served as UTF-8.
    dim_tag = '\xd1\x83\xd0\xbc\xd0\xb5\xd0\xbd\xd1\x8c\xd1\x88'
    found = dim_tag in results_html
    if found:
        verified_dims.append(search_term)
    return found
def main():
    """Collect diminutive candidates from the text and verify them online.

    Reads ``TMaM_CompleteText.txt`` as UTF-8, runs ``DiminutiveFinder`` over
    it, calls ``yandex_search`` once for every distinct entry of
    ``dims.diminutives`` and finally prints ``verified_dims``.
    """
    # The with-block closes the file as soon as the text has been read.
    with codecs.open("TMaM_CompleteText.txt", encoding="utf8") as data:
        textfile = data.read()
    dims = DiminutiveFinder(textfile)
    dims.find_diminutives(textfile)
    # De-duplicate so each entry is requested only once; sort so the request
    # order and the printed result are reproducible between runs.
    distinct_dim_stems = sorted(set(dims.diminutives))
    for stem in distinct_dim_stems:
        yandex_search(stem)
    print_list(verified_dims)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment