Skip to content

Instantly share code, notes, and snippets.

@iantropov
Last active December 10, 2015 23:38
Show Gist options
  • Save iantropov/4510390 to your computer and use it in GitHub Desktop.
Save iantropov/4510390 to your computer and use it in GitHub Desktop.
First implementation of a parser for Yandex's result pages
from lxml import html
class YandexRespParser(object):
    """Scrape Yandex search result pages into plain dictionaries.

    ``query`` and ``lr`` are substituted verbatim into the search URL; they
    are not URL-encoded here, so callers must pass values that are already
    safe to embed in a URL.
    """

    # Number of results taken from each full page.
    # It's possible to evaluate this number from the first result page.
    __RESULTS_ON_PAGE = 10

    def __init__(self, query, lr):
        """Store the search parameters.

        Args:
            query: Value placed in the ``text`` URL parameter.
            lr: Value placed in the ``lr`` URL parameter
                (presumably a Yandex region id -- confirm with callers).
        """
        self.query = query
        self.lr = lr

    def __create_rep(self, result_element):
        """Build the dictionary representation of one search result.

        Args:
            result_element: lxml element of a single result item.

        Returns:
            dict with keys ``index``, ``href``, ``domain``, ``title``,
            ``text`` and ``copy``.

        Raises:
            IndexError: if any of the expected sub-elements is missing.
        """
        def first(xpath_expr):
            # Every field is taken from the first node matching the XPath.
            return result_element.xpath(xpath_expr)[0]

        # The title link supplies both the target URL and the title text.
        title_link = first('.//a[@class="b-serp-item__title-link"]')
        return {
            "index": int(first('.//b[@class="b-serp-item__number"]').text_content()),
            "href": str(title_link.get("href")),
            "domain": str(first('.//a[@class="b-serp-url__link"]').text_content()),
            "title": str(title_link.text_content()),
            "text": str(first('.//div[@class="b-serp-item__text"]').text_content()),
            "copy": str(first('.//a[@class="b-serp-item__links-link"]').get("href")),
        }

    def __get_reps_from_page(self, page_number, results_from_page=__RESULTS_ON_PAGE):
        """Fetch one result page and parse its leading results.

        Args:
            page_number: Value of the ``p`` URL parameter; ``get_reps``
                numbers pages from 0.
            results_from_page: Maximum number of results to take from the
                page.

        Returns:
            List of result dictionaries, at most ``results_from_page`` long
            and never longer than the number of results found on the page.
        """
        url = "http://ya.ru/yandsearch?text={0}&lr={1}&p={2}".format(
            self.query, self.lr, page_number)
        page = html.parse(url)
        result_elements = page.xpath('//li[@class="b-serp-item i-bem"]')
        # Slicing caps the count at whatever the page actually contains.
        return [self.__create_rep(element)
                for element in result_elements[:max(results_from_page, 0)]]

    def get_reps(self, count):
        """Return up to ``count`` search results, fetched page by page.

        Args:
            count: Total number of results wanted.

        Returns:
            List of result dictionaries in page order. Empty when ``count``
            is zero or negative; no page is fetched in that case.
        """
        if count <= 0:
            return []

        # divmod uses floor division, so the page count stays an int on
        # Python 3 as well (plain "/" would yield a float there).
        full_pages, remainder = divmod(count, self.__RESULTS_ON_PAGE)

        reps = []
        for page_number in range(full_pages):
            reps.extend(self.__get_reps_from_page(page_number))
        # Only request the trailing partial page when something is needed
        # from it; an exact multiple of the page size needs no extra fetch.
        if remainder:
            reps.extend(self.__get_reps_from_page(full_pages, remainder))
        return reps
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment