Skip to content

Instantly share code, notes, and snippets.

@waaa
Created September 17, 2014 10:07
Show Gist options
  • Save waaa/38db9a746cfb27f27356 to your computer and use it in GitHub Desktop.
Save waaa/38db9a746cfb27f27356 to your computer and use it in GitHub Desktop.
# coding: utf-8
require 'date'
require 'uri'

require 'mechanize'
# Scrapes avito.ru search results for a query and reports how many matching
# ads were found, their mean price and their mean age in days.
#
# The page structure is matched with regular expressions, so every pattern
# below is tied to the markup the site served when the script was written.
class AvitoParser
  HOST = 'http://avito.ru'

  # One ad on a results page; the capture group is the ad's inner HTML.
  ITEMS_RGXP = /\n<div class=\"t_i_i t_i(.*?)<\/div> <\/div> <\/div>/m
  # Href of the "next page" link. Non-greedy, so a page that contains the
  # link more than once still yields a single href instead of everything
  # between the first and the last occurrence.
  NEXT_PAGE_RGXP = /<a class=\"next\" href=\"(.*?)\">Следующая/m
  TITLE_RGXP = /\" title=\"(.*?)\">\n/m
  PRICE_RGXP = /\n <span>.*<\/span> <span>руб.<\/span>/m
  DATE_RGXP = /<div class=\"t_i_date\">\n(.*)\n?/
  # "<day> <month word>", e.g. "12 авг." or "16 июня".
  DAY_MONTH_RGXP = /(\d{1,2})\s+(\S+)/

  # Month number keyed by the first three letters of the Russian month name,
  # which covers both abbreviated ("авг.", "сент.") and genitive ("июня")
  # spellings.
  MONTHS = {
    'янв' => 1, 'фев' => 2, 'мар' => 3, 'апр' => 4, 'мая' => 5, 'май' => 5,
    'июн' => 6, 'июл' => 7, 'авг' => 8, 'сен' => 9, 'окт' => 10,
    'ноя' => 11, 'дек' => 12
  }.freeze

  # Fetches the search results and walks through at most +number_of_pages+
  # result pages, collecting every ad accepted by #parse_item.
  #
  # search_query    - text to search for; also used to filter ad titles
  # number_of_pages - maximum number of result pages to fetch and parse
  # city            - city slug used as the URL path, e.g. 'moskva'
  #
  # Returns the array of parsed ads (also kept in @goods). Stops early when a
  # page has no "next" link.
  def fetch_engine(search_query, number_of_pages, city)
    @search_query = search_query
    @goods = [] # final set of parsed ads
    agent = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }
    # Escape the query so characters such as '&' or '#' cannot break the URL.
    url = "#{HOST}/#{city}?q=#{URI.encode_www_form_component(search_query)}"

    number_of_pages.times do |page_index|
      page = fetch_page(agent, url)
      collect_items(page, page_index)

      next_path = page[NEXT_PAGE_RGXP, 1]
      break unless next_path # last page of results reached

      url = HOST + next_path
    end
    @goods
  end

  # Extracts title, price and date from the HTML of a single ad.
  #
  # Returns a hash with :title (String), :price (Integer, 0 when the seller
  # set no price) and :date (Date), or nil when the ad has no title, the
  # title does not contain the search query, or the date cannot be parsed.
  def parse_item(item)
    title_match = item.scan(TITLE_RGXP).last
    title = title_match && title_match.first
    # We don't need an ad whose title doesn't contain the query itself.
    # NOTE(review): the match is case-sensitive and requires a space right
    # after the query, so a title that ends with the query is rejected --
    # confirm that this is intended.
    return unless title && title.include?("#{@search_query} ")

    # The price is markup around a string like "8 000"; keeping only the
    # digits turns it into the integer 8000.
    price_html = item[PRICE_RGXP]
    price = price_html ? price_html.scan(/\d/).join.to_i : 0

    date_str = item[DATE_RGXP, 1]
    date = date_str && parse_date(date_str)
    return unless date

    { title: title, price: price, date: date }
  end

  # Parses the date text of an ad: "сегодня" (today), "вчера" (yesterday) or
  # a day followed by a Russian month name such as "12 авг." or "16 июня".
  #
  # The text carries no year, so the most recent such date that is not in the
  # future is returned. Returns nil when the text is not recognised.
  def parse_date(str)
    str = str.to_s
    today = Date.today
    return today if str =~ /сегодня/i
    return today - 1 if str =~ /вчера/i

    match = DAY_MONTH_RGXP.match(str)
    return unless match

    month = MONTHS[match[2][0, 3]]
    return unless month

    most_recent_date(match[1].to_i, month, today)
  end

  # Computes summary statistics over the ads collected by #fetch_engine.
  #
  # Returns (and stores in @mean_values) a hash with :total_count,
  # :mean_price and :mean_period (mean age of the ads in days). Both means
  # are 0.0 when no ads were collected.
  def calculate_the_mean_value
    goods = @goods || []
    count = goods.count
    today = Date.today

    price_sum = goods.inject(0) { |acc, item| acc + item[:price] }
    # An ad published today would have age zero; count it as one day.
    days_sum = goods.inject(0) { |acc, item| acc + [today - item[:date], 1].max }

    @mean_values = {
      total_count: count,
      mean_price: count.zero? ? 0.0 : price_sum.to_f / count,
      mean_period: count.zero? ? 0.0 : days_sum.to_f / count
    }
  end

  # Prints progress when +date+ is given (+page+ is the zero-based page
  # index), otherwise prints the summary if it has been calculated.
  def speak(date = nil, page = nil)
    if date
      puts "processing #{date} on page #{page.to_i + 1}.."
    elsif @mean_values
      puts "========================================="
      puts "We've got #{@mean_values[:total_count]} items,"
      puts "their mean price is #{@mean_values[:mean_price]} and mean period of sale is #{@mean_values[:mean_period]}"
    end
  end

  # Runs the whole pipeline: fetch, aggregate, print the summary.
  def main(search_query, number_of_pages = 100, city = 'moskva')
    fetch_engine(search_query, number_of_pages, city)
    calculate_the_mean_value
    speak
  end

  private

  # Downloads +url+ and returns its body transcoded to UTF-8.
  def fetch_page(agent, url)
    raw = agent.get(url) # Mechanize::Page object
    raw.body.to_s.encode('UTF-8', raw.encoding, invalid: :replace, undef: :replace)
  end

  # Parses every ad on +page+ into @goods, announcing each change of date.
  def collect_items(page, page_index)
    page.scan(ITEMS_RGXP).each do |captures|
      parsed = parse_item(captures.first)
      next unless parsed

      @goods << parsed
      speak(parsed[:date], page_index) if parsed[:date] != @date
      @date = parsed[:date]
    end
  end

  # Latest valid date with the given day and month that is not after +today+,
  # looking at the current and the previous year; nil when there is none.
  def most_recent_date(day, month, today)
    [today.year, today.year - 1].each do |year|
      next unless Date.valid_date?(year, month, day)

      candidate = Date.new(year, month, day)
      return candidate if candidate <= today
    end
    nil
  end
end
# Run the default search only when this file is executed directly, so that
# loading it from another script does not start a network scrape.
AvitoParser.new.main("велосипед") if __FILE__ == $PROGRAM_NAME
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment