Skip to content

Instantly share code, notes, and snippets.

@sairam
Last active April 24, 2016 06:37
Show Gist options
  • Save sairam/2db759478aa0e8c40686fd683449de09 to your computer and use it in GitHub Desktop.
Save sairam/2db759478aa0e8c40686fd683449de09 to your computer and use it in GitHub Desktop.
Ruby Scripts for Camera Price Crawlers 2011
# Download all Camera Prices from Flipkart Website
# Code from the year 2011
require 'rubygems'
require 'mechanize'
require 'nokogiri'
require 'open-uri'
# Crawl Flipkart's camera listing and write one line per product page to
# alldata.yaml: the text of the page's 'fk-mprod-our-id' span, a tab, and the
# text of the link that led to the page. Progress is printed to stdout and
# failures are reported on stderr.
sitemapurl = 'http://www.flipkart.com/camera/'
a = Mechanize.new { |agent| agent.user_agent_alias = 'Mac Safari' }
# href => link text of every link already followed, so nothing is fetched twice.
processed = {}

# 'w+' truncates any existing alldata.yaml; the block form closes the file even
# when the crawl aborts with an exception.
File.open('alldata.yaml', 'w+') do |file|
  a.get(sitemapurl) do |page|
    page.links_with(:href => /camera\/2\-/).each do |link|
      next if processed.key?(link.href)

      category = link.text.strip
      processed[link.href] = category
      puts 'Loading %-30s %s ' % [link.href, category]
      begin
        products_page = a.click(link)
        products_page.links_with(:href => /cameras\//).each do |plink|
          next if processed.key?(plink.href)

          title = plink.text.strip
          processed[plink.href] = title
          puts 'Loading %-30s %s ' % [plink.href, title]
          begin
            data = a.click(plink)
            id_node = data.at("//span[@id='fk-mprod-our-id']")
            if id_node.nil?
              # Say which page lacked the span instead of raising an anonymous
              # NoMethodError on nil.
              $stderr.puts "No fk-mprod-our-id span found on #{plink.href}"
              next
            end
            # Collapse the span's text onto a single line before writing it.
            file.puts "#{id_node.text.split(/\n/).join(' ').strip}\t#{title}"
          rescue => e
            # One broken product page is reported and skipped.
            $stderr.puts "#{e.class}: #{e.message}"
          end
        end
      rescue => e
        # A category page that cannot be loaded is reported and skipped rather
        # than aborting the whole crawl.
        $stderr.puts "#{e.class}: #{e.message}"
      end
    end
  end
end
require 'rubygems'
require 'mechanize'
require 'nokogiri'
require 'open-uri'
# Crawl the Nikon India sitemap and write one line per product page to
# alldata.yaml: the text of the page's 'price' div flattened onto one line.
# Progress is printed to stdout and failures are reported on stderr.
sitemapurl = 'http://www.nikon.co.in/sitemap.php'
a = Mechanize.new { |agent| agent.user_agent_alias = 'Mac Safari' }
# href => link text of every link already followed, so nothing is fetched twice.
processed = {}

# 'w+' truncates any existing alldata.yaml; the block form closes the file even
# when the crawl aborts with an exception.
File.open('alldata.yaml', 'w+') do |file|
  a.get(sitemapurl) do |page|
    page.links_with(:href => /products.php/).each do |link|
      next if processed.key?(link.href)

      category = link.text.strip
      processed[link.href] = category
      puts 'Loading %-30s %s ' % [link.href, category]
      begin
        products_page = a.click(link)
        products_page.links_with(:href => /productitem.php/).each do |plink|
          next if processed.key?(plink.href)

          title = plink.text.strip
          processed[plink.href] = title
          puts 'Loading %-30s %s ' % [plink.href, title]
          begin
            data = a.click(plink)
            price_node = data.at("//div[@class='price']")
            if price_node.nil?
              # Say which page lacked the div instead of raising an anonymous
              # NoMethodError on nil.
              $stderr.puts "No price div found on #{plink.href}"
              next
            end
            # Replace every carriage return (not just the first) and join the
            # lines so each product occupies exactly one output line; puts
            # supplies the trailing newline.
            file.puts price_node.text.tr("\r", ' ').split(/\n/).join(' ')
          rescue => e
            # One broken product page is reported and skipped.
            $stderr.puts "#{e.class}: #{e.message}"
          end
        end
      rescue => e
        # A category page that cannot be loaded is reported and skipped rather
        # than aborting the whole crawl.
        $stderr.puts "#{e.class}: #{e.message}"
      end
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment