Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nileshtrivedi/723980 to your computer and use it in GitHub Desktop.
Save nileshtrivedi/723980 to your computer and use it in GitHub Desktop.
Scraping fun with Google, Flipkart and Ruby
require 'net/http'
require 'uri'
require 'csv'
# Step 1: collect public Flipkart wishlist URLs from Google web search results
# and write them, one per line, to flipkart-public-wishlists.txt.
#
# NNNN in the query string is a placeholder for the "start" result offset; it
# is substituted for every page requested below.
google_url = "http://www.google.com/search?q=%22public+wishlist%22+site:flipkart.com&hl=en&site=webhp&prmd=iv&ei=ZYL2TOr3KYLyrQfv4dzWBg&start=NNNN&sa=N&fp=1b624545158a7512&tch=1&ech=1&psi=MX_2TOSlL82BrQegxqDYBg129122283549813"
# Scheme-less wishlist URL, read up to the first space, '+' or backslash.
# The dots are escaped so only the literal host name matches.
wishlist_pattern = /www\.flipkart\.com\/wishlist[^ +\\]*/
# URLs already written, so a wishlist that shows up on several result pages is
# stored (and later fetched) only once.
seen = {}
File.open("flipkart-public-wishlists.txt", "w") do |f|
  (0..54).each do |page|
    puts "scraping page #{page} from google web search"
    start = page * 10 # offset substituted into the start= query parameter
    uri = URI.parse(google_url.sub(/NNNN/, start.to_s))
    sleep(2) #dear google, please be nice :)
    response = Net::HTTP.get_response(uri)
    # A non-2xx answer (redirect, error page, ...) may carry no body at all;
    # report it and move on instead of scanning it.
    unless response.is_a?(Net::HTTPSuccess)
      warn "skipping page #{page}: HTTP #{response.code}"
      next
    end
    response.body.scan(wishlist_pattern).each do |match|
      next if seen.key?(match)
      seen[match] = true
      f.puts match
    end
  end
  puts "Done"
end
# Step 2: download every wishlist listed in flipkart-public-wishlists.txt and
# dump one CSV row per wishlist item into flipkart-wishlist-raw-data.csv.
# Row layout: wishlist url, item url, item title, then the three raw captures
# of the price pattern (list-price markup or "Not Available", discounted-price
# markup, discounted price digits).
#
# Only the first field of each input line is used; blank lines yield no URL and
# are dropped.
wishlists = CSV.read("flipkart-public-wishlists.txt").map { |row| row.first }.compact.map { |url| "http://#{url}" }
# Two captures per item: url of the book, then book title.
item_pattern = /<div class="search_result_title">\s+<a href="([^"]+)" title="([^"]+)"/
# Three captures per price block (any of them may be nil).
price_pattern = /<span class="search_results_price">Price: (<span class="search_results_list_price">[^<]*<\/span>|Not Available)?[^<]*(<font color='#993300'><b>Rs. (\d+)<\/b>)?/
# CSV.open takes care of quoting, so fields containing commas or double quotes
# (the raw price markup does) still produce a well-formed file.
CSV.open("flipkart-wishlist-raw-data.csv", "w") do |csv|
  wishlists.each_with_index do |w, count|
    sleep(2) #you too, flipkart :)
    puts "trying wishlist #{count}: #{w}"
    response = Net::HTTP.get_response(URI.parse(w))
    # A non-2xx answer may have no body to scan; report it and continue.
    unless response.is_a?(Net::HTTPSuccess)
      warn "skipping #{w}: HTTP #{response.code}"
      next
    end
    body = response.body
    items = body.scan(item_pattern)
    prices = body.scan(price_pattern)
    puts "Sizes do not match" if items.size != prices.size
    items.each_with_index do |(link, title), c|
      # When fewer price blocks than items were found, emit empty price
      # columns instead of failing on a nil row.
      price = prices[c] || Array.new(3)
      # Raw data only; the analysis is done later with OpenOffice's pivot tables.
      csv << [w, link, title, *price]
    end
  end
  puts "Done"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment