Created
December 23, 2019 13:37
-
-
Save HunnyJummani/f5676d3c0bc1ac7e91e5ffc809bcf03e to your computer and use it in GitHub Desktop.
Infinite scrolling web scraping in Kimurai - Ruby on Rails
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'kimurai' | |
# Kimurai spider that scrapes an infinitely scrolling topic list.
#
# It keeps scrolling the page in the browser until the number of topic rows
# stops growing, then extracts title, category and date from every row and
# appends each record to results.json.
class WebScrapper < Kimurai::Base
  @name = "web_scrapper_spider"
  @engine = :selenium_chrome
  @start_urls = ["https://metaruby.com/"]
  @config = {
    user_agent: "Chrome/68.0.3440.84"
  }

  # XPath matching every row of the topic table. Shared by the scroll loop
  # (to count rows) and by the extraction step, so both always agree.
  TOPIC_ROWS_XPATH = "//table[@class='topic-list ember-view']//tbody//tr"

  # Seconds to wait after each scroll before re-reading the page.
  # Presumably gives lazily loaded rows time to render; tune if needed.
  SCROLL_PAUSE_SECONDS = 2

  # Records collected by the most recent #parse run (array of hashes).
  attr_accessor :blogs

  # Spider entry point: scrolls until no new rows appear, then stores the data.
  #
  # @param response the parsed start page; must respond to #xpath
  # @param url [String] URL of the page (unused here)
  # @param data [Hash] extra data passed along by the caller (unused here)
  def parse(response, url:, data: {})
    @blogs = []
    count = response.xpath(TOPIC_ROWS_XPATH).count

    loop do
      # Scroll until it reaches the end.
      browser.execute_script("window.scrollBy(0,10000)")
      sleep SCROLL_PAUSE_SECONDS

      response = browser.current_response
      new_count = response.xpath(TOPIC_ROWS_XPATH).count

      if count == new_count
        # Row count is stable, so nothing more was loaded: parse & store.
        parse_data(response)
        logger.info "> Pagination is done"
        # Break unconditionally; do not rely on logger.info's return value.
        break
      end

      count = new_count
      logger.info "> Continue scrolling, current count is #{count}..."
    end

    logger.info "> Data saved to results.json"
  end

  # Extracts one record per topic row, keeps it in #blogs and appends it to
  # results.json.
  #
  # A field whose cell/element is missing from a row is stored as nil instead
  # of raising, so one irregular row does not abort the whole scrape.
  #
  # @param response the fully scrolled page; must respond to #xpath
  def parse_data(response)
    response.xpath(TOPIC_ROWS_XPATH).each do |tr|
      scrapped_data = {
        title: tr.at('td[1]//span')&.text,
        category: tr.at('td[1]//div//span')&.text,
        date: tr.at('td[3]')&.text&.strip
      }

      blogs << scrapped_data
      save_to "results.json", scrapped_data.as_json, format: :json
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you for sharing!
But this doesn't work with Ruby 3.
Can you share the update?