@ngauthier (last active March 1, 2017)
Scraping the Web with Ruby
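The script below, from the linked blog post, drives PhantomJS through Capybara's Poltergeist driver: it scrapes post summaries from ngauthier.com's index page, visits each article once to capture its body, caches everything in a GDBM key-value store (articles.db), and writes the collection as CSV to standard output.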
#!/usr/bin/env ruby
# From: http://ngauthier.com/2014/06/scraping-the-web-with-ruby.html
require 'capybara'
require 'capybara/poltergeist'
require 'csv'
require 'gdbm'
require 'json'
require 'ostruct'

class NickBot
  include Capybara::DSL

  def initialize(io = STDOUT)
    Capybara.default_driver = :poltergeist
    @io = io
  end

  def scrape
    # Pass 1: collect article summaries from the index page,
    # skipping any article already in the database.
    visit "http://ngauthier.com/"
    all(".posts .post").each do |post|
      article = Article.from_summary(post)
      next unless article.new_record?
      article.save
    end

    # Pass 2: visit each article page once to fill in its full body.
    Article.each do |article|
      next if article.body
      visit "http://ngauthier.com#{article.url}"
      has_content?(article.title) or raise "couldn't load #{article.url}"
      article.body = find("article").text
      article.save
    end

    # Pass 3: dump every cached article to CSV.
    CSV(@io) do |csv|
      csv << ["Title", "URL", "Date", "Summary", "Body"]
      Article.each do |article|
        csv << [
          article.title,
          article.url,
          article.date,
          article.summary,
          article.body
        ]
      end
    end
  end

  class Article < OpenStruct
    # GDBM is a simple on-disk key-value store, keyed here by article URL.
    DB = GDBM.new("articles.db")

    def self.from_summary(node)
      new(
        title:   node.find("h3 a").text,
        url:     node.find("h3 a")["href"],
        date:    node.find("h3 small").text,
        summary: node.find("p.preview").text
      )
    end

    def self.each
      DB.each do |url, json|
        yield Article.new(JSON.load(json))
      end
    end

    def save
      DB[url] = to_h.to_json
    end

    def new_record?
      DB[url].nil?
    end
  end
end

NickBot.new(STDOUT).scrape
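To run the script you need a phantomjs binary on your PATH plus the gems it requires. A minimal Gemfile sketch, assuming a Bundler setup (version constraints omitted; the gdbm line only matters on Ruby 3.1+, where gdbm left the standard library):

# Gemfile: a minimal sketch of this script's dependencies.
source "https://rubygems.org"

gem "capybara"    # page-driving DSL mixed into NickBot
gem "poltergeist" # Capybara driver for PhantomJS (no longer maintained)
gem "gdbm"        # key-value store; stdlib before Ruby 3.1

With that in place, something like "bundle install && ruby scrape.rb > articles.csv" would write the CSV to a file (scrape.rb is a hypothetical filename for this gist's script). Re-running it is cheap: articles already cached in articles.db are skipped by new_record? and the body check.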