Skip to content

Instantly share code, notes, and snippets.

@derekmartinla
Created January 4, 2017 21:22
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 4 You must be signed in to fork a gist
  • Save derekmartinla/f1347f05207220c52a1ea65c3d3229cf to your computer and use it in GitHub Desktop.
Save derekmartinla/f1347f05207220c52a1ea65c3d3229cf to your computer and use it in GitHub Desktop.
Crawl a site and update Google Sheet with results
# Crawls CRAWL_SITE with Spidr and records, for every crawled page that
# carries a description <meta> tag, the page URL, title, description and
# HTTP status code in the first worksheet of the Google Sheet identified by
# GOOGLE_SHEET_ID.
require 'openssl'
require 'spidr' # gem install spidr
require 'nokogiri'
require 'google_drive'

GOOGLE_SHEET_ID = "1k26clMVYUX5PLag5K2Ku4osgZcTtWp-wlss1B6CC_KE" # replace with your Google Sheet Id
CRAWL_SITE = "https://www.mywebsite.com"

# A page's row is DATA_ROW_OFFSET plus its 1-based position in the crawl, so
# the first crawled page is written to row 3.
DATA_ROW_OFFSET = 2

# SECURITY / FIXME: this redefines OpenSSL::SSL::VERIFY_PEER as VERIFY_NONE,
# so any code in this process that requests peer verification through that
# constant gets none. Presumably added to work around local certificate
# errors; prefer fixing the CA bundle and deleting these two lines.
OpenSSL::SSL.send(:remove_const, :VERIFY_PEER)
OpenSSL::SSL.const_set(:VERIFY_PEER, OpenSSL::SSL::VERIFY_NONE)

# Returns the content of the last description <meta> tag on the page.
#
# A tag matches when its `name` attribute (or, if that is absent, its
# `http-equiv` attribute) is exactly "description".
#
# @param page [#search] a crawled page that answers XPath queries
# @return [String, nil] the tag's content ('' when it has no content
#   attribute), or nil when the page has no description tag
def meta_description(page)
  description_tags = page.search('//meta').select do |meta|
    name = meta.attributes['name'] || meta.attributes['http-equiv']
    name.to_s == 'description'
  end
  tag = description_tags.last
  tag && tag.attributes['content'].to_s
end

session = GoogleDrive.saved_session("config.json") # make sure to have your config.json file saved in the same folder
worksheet = session.spreadsheet_by_key(GOOGLE_SHEET_ID).worksheets[0]
worksheet.title = "Results - Updated #{Time.now.strftime("%m/%d/%Y")}" # updates sheet title

# Renaming the sheet is best-effort: report the failure and keep crawling.
begin
  worksheet.save
rescue StandardError => e
  warn "Could not update worksheet title: #{e.message}"
end

Spidr.site(CRAWL_SITE) do |spider|
  page_number = 0

  spider.every_page do |page|
    # Counted for every page, so pages without a description leave their
    # row untouched rather than shifting later pages up.
    page_number += 1

    description = meta_description(page)
    next if description.nil?

    row = DATA_ROW_OFFSET + page_number
    worksheet[row, 1] = page.url.to_s
    worksheet[row, 2] = page.search("title").text
    worksheet[row, 3] = description
    worksheet[row, 4] = page.code
    worksheet.save # persist after each page so progress survives an aborted crawl
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment