Skip to content

Instantly share code, notes, and snippets.

@isratmir
Last active May 6, 2016 15:11
Show Gist options
  • Save isratmir/6a16a46187ea73b165ba to your computer and use it in GitHub Desktop.
Save isratmir/6a16a46187ea73b165ba to your computer and use it in GitHub Desktop.
Site info parser
require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'json'
require 'net/http'
require 'uri'
require 'open_uri_redirections'
require 'openssl'
# Probes +url+ with a single HTTP GET and reports the status code.
#
# @param url [String] absolute http(s) URL to probe
# @return [String, false] the HTTP status code as a string (e.g. "200"),
#   or false when the URL is malformed, is not an http(s) URL with a host,
#   or the request fails at the network/TLS level
def ping(url)
  uri = URI.parse(url)
  # Net::HTTP needs a scheme and a host; an input such as "example.com"
  # parses as URI::Generic with no host, so reject it up front.
  return false if !uri.is_a?(URI::HTTP) || uri.host.to_s.empty?

  Net::HTTP.get_response(uri).code
rescue URI::InvalidURIError, SocketError, SystemCallError, Timeout::Error,
       OpenSSL::SSL::SSLError, EOFError
  # SystemCallError is the parent of every Errno::* (ECONNREFUSED, ETIMEDOUT,
  # EHOSTUNREACH, ...); Timeout::Error is the parent of Net::OpenTimeout and
  # Net::ReadTimeout. One unreachable site must not abort the whole batch.
  false
end
# Downloads the page at +link+ and extracts its <title>, meta description
# and meta keywords. The extracted fields are printed to stdout and returned.
#
# @param link [String] absolute URL of the page to fetch
# @return [Hash{Symbol=>String}, false] hash with the keys :url, :title,
#   :description and :keyword, or false when the page could not be fetched
def crawl(link)
  uri = URI.parse(link)
  # Open through the parsed URI rather than Kernel#open: Kernel#open runs a
  # command when the string starts with "|", and its open-uri URL support was
  # removed in Ruby 3.0. Only URI classes that open-uri extends respond to #open.
  return false unless uri.respond_to?(:open)

  # The block form closes the underlying StringIO/Tempfile once it is read.
  # NOTE(review): VERIFY_NONE disables TLS certificate validation; kept to
  # preserve existing behaviour for sites with broken certificates.
  body = uri.open(read_timeout: 300,
                  allow_redirections: :safe,
                  ssl_verify_mode: OpenSSL::SSL::VERIFY_NONE,
                  &:read)

  doc = Nokogiri::HTML(body)
  doc.encoding = 'utf-8'
  title = doc.css('title').text
  desc = doc.css('meta[name="description"]').xpath('@content').to_s
  keyword = doc.css('meta[name="keywords"]').xpath('@content').to_s

  puts "URL: #{link}"
  puts "Title: #{title}"
  puts "Description: #{desc}"
  puts "Keywords: #{keyword}"
  puts "******************************"

  { url: link, title: title, description: desc, keyword: keyword }
rescue OpenURI::HTTPError, URI::InvalidURIError, SocketError, SystemCallError,
       Timeout::Error, OpenSSL::SSL::SSLError, EOFError, RuntimeError => e
  # open-uri raises a plain RuntimeError for forbidden redirections and
  # redirect loops. Report the failure and let the caller skip this site.
  warn "crawl failed for #{link}: #{e.class}: #{e.message}"
  false
end
# Driver: reads the list of sites from urls.json, probes each one and writes
# the metadata of every page that could be crawled to sites.json, one JSON
# object per line.
rows = JSON.parse(File.read('urls.json'))

# The block form closes the output file even if an exception escapes midway.
File.open('sites.json', 'w') do |file|
  rows.each.with_index(1) do |r, index|
    puts index
    link = r[1]
    status = ping(link)
    puts status
    puts link
    next unless status

    line = crawl(link)
    next unless line

    # Serialise explicitly: writing the hash itself would emit Ruby's
    # Hash#inspect text, which is not valid JSON.
    file.puts(JSON.generate(line))
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment