Created
March 27, 2019 13:00
-
-
Save arvind02/f86900107037bff806f42da406e5b364 to your computer and use it in GitHub Desktop.
Simple Ruby script that parses an HTML file and downloads its image resources. It stores all downloaded images in the images directory of the working directory.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'net/http'
require 'uri'

require 'nokogiri'
# This script parses HTML documents and downloads the image files they reference
# Fetches +url+ over HTTP(S) and returns the response body as a String.
#
# The request carries the URL's query string, and an empty path is sent as
# "/". Redirects are followed (a relative Location header is resolved against
# +url+) for at most +redirect_limit+ hops.
#
# url            - absolute http:// or https:// URL (String).
# redirect_limit - maximum number of redirects to follow (default 5).
#
# Returns the body of the final successful response.
# Raises ArgumentError once the redirect limit is exhausted, and whatever
# Net::HTTPResponse#value raises for any other non-2xx response.
def get_html(url, redirect_limit: 5)
  raise ArgumentError, 'too many HTTP redirects' if redirect_limit < 0

  uri = URI(url)
  resp = Net::HTTP.start(uri.host, uri.port,
                         :use_ssl => uri.scheme == 'https') do |http|
    # request_uri keeps the query string and is never empty, unlike uri.path.
    http.get(uri.request_uri)
  end

  case resp
  when Net::HTTPSuccess
    resp.body
  when Net::HTTPRedirection
    location = resp['location']
    # A 3xx without a Location header (e.g. 304) cannot be followed.
    return resp.body if location.nil?

    warn "redirect to #{location}"
    get_html(URI.join(url, location).to_s, redirect_limit: redirect_limit - 1)
  else
    # Raises the HTTP error matching the response class.
    resp.value
  end
end
# Extracts the image sources referenced by an HTML document.
#
# html - HTML markup (String) handed to Nokogiri::HTML.
#
# Returns an Array of unique, whitespace-stripped +src+ attribute values in
# document order. Values that are blank after stripping are dropped, because
# they cannot be requested.
# Raises RuntimeError when the document has no <img> element with a src
# attribute.
def parse_html(html)
  nodes = Nokogiri::HTML(html).xpath('//img[@src]')
  raise "No <img .../> tags!" if nodes.empty?

  nodes.map { |node| node.attr('src').strip }.reject(&:empty?).uniq
end
# Downloads every image in +paths+ into the "images" directory of the current
# working directory.
#
# url   - URL of the page the image sources were taken from (String). Each
#         entry of +paths+ is resolved against it, so absolute URLs,
#         root-relative paths and page-relative paths all work.
# paths - Array of image source Strings (as returned by parse_html).
#
# Images on the page's own origin reuse one connection; images on another
# origin are fetched with a separate request. A file is written only for a
# successful (2xx) response and is named after the last segment of the URL
# path (query string excluded). A failure on one image is reported and does
# not stop the remaining downloads.
#
# Returns +paths+.
def downloader(url, paths)
  host_uri = URI(url)
  origin = [host_uri.scheme, host_uri.host, host_uri.port]
  # The directory may already exist from an earlier run.
  Dir.mkdir('images') unless Dir.exist?('images')

  Net::HTTP.start(host_uri.host, host_uri.port,
                  :use_ssl => host_uri.scheme == 'https') do |http|
    paths.each do |path|
      puts "Downloading: #{path}"
      begin
        target = URI.join(url, path)
        # data:, javascript: and similar sources cannot be fetched over HTTP.
        unless target.is_a?(URI::HTTP)
          warn "Skipping non-HTTP source: #{path}"
          next
        end

        resp = if [target.scheme, target.host, target.port] == origin
                 http.get(target.request_uri)
               else
                 Net::HTTP.get_response(target)
               end

        # Never store an error page or redirect stub as if it were the image.
        unless resp.is_a?(Net::HTTPSuccess)
          test_response(resp)
          next
        end

        name = File.basename(target.path)
        if name.empty? || name == '/'
          warn "Skipping #{path}: no file name in URL"
          next
        end

        File.binwrite(File.join('images', name), resp.body.to_s)
        puts "--100%--"
      rescue StandardError => e
        warn "Failed to download #{path}: #{e.message}"
      end
    end
  end
end
# Prints a one-word classification of an HTTP response to standard output.
#
# resp - a Net::HTTPResponse, or any other object (including nil).
#
# Prints 'HTTPServerError', 'HTTPClientError', 'HTTPRedirection' or 'OK'
# according to the response class, and 'UNKNOWN' for anything else.
# Returns nil (the result of puts).
def test_response(resp)
  label = case resp
          when Net::HTTPServerError then 'HTTPServerError'
          when Net::HTTPClientError then 'HTTPClientError'
          when Net::HTTPRedirection then 'HTTPRedirection'
          when Net::HTTPSuccess     then 'OK'
          else 'UNKNOWN'
          end
  puts label
end
## Script entry point
# Page to scan: the first command-line argument, or the default page when no
# argument is given.
URL = ARGV.fetch(0, 'https://www.evault.com/support/customer_login.html')
html = get_html(URL)
paths = parse_html(html)
# Download every image referenced by the page into ./images
downloader(URL, paths)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment