Skip to content

Instantly share code, notes, and snippets.

@wataken44
Created April 26, 2019 02:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wataken44/f7124133ad6671a4cb57c8c9b284624a to your computer and use it in GitHub Desktop.
Save wataken44/f7124133ad6671a4cb57c8c9b284624a to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
# -*- coding: utf-8 -*-
# recursive.rb
require 'fileutils'
require 'logger'
require 'open-uri'
require 'uri'
# Process-wide logger shared by every function in this script; writes to standard output.
$logger = Logger.new($stdout)
# Decides whether a URL should be crawled as a directory listing.
#
# A URL counts as a directory when its path ends with "/" or names an
# index page ("/index.html" or "/index.htm").
#
# @param url [String] URL to classify
# @return [Boolean] true when the URL path looks like a directory or index page
# @raise [URI::InvalidURIError] if url cannot be parsed
def is_directory(url)
  path = URI.parse(url).path
  # Every suffix carries its leading slash so that a file such as
  # "/photos/myindex.htm" is not mistaken for an index page.
  path.end_with?("/", "/index.html", "/index.htm")
end
# Tells whether a URL is one of the files the caller wants to download.
#
# @param url [String] URL to test
# @param pattern [Regexp] pattern the URL must match somewhere
# @return [Boolean] true when pattern matches url
def is_target_file(url, pattern)
  !pattern.match(url).nil?
end
# Tells whether a URL lives underneath the crawl root.
#
# The root must be a leading prefix of the URL. A plain substring search
# would also accept unrelated URLs that merely embed the root somewhere,
# e.g. "http://other/?next=http://root/".
#
# @param url [String] URL to test
# @param root_url [String] URL the crawl started from
# @return [Boolean] true when url starts with root_url
def is_child_path(url, root_url)
  url.start_with?(root_url)
end
# Drops a single leading "/" from a string, if there is one.
#
# @param s [String] text that may begin with a slash
# @return [String] s without its first character when that character is "/",
#   otherwise s itself
def remove_first_slash(s)
  return s unless s.start_with?("/")

  s[1..-1]
end
# Makes a path relative to the crawl root.
#
# Deletes the first occurrence of root_path from path, then strips one
# leading "/" from what is left.
#
# @param path [String] path of the file being processed
# @param root_path [String] path of the crawl root
# @return [String] path with root_path and one leading slash removed
def remove_root_path(path, root_path)
  relative = path.sub(root_path, "")
  remove_first_slash(relative)
end
# Downloads a directory/index page and returns the links found in it.
#
# Links are taken from every double-quoted href="..." attribute in the page
# body and resolved against the page URL, so relative, root-relative,
# protocol-relative and absolute links all come back as absolute URLs
# (keeping the page's scheme, host and port where the link omits them).
# Links that cannot be resolved are logged and left out.
#
# @param url [String] URL of the page to fetch
# @return [Array<String>] absolute URLs of the links on the page
def process_directory(url)
  $logger.info("processing directory: %s" % url)
  # URI.open is the supported open-uri entry point (Kernel#open no longer
  # accepts URLs on Ruby 3.0+); the block form closes the handle even when
  # reading raises.
  body = URI.open(url) { |page| page.read }
  hrefs = body.scan(/href="([^"]+)"/).map { |captures| captures[0] }
  urls = hrefs.map { |href| resolve_href(url, href) }.compact
  $logger.info("found: %d links" % urls.size)
  urls
end

# Resolves one href against the page it came from; returns nil (after
# logging) when the href is not a usable URI reference.
def resolve_href(base_url, href)
  URI.join(base_url, href).to_s
rescue URI::Error
  # A single malformed link should not abort the whole crawl.
  $logger.warn("skip: cannot resolve link: %s" % href)
  nil
end
# Downloads one target file with wget, mirroring its location below the
# crawl root as a directory tree under the current working directory.
#
# The URL and the derived paths come from remote pages, so nothing is passed
# through a shell: the directory is created with FileUtils and wget is run
# with an argument vector. Quotes, spaces or shell metacharacters in a URL
# therefore cannot break or alter the command.
#
# @param url [String] URL of the file to download
# @param root_url [String] URL the crawl started from
# @return [Boolean, nil] result of Kernel#system for the wget invocation
def process_target_file(url, root_url)
  $logger.info("processing target_file: %s" % url)
  u = URI.parse(url)
  root_path = remove_first_slash(URI.parse(root_url).path)
  path = remove_first_slash(u.path)
  dir = File.dirname(remove_root_path(path, root_path))
  FileUtils.mkdir_p(dir)

  destination = File.join(dir, File.basename(u.path))
  argv = ["wget", "-q", url, "-O", destination]
  $logger.info("execute: %s" % argv.join(" "))
  succeeded = system(*argv)
  # system returns false on a non-zero exit and nil when wget cannot be run.
  $logger.warn("download failed: %s" % url) unless succeeded
  succeeded
end
# Handles a single URL taken from the crawl queue.
#
# URLs outside the crawl root are skipped. Directory URLs are fetched and
# their links returned; URLs matching pattern are downloaded; anything else
# is skipped.
#
# @param url [String] URL to process
# @param root_url [String] URL the crawl started from
# @param pattern [Regexp] pattern identifying files to download
# @return [Array] two-element array: the list of newly discovered URLs, and a
#   flag that is true when a directory fetch or file download was attempted
def process_url(url, root_url, pattern)
  $logger.info("processing: %s" % url)

  unless is_child_path(url, root_url)
    $logger.info("skip: not child of %s" % root_url)
    return [[], false]
  end

  if is_directory(url)
    [process_directory(url), true]
  elsif is_target_file(url, pattern)
    process_target_file(url, root_url)
    [[], true]
  else
    $logger.info("skip: neither directory nor target")
    [[], false]
  end
end
# Script entry point: crawls breadth-first from ARGV[0] and downloads every
# file whose URL matches the regular expression given in ARGV[1].
#
# Exits with a usage message when either argument is missing. Each URL is
# processed at most once, and the loop pauses for 2 to 4 seconds after every
# URL for which process_url reports that work was attempted.
def main
  if ARGV.size < 2
    abort("usage: %s ROOT_URL PATTERN" % File.basename($0))
  end

  root_url = ARGV[0]
  pattern = Regexp.compile(ARGV[1])
  queue = [root_url]
  # Hash used as a set so the "already processed?" check stays constant-time
  # however many URLs have been visited.
  done = {}
  until queue.empty?
    url = queue.shift
    next if done.key?(url)

    urls, wait = process_url(url, root_url, pattern)
    queue.concat(urls)
    done[url] = true
    sleep(2 + 2 * rand()) if wait
  end
end
# Run the crawler only when this file is executed directly, not when it is loaded by another script.
main if __FILE__ == $PROGRAM_NAME
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment