Created
April 26, 2019 02:29
-
-
Save wataken44/f7124133ad6671a4cb57c8c9b284624a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# -*- coding: utf-8 -*- | |
# recursive.rb | |
require 'fileutils'
require 'logger'
require 'open-uri'
require 'uri'
# Global logger used by every function in this script; it writes to standard output.
$logger = Logger.new(STDOUT)
# Decide whether a URL should be crawled as a directory listing.
#
# A URL counts as a directory when its path is empty (bare host), ends with
# "/", or names an index page ("/index.html" or "/index.htm").
#
# @param url [String] absolute URL to classify
# @return [Boolean] true when the URL looks like a directory
# @raise [URI::InvalidURIError] when +url+ cannot be parsed
def is_directory(url)
  path = URI.parse(url).path
  # Opaque URIs (e.g. mailto:) have no path at all and are never directories.
  return false if path.nil?

  # Every suffix carries its leading slash so that "/myindex.htm" is not
  # mistaken for an index page.
  path.empty? || path.end_with?("/", "/index.html", "/index.htm")
end
# Tell whether a URL is one of the files the caller wants to download.
#
# @param url [String] URL to test
# @param pattern [Regexp] pattern a wanted URL must match
# @return [Boolean] true when +pattern+ matches anywhere in +url+
def is_target_file(url, pattern)
  # match? yields the boolean directly, without building a MatchData object.
  pattern.match?(url)
end
# Tell whether a URL lives underneath the crawl root.
#
# The comparison is a plain string prefix test: the root must appear at the
# very start of the URL. A root that merely shows up later (for example inside
# a query string of another site) does not count.
#
# @param url [String] candidate URL
# @param root_url [String] URL the crawl started from
# @return [Boolean] true when +url+ begins with +root_url+
def is_child_path(url, root_url)
  url.start_with?(root_url)
end
# Drop a single leading "/" from a string.
#
# @param s [String] text that may start with a slash
# @return [String] +s+ without its first character when that character is
#   "/", otherwise +s+ itself
def remove_first_slash(s)
  s.start_with?("/") ? s[1..-1] : s
end
# Express a path relative to the crawl root.
#
# The root is removed only when it is a leading prefix of +path+; a path that
# does not start with the root is returned unchanged apart from one leading
# "/". Any "/" left at the front after removing the root is dropped as well.
#
# @param path [String] path of the resource being processed
# @param root_path [String] path of the crawl root
# @return [String] +path+ with the leading root and one leading "/" removed
def remove_root_path(path, root_path)
  path.delete_prefix(root_path).delete_prefix("/")
end
# Fetch a directory page and return the absolute URLs of every link on it.
#
# Links are taken from double-quoted href attributes and resolved against
# +url+ with URI.join, so relative, root-relative and absolute links all go
# through the same path and keep the scheme, host and port of the page they
# were found on. Fragments are dropped because they point at the same
# document. Links that cannot be parsed as URIs are logged and skipped.
#
# @param url [String] absolute URL of the directory page
# @return [Array<String>] absolute URLs found on the page, in document order
def process_directory(url)
  $logger.info("processing directory: %s" % url)

  # URI.open is the open-uri entry point; the block form closes the
  # connection even when reading fails.
  body = URI.open(url, &:read)
  hrefs = body.scan(/href="([^"]+)"/).flatten

  urls = hrefs.map { |href| resolve_href(url, href) }.compact
  $logger.info("found: %d links" % urls.size)
  urls
end

# Resolve one href against the page it was found on.
# Returns the absolute URL without fragment, or nil when the href is not a
# parsable URI.
def resolve_href(base_url, href)
  resolved = URI.join(base_url, href)
  resolved.fragment = nil
  resolved.to_s
rescue URI::Error
  $logger.info("skip: unparsable link %s" % href)
  nil
end
# Download one target file into a local directory tree that mirrors its
# location below the crawl root.
#
# The destination directory is the file's path relative to the root URL's
# path; it is created if missing and the file is fetched with the external
# `wget` program.
#
# The URL comes from scraped HTML, so nothing is passed through a shell: the
# directory is created with FileUtils and wget receives its arguments as a
# list. Paths containing ".." segments are refused so a crafted link cannot
# write outside the working directory.
#
# @param url [String] absolute URL of the file to download
# @param root_url [String] URL the crawl started from
# @return [Boolean, nil] result of Kernel#system for the wget call, or false
#   when the download was skipped
def process_target_file(url, root_url)
  $logger.info("processing target_file: %s" % url)

  target = URI.parse(url)
  root_path = remove_first_slash(URI.parse(root_url).path)
  path = remove_first_slash(target.path)
  relative = remove_root_path(path, root_path)

  if relative.split("/").include?("..")
    $logger.warn("skip: unsafe path %s" % relative)
    return false
  end

  dir = File.dirname(relative)
  base = File.basename(target.path)

  begin
    FileUtils.mkdir_p(dir)
  rescue SystemCallError => e
    # Keep crawling when one directory cannot be created.
    $logger.error("mkdir failed for %s: %s" % [dir, e.message])
    return false
  end

  command = ["wget", "-q", url, "-O", File.join(dir, base)]
  $logger.info("execute: %s" % command.join(" "))
  # Argument-list form of system: no shell is involved, so quotes or
  # metacharacters in the URL are passed to wget verbatim.
  system(*command)
end
# Handle a single URL taken from the crawl queue.
#
# URLs outside the root are skipped, directories are scanned for links, and
# files matching +pattern+ are downloaded; anything else is skipped.
#
# @param url [String] URL to handle
# @param root_url [String] URL the crawl started from
# @param pattern [Regexp] pattern identifying files to download
# @return [Array(Array<String>, Boolean)] the newly discovered URLs and a
#   flag that is true when a directory or target file was processed
def process_url(url, root_url, pattern)
  $logger.info("processing: %s" % url)

  if !is_child_path(url, root_url)
    $logger.info("skip: not child of %s" % root_url)
    [[], false]
  elsif is_directory(url)
    [process_directory(url), true]
  elsif is_target_file(url, pattern)
    process_target_file(url, root_url)
    [[], true]
  else
    $logger.info("skip: neither directory nor target")
    [[], false]
  end
end
# Entry point: crawl breadth-first from ARGV[0], downloading every file whose
# URL matches the regular expression given in ARGV[1].
#
# Each URL is processed at most once. After a URL that was actually processed
# (the second value returned by process_url is true) the loop sleeps for 2 to
# 4 seconds before continuing.
#
# Exits with a usage message when fewer than two arguments are given.
def main
  abort("usage: %s ROOT_URL PATTERN" % $0) if ARGV.size < 2

  root_url = ARGV[0]
  pattern = Regexp.compile(ARGV[1])

  queue = [root_url]
  # Hash used as a set: constant-time "already visited" checks.
  done = {}

  until queue.empty?
    url = queue.shift
    next if done.key?(url)

    urls, wait = process_url(url, root_url, pattern)
    queue.concat(urls)
    done[url] = true
    sleep(2 + 2 * rand) if wait
  end
end
# Run the crawler only when this file is executed directly, not when it is
# loaded from another script.
main if __FILE__ == $PROGRAM_NAME
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment