mguterl/gist:54598

## gistfile1.rb
#! /usr/bin/env ruby

Main {
  # Text shown at the top of the generated usage message.
  description <<-txt
    pimp3 is easily the greatest program ever written.  if you can't figure
    out what it does hook up some electrodes to your genitals and crank it
    up fuckers!
  txt

  author 'ara.t.howard @ gmail.com'

  # Sample invocations shown in the usage message.
  example <<-txt
    . get the lastest batch of tunes posted to twitter
        prompt ~> pimp3

    . same, with more fanfare
        prompt ~> pimp3  --verbose
  txt

  # Search endpoint whose result pages are scraped (see pages_for).
  option("uri") do
    description "the twitter search uri"
    cast :uri
    default 'http://search.twitter.com/search'
  end

  # How many result pages to request.
  option("pages") do
    description "specify the the number of search pages to scrape default(#{ Default.Pages })"
    argument_required
    default Default.Pages
    cast :integer
  end

  # Endpoint used by expand_uris to resolve each extracted URI.
  option("hugeuri") do
    description "the twitter huge-ifying uri"
    cast :uri
    default 'http://search.twitter.com/hugeurl'
  end

  # Regexp source (compiled case-insensitively in run) that recognises mp3
  # links.
  option("pattern") do
    description "specifiy the mp3 pattern"
    argument_required
    default %|(http://[^\\s]+[^/\\s]+[.]mp3)|
  end

  # Root directory under which host/path based destinations are created.
  option("basedir", "b") do
    description "specifiy the base download dir - default(#{ Default.Basedir })"
    argument_required
    default Default.Basedir
  end

  # Optional flat directory that receives every download; no default.
  option("destination", "d") do
    description "specifiy the absolute download dir - default(#{ File.join Default.Basedir, 'auto-based-on-uri' })"
    argument_required
  end

  # Flag: print the mp3 URIs that were found and exit.
  option("list") do
    description "only list the mp3s that would be scraped"
  end

  # Worker count handed to every threadify call.
  option("threads") do
    description "specify the number of threads to download with in parallel - default(#{ Default.Threads })"
    argument_required
    default Default.Threads
    cast :integer
  end

  # Seconds allowed for each curl fetch.
  option("timeout") do
    description "specify the network timeout for some operations - default(#{ Default.Timeout })"
    argument_required
    default Default.Timeout
    cast :integer
  end

  # Flag: print "src -> dst" pairs instead of downloading.
  option("noop", "n") do
    description "show the downloads that would be performed"
  end

  # Flag: switch the logger to DEBUG level.
  option("verbose", "v") do
    description "turn on verbose logging"
  end

  # Program entry point.
  #
  # Copies the parsed option values into instance variables, runs the
  # scrape pipeline (search pages -> extracted URIs -> expanded URIs ->
  # mp3 URIs) and then either lists the URIs (--list), prints the planned
  # "src -> dst" pairs (--noop) or mirrors the files to disk.
  def run
    logger.level = Logger::DEBUG if param['verbose'].given?

    @uri         = param["uri"].value
    @hugeuri     = param['hugeuri'].value
    @pattern     = Regexp.new params['pattern'].value, Regexp::IGNORECASE
    @threads     = params['threads'].value
    @timeout     = param['timeout'].value
    @pages       = param['pages'].value
    @basedir     = param['basedir'].value
    @destination = param['destination'].value

    # each stage consumes the previous stage's output
    bodies   = parallel_curl(pages_for(@uri))
    found    = extract_uris(bodies)
    expanded = expand_uris(found)
    srcs     = mp3_uris(expanded)

    if param["list"].given?
      puts srcs
      exit
    end

    dsts = destinations_for srcs, @destination
    spec = srcs.zip dsts

    if param["noop"].given?
      spec.each do |src, dst|
        puts "#{ src } -> #{ dst }"
      end
    else
      mirror spec
    end
  end

  # Builds the search-result page URIs that will be scraped.
  #
  # uri - base search URI (String or URI).  Any query it already carries is
  #       kept, with the `q` and `page` parameters overridden.
  #
  # Returns an Array of URI objects for pages 1 through @pages; empty when
  # @pages is zero or negative.
  #
  # The previous implementation pre-incremented the counter (`page+=1`
  # starting from 1), so it requested pages 2..@pages+1 and never page 1.
  def pages_for uri
    base = uri.to_s
    (1..@pages.to_i).map do |page|
      page_uri = URI.parse base
      page_uri.query = query_for(page_uri.query, :q => '#mp3', :page => page)
      page_uri
    end
  end

  # Cache of fetched bodies keyed by URI string.  Failed fetches are stored
  # as Empty, so a bad URI is only attempted once per run.
  Curl = {}

  # Fetches +uri+ and returns its body with surrounding whitespace stripped.
  #
  # uri - anything whose to_s is a URI (or a local path).
  #
  # Returns the cached or freshly fetched body.  On an HTTP error, a
  # timeout (@timeout seconds) or a missing file the error is logged at
  # debug level and Empty is cached and returned.  Any other exception
  # propagates to the caller.
  def curl uri
    uri = uri.to_s
    Timeout.timeout(@timeout) do
      debug{ "curl(#{ uri })" }
      # Since Ruby 3.0 Kernel#open no longer handles URLs, so the old call
      # raised Errno::ENOENT for every URI and was silently cached as Empty.
      # URI.open (open-uri, Ruby >= 2.5) is used when present; older rubies
      # keep the original Kernel#open path.
      Curl[uri] ||=
        if URI.respond_to?(:open)
          URI.open(uri){|f| f.read.strip}
        else
          open(uri){|f| f.read.strip}
        end
    end
  rescue OpenURI::HTTPError, Timeout::Error, Errno::ENOENT => e
    debug{ "#{ e.message } (#{ e.class })" }
    Curl[uri] ||= Empty
  end

  # URIs that are never expanded or scraped: w3.org namespace links,
  # anything hosted on twitter.com (or a subdomain), twitter's S3 bucket
  # and image files.
  #
  # Compared with the previous patterns:
  # - host dots are escaped and the patterns are anchored with \A;
  # - the twitter pattern only matches the real host or a subdomain, not
  #   hosts that merely end in "twitter.com" such as "nottwitter.com";
  # - image types are matched as a file extension (optionally followed by
  #   a query or fragment) instead of as a bare substring, so a URI such as
  #   http://giftedtunes.com/song.mp3 is no longer thrown away.
  #
  # The array is left unfrozen because the singleton method below is
  # defined on it.
  Blacklist = [
    %r{\Ahttp://www\.w3\.org}i,
    %r{\Ahttp://(?:[^./]+\.)*twitter\.com(?=[:/?#]|\z)}i,
    %r{\Ahttp://s3\.amazonaws\.com/twitter_production}i,
    %r{\.(?:png|gif|tiff?|jpe?g)(?=[?#]|\z)}i
  ]

  # Returns true when +uri+ (String or URI) matches any blacklist pattern.
  def Blacklist.ed? uri
    candidate = uri.to_s
    any?{|re| re =~ candidate}
  end

  # Pulls every http and ftp URI out of the given text.
  #
  # strings - any nesting of Strings (nil and blank entries are ignored by
  #           list_for).
  #
  # Returns the sorted, de-duplicated, blacklist-filtered Array produced by
  # uri_list_for; each URI is also logged.
  #
  # The logging used to live in an `ensure` clause that dereferenced
  # `results` even when the extraction had raised, replacing the real
  # exception with a NoMethodError on nil.  It now runs only on success.
  def extract_uris *strings
    log '*** EXTRACTING URIS ***'
    text = list_for(strings).join("\n")
    protocols = %w[ http ftp ]
    results = uri_list_for(protocols.map{|protocol| URI::extract(text, protocol) })
    results.each{|uri| log " - #{ uri }"}
    results
  end

  # Resolves each URI through the @hugeuri service.
  #
  # uris - any nesting of URI strings (nil and blank entries are ignored).
  #
  # Blacklisted URIs are dropped.  Every other URI is sent to @hugeuri as
  # the `url` query parameter; a non-empty response replaces the URI, an
  # empty one leaves it unchanged.
  #
  # Returns the sorted, de-duplicated Array produced by uri_list_for; each
  # URI is also logged.
  #
  # Fixes relative to the previous version:
  # - the threadify block ignored its item and re-mapped the whole list, so
  #   N input URIs triggered N*N expansions; each worker now handles
  #   exactly the URI it is given;
  # - logging no longer runs from `ensure`, where a failure left `results`
  #   nil and masked the real exception with a NoMethodError.
  def expand_uris *uris
    log '*** EXPANDING URIS ***'
    expanded =
      list_for(uris).threadify(@threads) do |uri|
        uri = uri.to_s
        if Blacklist.ed?(uri)
          debug{ "blacklisted : #{ uri }" }
          Empty
        else
          uri = URI.parse uri
          hugeuri = URI.parse @hugeuri.to_s
          hugeuri.query = query_for(:url => uri)
          result = curl(hugeuri)
          result.empty? ? uri.to_s : result
        end
      end
    results = uri_list_for(expanded)
    results.each{|uri| log " - #{ uri }"}
    results
  end

  def query_for *args
    kvs = {}

    list_for(args).each do |arg|
      arg = CGI.parse(arg.to_s) unless Hash === arg
      arg.each{|k,v| kvs.update k.to_s => v}
    end

    list_for(
      kvs.map do |k,vs|
        list_for(vs).map do |v|
          "#{ k }=#{ CGI.escape(v.to_s) }"
        end
      end
    ).join('&')
  end

  # Fetches several URIs through threadify with @threads workers.
  #
  # uris - any nesting of URIs (nil/blank entries are skipped).
  #
  # Returns the non-blank bodies.  A URI whose fetch raises contributes
  # nothing: HTTP errors are dropped silently, anything else is reported
  # with warn first.
  def parallel_curl *uris
    bodies =
      list_for(uris).threadify(@threads) do |uri|
        begin
          curl(uri.to_s)
        rescue OpenURI::HTTPError
          nil
        rescue Object => error
          warn "#{ error.message } (#{ error.class })"
          nil
        end
      end

    list_for(bodies)
  end

  # Reduces a list of URIs to mp3 links.
  #
  # uris - any nesting of URI strings (nil/blank entries are skipped).
  #
  # A URI that itself matches @pattern is kept as is; any other URI is
  # fetched with curl and its body scanned for @pattern matches.
  #
  # Returns the sorted, de-duplicated Array produced by uri_list_for; each
  # URI is also logged.
  #
  # Logging used to run from `ensure`, where a failed fetch left `results`
  # nil and replaced the real exception with a NoMethodError; it now runs
  # only on success so the original error reaches the caller.
  def mp3_uris *uris
    log "*** FILTERING MP3 LINKS ***"
    found =
      list_for(uris).threadify(@threads) do |uri|
        uri.to_s =~ @pattern ? uri : curl(uri).scan(@pattern)
      end
    results = uri_list_for(found)
    results.each{|uri| log " - #{ uri }"}
    results
  end

  # Downloads every [src, dst] pair using @threads workers.
  #
  # spec - Array of [source_uri, destination_path] pairs.
  #
  # A pair is skipped (logged as "src == dst") when the remote
  # Last-Modified time is not newer than the local file's mtime; otherwise
  # the body is written to dst and the file's times are set to the remote
  # Last-Modified.  A failure on one pair is logged as "src ! message" and
  # does not affect the other pairs.
  #
  # Fixes relative to the previous version:
  # - an up-to-date file used `break`, which leaves the whole threadify
  #   iteration instead of just this pair; it is now `next`;
  # - Kernel#open no longer opens URLs on Ruby >= 3.0, so URI.open is used
  #   when open-uri provides it;
  # - the destination is opened with File.open, so the path is always
  #   treated as a plain file name.
  def mirror spec
    log "*** MIRRORING MP3 LINKS ***"
    spec.threadify(@threads) do |src, dst|
      begin
        FileUtils.mkdir_p(File.dirname(dst))
        mtime = File.exist?(dst) ? File.mtime(dst) : Time.at(0)
        last_modified = last_modified_for(src)

        unless last_modified > mtime
          log " - #{ src } == #{ dst }"
          next
        end

        data =
          if URI.respond_to?(:open)
            URI.open(src){|remote| remote.read}
          else
            open(src){|remote| remote.read}
          end

        File.open(dst, "wb"){|local| local.write data}
        File.utime last_modified, last_modified, dst
        log " - #{ src } -> #{ dst }"
      rescue Object => e
        # deliberately broad: one bad download must not stop the others
        log " - #{ src } ! #{ e.message } (#{ e.class })"
      end
    end
  end

  # Asks the server, via a HEAD request, when +uri+ was last modified.
  #
  # uri - String or URI.
  #
  # Returns the Last-Modified header parsed as a Time.  Returns the time of
  # the call when the header is absent or when anything goes wrong
  # (unparseable URI, connection failure, malformed date).
  #
  # The request target is now uri.request_uri rather than uri.path:
  # request_uri keeps the query string (needed for links such as
  # /get.php?id=5) and yields "/" for a bare host instead of an empty,
  # invalid path.
  def last_modified_for uri
    now = Time.now
    uri = URI.parse uri.to_s
    target = uri.respond_to?(:request_uri) ? uri.request_uri : uri.path

    Net::HTTP.start(uri.host, uri.port) {|http|
      value = http.head(target)['last-modified']
      value ? Time.httpdate(value) : now
    }
  rescue Object
    # best effort: any failure means "treat as modified right now"
    now
  end

  # Maps each source URI to the absolute local path it is saved under.
  #
  # srcs        - Array of source URI strings.
  # destination - optional directory; when given, every file goes directly
  #               into it under its cleaned basename.
  #
  # Without a destination the path is @basedir/<host>/<cleaned path
  # segments>.
  #
  # Returns an Array of absolute paths, in the same order as srcs.
  #
  # Empty, "." and ".." path segments are now discarded.  The URIs are
  # scraped from the web, and "http://host/../../x.mp3" previously expanded
  # to a location outside @basedir.
  def destinations_for srcs, destination = nil
    srcs.map do |src|
      File.expand_path(
        if destination
          File.join destination, clean(File.basename(src.to_s))
        else
          uri = URI.parse src.to_s
          segments = uri.path.to_s.split("/").map{|segment| clean segment}
          segments.reject!{|segment| segment.empty? or segment == "." or segment == ".."}
          [ @basedir, uri.host, segments ].flatten.compact.join(File::SEPARATOR)
        end
      )
    end
  end

  # Turns a URI path component into a safe file name: URL-decodes it,
  # replaces every character outside 0-9 a-z A-Z _ @ ) ( ~ . - with an
  # underscore, then collapses runs of underscores into one.
  def clean basename
    decoded = CGI.unescape(basename.to_s)
    sanitized = decoded.gsub(%r/[^0-9a-zA-Z_@)(~.-]/, '_')
    sanitized.squeeze('_')
  end

  # Flattens the arguments into one Array, dropping nils and anything whose
  # string form is blank.
  def list_for *args
    args.flatten.compact.reject{|arg| arg.to_s.strip.empty?}
  end

  # Normalises any nesting of URI strings into a clean list: flattened,
  # without nil/blank entries, de-duplicated, minus blacklisted URIs, and
  # sorted.
  def uri_list_for *args
    candidates = args.flatten.compact.reject{|arg| arg.to_s.strip.empty?}.uniq
    candidates.reject!{|uri| uri.empty? || Blacklist.ed?(uri)}
    candidates.sort
  end

  # Coerces +arg+ to a URI: URI objects are returned untouched, anything
  # else is parsed from its string form.
  def uri_for arg
    return arg if URI === arg
    URI.parse(arg.to_s)
  end

  # Appends each non-blank message to the logger, one per line (a trailing
  # newline on the message is replaced by exactly one).
  def log *messages
    list_for(messages).each do |message|
      logger << message.to_s.chomp
      logger << "\n"
    end
  end
}


BEGIN {
  require "yaml"
  require "timeout"
  require "uri"
  require "open-uri"
  require "fileutils"
  require "cgi"
  require 'net/http'
  require 'time'
  require 'ostruct'

  begin
    require "rubygems"
  rescue LoadError
    42
  end

  begin
    require "main"
  rescue LoadError
    STDERR.puts "gem install main"
    exit 1
  end

  begin
    require "threadify"
  rescue LoadError
    STDERR.puts "gem install threadify"
    exit 1
  end

  # Write output immediately rather than buffering it.
  [ STDOUT, STDERR ].each{|io| io.sync = true}

  # Ctrl-C ends the process at once; exit! skips at_exit handlers.
  Signal.trap("INT"){ exit! }

  # Name of the running script; also used as the per-program directory name
  # under the default base directory.
  This = File.basename(__FILE__)

  # Locates the user's home directory.
  #
  # Tries, in order: $HOME, $USERPROFILE, $HOMEDRIVE + $HOMEPATH, then
  # File.expand_path("~"), and finally the filesystem root ("C:/" when the
  # platform has an alternate path separator, "/" otherwise).
  #
  # Returns the absolute path as a String.
  #
  # HOMEDRIVE already ends with a colon ("C:"), so it is joined to HOMEPATH
  # directly; the previous code inserted a second colon ("C::\Users\x").
  def This.home
    home = ENV["HOME"] || ENV["USERPROFILE"]

    if home.nil? and ENV["HOMEDRIVE"] and ENV["HOMEPATH"]
      home = "#{ ENV['HOMEDRIVE'] }#{ ENV['HOMEPATH'] }"
    end

    if home.nil?
      home =
        begin
          File.expand_path("~")
        rescue StandardError
          File::ALT_SEPARATOR ? "C:/" : "/"
        end
    end

    File.expand_path home
  end

  # Built-in defaults referenced by the option declarations.
  Default = OpenStruct.new(:Home => This.home)
  # downloads land in <home>/mp3/<script name> unless told otherwise
  Default.Basedir = File.join(Default.Home, "mp3", This)
  Default.Threads, Default.Timeout, Default.Pages = 8, 42, 4

  # Shared frozen empty string, used as the "nothing" result of a fetch or
  # expansion.
  Empty = String.new.freeze
}
	#! /usr/bin/env ruby

	Main {
	description <<-txt
	pimp3 is easily the greatest program ever written. if you can't figure
	out what it does hook up some electrodes to your genitals and crank it
	up fuckers!
	txt

	author 'ara.t.howard @ gmail.com'

	example <<-txt
	. get the lastest batch of tunes posted to twitter
	prompt ~> pimp3

	. same, with more fanfare
	prompt ~> pimp3 --verbose
	txt

	option("uri"){
	description "the twitter search uri"
	cast :uri
	default 'http://search.twitter.com/search'
	}

	option("pages"){
	description "specify the the number of search pages to scrape default(#{ Default.Pages })"
	argument_required
	default Default.Pages
	cast :integer
	}

	option("hugeuri"){
	description "the twitter huge-ifying uri"
	cast :uri
	default 'http://search.twitter.com/hugeurl'
	}

	option("pattern"){
	description "specifiy the mp3 pattern"
	argument_required
	default %\|(http://[^\\s]+[^/\\s]+[.]mp3)\|
	}

	option("basedir", "b"){
	description "specifiy the base download dir - default(#{ Default.Basedir })"
	argument_required
	default Default.Basedir
	}

	option("destination", "d"){
	description "specifiy the absolute download dir - default(#{ File.join Default.Basedir, 'auto-based-on-uri' })"
	argument_required
	}

	option("list"){
	description "only list the mp3s that would be scraped"
	}

	option("threads"){
	description "specify the number of threads to download with in parallel - default(#{ Default.Threads })"
	argument_required
	default Default.Threads
	cast :integer
	}

	option("timeout"){
	description "specify the network timeout for some operations - default(#{ Default.Timeout })"
	argument_required
	default Default.Timeout
	cast :integer
	}

	option("noop", "n"){
	description "show the downloads that would be performed"
	}

	option("verbose", "v"){
	description "turn on verbose logging"
	}

	def run
	logger.level = Logger::DEBUG if param['verbose'].given?

	@uri = param["uri"].value
	@threads = params['threads'].value
	@pattern = Regexp.new params['pattern'].value, Regexp::IGNORECASE
	@hugeuri = param['hugeuri'].value
	@timeout = param['timeout'].value
	@destination = param['destination'].value
	@pages = param['pages'].value
	@basedir = param['basedir'].value

	srcs = mp3_uris(expand_uris(extract_uris(parallel_curl(pages_for(@uri)))))

	(puts srcs; exit) if param["list"].given?

	dsts = destinations_for srcs, param["destination"].value
	spec = srcs.zip dsts

	if param["noop"].given?
	spec.each{\|src, dst\| puts "#{ src } -> #{ dst }"}
	else
	mirror spec
	end
	end

	# Builds the search-result page URIs that will be scraped.
	#
	# uri - base search URI (String or URI).  Any query it already carries is
	#       kept, with the `q` and `page` parameters overridden.
	#
	# Returns an Array of URI objects for pages 1 through @pages; empty when
	# @pages is zero or negative.
	#
	# The previous implementation pre-incremented the counter (`page+=1`
	# starting from 1), so it requested pages 2..@pages+1 and never page 1.
	def pages_for uri
	  base = uri.to_s
	  (1..@pages.to_i).map do |page|
	    page_uri = URI.parse base
	    page_uri.query = query_for(page_uri.query, :q => '#mp3', :page => page)
	    page_uri
	  end
	end

	Curl = {}

	def curl uri
	uri = uri.to_s
	Timeout.timeout(@timeout) do
	debug{ "curl(#{ uri })" }
	Curl[uri] \|\|= open(uri){\|f\| f.read.strip}
	end
	rescue OpenURI::HTTPError, Timeout::Error, Errno::ENOENT => e
	debug{ "#{ e.message } (#{ e.class })" }
	Curl[uri] \|\|= Empty
	end

	Blacklist =
	%r\|^http://www.w3.org\|,
	%r\|^http://[^.]*\.?twitter\.com\|,
	%r\|^http://s3.amazonaws.com/twitter_production\|,
	Regexp.union('png', 'gif', 'tiff', 'jpg')

	def Blacklist.ed? uri
	any?{\|re\| re.match(uri.to_s)}
	end

	def extract_uris *strings
	log '* EXTRACTING URIS *'
	string = list_for(strings).join("\n")
	protocols = %w[ http ftp ]
	results = uri_list_for(protocols.map{\|protocol\| URI::extract(string, protocol) })
	ensure
	results.each{\|uri\| log " - #{ uri }"}
	end

	# Resolves each URI through the @hugeuri service.
	#
	# uris - any nesting of URI strings (nil and blank entries are ignored).
	#
	# Blacklisted URIs are dropped.  Every other URI is sent to @hugeuri as
	# the `url` query parameter; a non-empty response replaces the URI, an
	# empty one leaves it unchanged.
	#
	# Returns the sorted, de-duplicated Array produced by uri_list_for; each
	# URI is also logged.
	#
	# Each worker now handles exactly the URI it is given (the old block
	# re-mapped the whole list per item), and logging no longer runs from
	# `ensure`, where a failure masked the real exception with a
	# NoMethodError on nil.
	def expand_uris *uris
	  log '* EXPANDING URIS *'
	  expanded =
	    list_for(uris).threadify(@threads) do |uri|
	      uri = uri.to_s
	      if Blacklist.ed?(uri)
	        debug{ "blacklisted : #{ uri }" }
	        Empty
	      else
	        uri = URI.parse uri
	        hugeuri = URI.parse @hugeuri.to_s
	        hugeuri.query = query_for(:url => uri)
	        result = curl(hugeuri)
	        result.empty? ? uri.to_s : result
	      end
	    end
	  results = uri_list_for(expanded)
	  results.each{|uri| log " - #{ uri }"}
	  results
	end

	def query_for *args
	kvs = {}

	list_for(args).each do \|arg\|
	arg = CGI.parse(arg.to_s) unless Hash === arg
	arg.each{\|k,v\| kvs.update k.to_s => v}
	end

	list_for(
	kvs.map do \|k,vs\|
	list_for(vs).map do \|v\|
	"#{ k }=#{ CGI.escape(v.to_s) }"
	end
	end
	).join('&')
	end

	def parallel_curl *uris
	list_for(
	list_for(uris).threadify(@threads) do \|uri\|
	begin
	curl(uri.to_s)
	rescue OpenURI::HTTPError
	nil
	rescue Object => e
	warn "#{ e.message } (#{ e.class })"
	nil
	end
	end
	)
	end

	def mp3_uris *uris
	log "* FILTERING MP3 LINKS *"
	results =
	uri_list_for(
	list_for(uris).threadify(@threads) do \|uri\|
	uri.to_s =~ @pattern ? uri : curl(uri).scan(@pattern)
	end
	)
	ensure
	results.each{\|uri\| log " - #{ uri }"}
	end

	# Downloads every [src, dst] pair using @threads workers.
	#
	# spec - Array of [source_uri, destination_path] pairs.
	#
	# A pair is skipped (logged as "src == dst") when the remote
	# Last-Modified time is not newer than the local file's mtime; otherwise
	# the body is written to dst and the file's times are set to the remote
	# Last-Modified.  A failure on one pair is logged as "src ! message" and
	# does not affect the other pairs.
	#
	# An up-to-date file now uses `next` (the old `break` left the whole
	# threadify iteration), and URI.open is used when open-uri provides it
	# because Kernel#open no longer opens URLs on Ruby >= 3.0.
	def mirror spec
	  log "* MIRRORING MP3 LINKS *"
	  spec.threadify(@threads) do |src, dst|
	    begin
	      FileUtils.mkdir_p(File.dirname(dst))
	      mtime = File.exist?(dst) ? File.mtime(dst) : Time.at(0)
	      last_modified = last_modified_for(src)

	      unless last_modified > mtime
	        log " - #{ src } == #{ dst }"
	        next
	      end

	      data =
	        if URI.respond_to?(:open)
	          URI.open(src){|remote| remote.read}
	        else
	          open(src){|remote| remote.read}
	        end

	      File.open(dst, "wb"){|local| local.write data}
	      File.utime last_modified, last_modified, dst
	      log " - #{ src } -> #{ dst }"
	    rescue Object => e
	      # deliberately broad: one bad download must not stop the others
	      log " - #{ src } ! #{ e.message } (#{ e.class })"
	    end
	  end
	end

	def last_modified_for uri
	now = Time.now
	uri = URI.parse uri.to_s
	host = uri.host
	port = uri.port
	path = uri.path

	Net::HTTP.start(host, port) {\|http\|
	response = http.head(path)
	value = response['last-modified']
	value ? Time.httpdate(value) : now
	}
	rescue Object
	now
	end

	def destinations_for srcs, destination = nil
	srcs.map do \|src\|
	basename = File.basename src
	basename = clean basename
	File.expand_path(
	if destination
	File.join destination, basename
	else
	uri = URI.parse src.to_s
	host, paths = uri.host, uri.path.split("/").map{\|path\| clean path}
	basename = clean paths.pop
	[ @basedir, host, paths, basename ].flatten.compact.join(File::SEPARATOR)
	end
	)
	end
	end

	def clean basename
	CGI.unescape(basename.to_s).gsub(%r/[^0-9a-zA-Z_@)(~.-]/, '_').gsub(%r/_+/,'_')
	end

	def list_for *args
	args.flatten.compact.select{\|arg\| not arg.to_s.strip.empty?}
	end

	def uri_list_for *args
	args = args.flatten.compact.select{\|arg\| not arg.to_s.strip.empty?}
	args.uniq!
	args.delete_if{\|uri\| uri.empty? or Blacklist.ed?(uri)}
	args.sort
	end

	def uri_for arg
	URI === arg ? arg : URI.parse(arg.to_s)
	end

	def log *messages
	list_for(messages).each{\|message\| logger << message.to_s.chomp; logger << "\n"}
	end
	}


	BEGIN {
	require "yaml"
	require "timeout"
	require "uri"
	require "open-uri"
	require "fileutils"
	require "cgi"
	require 'net/http'
	require 'time'
	require 'ostruct'

	begin
	require "rubygems"
	rescue LoadError
	42
	end

	begin
	require "main"
	rescue LoadError
	STDERR.puts "gem install main"
	exit 1
	end

	begin
	require "threadify"
	rescue LoadError
	STDERR.puts "gem install threadify"
	exit 1
	end

	STDERR.sync = STDOUT.sync = true

	trap("INT"){ exit! }

	# Name of the running script; also used as the per-program directory name
	# under the default base directory.
	This = File.basename(__FILE__)

	# Locates the user's home directory.
	#
	# Tries, in order: $HOME, $USERPROFILE, $HOMEDRIVE + $HOMEPATH, then
	# File.expand_path("~"), and finally the filesystem root ("C:/" when the
	# platform has an alternate path separator, "/" otherwise).
	#
	# Returns the absolute path as a String.
	#
	# HOMEDRIVE already ends with a colon ("C:"), so it is joined to HOMEPATH
	# directly; the previous code inserted a second colon.
	def This.home
	  home = ENV["HOME"] || ENV["USERPROFILE"]

	  if home.nil? and ENV["HOMEDRIVE"] and ENV["HOMEPATH"]
	    home = "#{ ENV['HOMEDRIVE'] }#{ ENV['HOMEPATH'] }"
	  end

	  if home.nil?
	    home =
	      begin
	        File.expand_path("~")
	      rescue StandardError
	        File::ALT_SEPARATOR ? "C:/" : "/"
	      end
	  end

	  File.expand_path home
	end

	Default = OpenStruct.new
	Default.Home = This.home
	Default.Basedir = File.join(Default.Home, "mp3", This)
	Default.Threads = 8
	Default.Timeout = 42
	Default.Pages = 4

	Empty = String.new.freeze
	}