michaeldv/gist:193122

## gistfile1.rb
#! /usr/bin/env ruby

Main {
  # Home directory: the first of $HOME / $USERPROFILE that is set, else "~".
  Home = File.expand_path(ENV.values_at("HOME", "USERPROFILE").compact.first || "~")
  # Default value of the "basedir" option.
  Basedir = File.join(Home, "mp3")
  # Default value of the "threads" option.
  Threads = 8

  # Program description passed to the main DSL.
  description <<-txt
    mp3scrape will scour any url for its mp3 content - the script mirrors,
    never downloading the same file twice.  it does not, however, crawl a
    website for links, it simply scrapes all the songs from a single page.
  txt

  # Extra usage sections: install command and project uri.
  usage['INSTALL'] = 'sudo gem install mp3scrape'

  usage['URI'] = 'http://codeforpeople.com'

  # Example invocations (the command is mp3scrape in both).
  example <<-txt
    1) get a bunch of xmas tunes
      mp3scrape http://fuelfriends.blogspot.com/2007/12/christmas-mixery.html

    2) get a bunch of tunes
      mp3scrape http://troubledsoulsunite.blogspot.com/
  txt

  # Positional argument: the page to scrape.
  argument("uri"){
    description "the uri to scrape"
    cast :uri
  }

  # Regex source used to find mp3 links; `run` compiles it with %r and scans
  # the page with it.  The dot before "mp3" is escaped so that only a literal
  # ".mp3" suffix matches, and both http and https links are accepted.
  option("pattern", "p"){
    description "specify the mp3 pattern"
    argument_required
    default %|['"](https?://[^\\s]+[^/\\s]+\\.mp3)["']|
  }

  # Root directory for downloads when no explicit destination is given.
  option("basedir", "b"){
    description "specify the base download dir - default(#{ Basedir })"
    argument_required
    default Basedir
  }

  # Explicit download directory; has no default.
  option("destination", "d"){
    description "specify the absolute download dir - default(#{ File.join Basedir, 'auto-based-on-uri' })"
    argument_required
  }

  # Flag: print the scraped links and stop.
  option("list"){
    description "only list the mp3s that would be scraped"
  }

  # Number of download threads handed to threadify in `mirror`.
  option("threads", "t"){
    description "specify the number of threads to download with in parallel - default(#{ Threads })"
    argument_required
    default Threads
    cast :integer
  }

  # Flag: print "src -> dst" for each planned download instead of fetching.
  option("noop", "n"){
    description "show the downloads that would be performed"
  }

  # Scrape the page named by the "uri" argument for mp3 links, then either
  # list them (--list), print the planned downloads (--noop), or mirror them.
  def run
    uri = URI.parse(param["uri"].value.to_s)
    pattern = %r/#{ param["pattern"].value }/

    # URIs that open-uri can fetch respond to #read.  Calling it on the URI
    # object (rather than Kernel#open on a string) keeps working on ruby 3.0+,
    # where Kernel#open no longer opens urls, and does not leave an open
    # handle behind.  Anything else is treated as a local file path.
    page = uri.respond_to?(:read) ? uri.read : File.read(uri.to_s)

    # uniq: the same song linked twice on a page would otherwise be fetched
    # twice, possibly by two threads writing the same destination file.
    srcs = page.scan(pattern).flatten.compact.uniq

    if param["list"].given?
      puts srcs
      exit
    end

    dsts = destinations_for srcs, param["destination"].value
    spec = srcs.zip dsts

    if param["noop"].given?
      spec.each{|src, dst| puts "#{ src } -> #{ dst }"}
    else
      mirror spec
    end
  end

  # Download every [src, dst] pair in +spec+ in parallel, using the number of
  # threads given by the "threads" option.  A file is (re)written only when
  # the remote Last-Modified time is newer than the local mtime; the local
  # mtime is then set to the remote time.  Failures are reported on STDERR
  # and do not stop the other downloads.
  def mirror spec
    spec.threadify(params["threads"].value) do |src, dst|
      begin
        FileUtils.mkdir_p(File.dirname(dst))

        # a missing or unreadable destination counts as infinitely old
        mtime =
          begin
            File.mtime(dst)
          rescue SystemCallError
            Time.at(0)
          end

        # open through the parsed URI, never Kernel#open: the string comes
        # from a scraped page (a leading "|" would run a command) and
        # Kernel#open stopped handling urls in ruby 3.0
        URI.parse(src.to_s).open do |remote|
          # no Last-Modified header: treat the remote copy as brand new
          last_modified = remote.last_modified || Time.now

          if last_modified > mtime
            File.open(dst, "wb"){|local| local.write remote.read}
            File.utime last_modified, last_modified, dst
            puts "#{ src } -> #{ dst }"
          else
            puts "#{ src } == #{ dst }"
          end
        end
      rescue StandardError => e
        # StandardError only, so exit and interrupt requests still propagate
        STDERR.puts "#{ e.message } (#{ e.class })"
      end
    end
  end

  # Map each source url to the absolute local path it will be saved to.
  #
  # srcs        - list of source urls
  # destination - when given, every file goes directly into this directory
  #               under its cleaned basename
  # basedir     - root for the mirrored layout basedir/host/path...; when
  #               omitted the value of the "basedir" option is used, so that
  #               --basedir actually takes effect
  #
  # Returns an array of expanded paths, in the same order as +srcs+.
  def destinations_for srcs, destination = nil, basedir = nil
    srcs.map do |src|
      path =
        if destination
          File.join(destination, clean(File.basename(src)))
        else
          # looked up lazily: only needed when mirroring under the base dir
          basedir ||= params["basedir"].value
          uri = URI.parse(src.to_s)
          segments = uri.path.to_s.split("/").map{|segment| clean segment}
          # a nil host is dropped by compact; File.join absorbs the empty
          # leading segment produced by the path's initial "/"
          File.join(*[ basedir, uri.host, segments ].flatten.compact)
        end
      File.expand_path(path)
    end
  end

  # Turn one url path component into a safe file name: url-decode it, replace
  # every character outside [0-9a-zA-Z_@)(~.-] with "_" and collapse runs of
  # "_".  nil becomes "".
  def clean basename
    name = CGI.unescape(basename.to_s)
    # decoding can produce bytes that are not valid in the string's encoding
    # (e.g. "%E9"); gsub would raise on those, so replace them first
    name = name.scrub("_") if name.respond_to?(:scrub)
    name = name.gsub(%r/[^0-9a-zA-Z_@)(~.-]/, '_').gsub(%r/_+/, '_')
    # "." and ".." are directory references, not names; letting them through
    # would allow a scraped url to point outside the download directory
    %w[. ..].include?(name) ? '_' : name
  end
}

# Bootstrap: BEGIN runs before the rest of the file, so every library is
# loaded before the Main block above is evaluated.
BEGIN {
  # standard library
  %w[ yaml uri open-uri fileutils cgi ].each{|lib| require lib}

  # rubygems is optional
  begin
    require "rubygems"
  rescue LoadError
    nil
  end

  # required gems: print an install hint and give up when one is missing
  %w[ main threadify ].each do |gem_name|
    begin
      require gem_name
    rescue LoadError
      STDERR.puts "gem install #{ gem_name }"
      exit 1
    end
  end

  # unbuffered output
  STDERR.sync = STDOUT.sync = true

  # ctrl-c terminates immediately
  trap("INT"){ exit! }
}
	#! /usr/bin/env ruby

	Main {
	# Home directory: the first of $HOME / $USERPROFILE that is set, else "~".
	# (the `\|\|` markdown escapes were not valid ruby and are removed)
	Home = File.expand_path(ENV["HOME"] || ENV["USERPROFILE"] || "~")
	# Default value of the "basedir" option.
	Basedir = File.join(Home, "mp3")
	# Default value of the "threads" option.
	Threads = 8

	# Program description passed to the main DSL.
	description <<-txt
	mp3scrape will scour any url for its mp3 content - the script mirrors,
	never downloading the same file twice. it does not, however, crawl a
	website for links, it simply scrapes all the songs from a single page.
	txt

	# Extra usage sections: install command and project uri.
	usage['INSTALL'] = 'sudo gem install mp3scrape'

	usage['URI'] = 'http://codeforpeople.com'

	# Example invocations (the command is mp3scrape in both).
	example <<-txt
	1) get a bunch of xmas tunes
	mp3scrape http://fuelfriends.blogspot.com/2007/12/christmas-mixery.html

	2) get a bunch of tunes
	mp3scrape http://troubledsoulsunite.blogspot.com/
	txt

	# Positional argument: the page to scrape.
	argument("uri"){
	  description "the uri to scrape"
	  cast :uri
	}

	# Regex source used to find mp3 links; `run` compiles it with %r and scans
	# the page with it.  The markdown-escaped delimiters are restored, the dot
	# before "mp3" is escaped so that only a literal ".mp3" suffix matches, and
	# both http and https links are accepted.
	option("pattern", "p"){
	  description "specify the mp3 pattern"
	  argument_required
	  default %|['"](https?://[^\\s]+[^/\\s]+\\.mp3)["']|
	}

	# Root directory for downloads when no explicit destination is given.
	option("basedir", "b"){
	  description "specify the base download dir - default(#{ Basedir })"
	  argument_required
	  default Basedir
	}

	# Explicit download directory; has no default.
	option("destination", "d"){
	  description "specify the absolute download dir - default(#{ File.join Basedir, 'auto-based-on-uri' })"
	  argument_required
	}

	# Flag: print the scraped links and stop.
	option("list"){
	  description "only list the mp3s that would be scraped"
	}

	# Number of download threads handed to threadify in `mirror`.
	option("threads", "t"){
	  description "specify the number of threads to download with in parallel - default(#{ Threads })"
	  argument_required
	  default Threads
	  cast :integer
	}

	# Flag: print "src -> dst" for each planned download instead of fetching.
	option("noop", "n"){
	  description "show the downloads that would be performed"
	}

	# Scrape the page named by the "uri" argument for mp3 links, then either
	# list them (--list), print the planned downloads (--noop), or mirror them.
	def run
	  uri = URI.parse(param["uri"].value.to_s)
	  pattern = %r/#{ param["pattern"].value }/

	  # URIs that open-uri can fetch respond to #read.  Calling it on the URI
	  # object (rather than Kernel#open on a string) keeps working on ruby 3.0+,
	  # where Kernel#open no longer opens urls, and does not leave an open
	  # handle behind.  Anything else is treated as a local file path.
	  page = uri.respond_to?(:read) ? uri.read : File.read(uri.to_s)

	  # uniq: the same song linked twice on a page would otherwise be fetched
	  # twice, possibly by two threads writing the same destination file.
	  srcs = page.scan(pattern).flatten.compact.uniq

	  if param["list"].given?
	    puts srcs
	    exit
	  end

	  dsts = destinations_for srcs, param["destination"].value
	  spec = srcs.zip dsts

	  if param["noop"].given?
	    spec.each{|src, dst| puts "#{ src } -> #{ dst }"}
	  else
	    mirror spec
	  end
	end

	# Download every [src, dst] pair in +spec+ in parallel, using the number of
	# threads given by the "threads" option.  A file is (re)written only when
	# the remote Last-Modified time is newer than the local mtime; the local
	# mtime is then set to the remote time.  Failures are reported on STDERR
	# and do not stop the other downloads.
	def mirror spec
	  spec.threadify(params["threads"].value) do |src, dst|
	    begin
	      FileUtils.mkdir_p(File.dirname(dst))

	      # a missing or unreadable destination counts as infinitely old
	      mtime =
	        begin
	          File.mtime(dst)
	        rescue SystemCallError
	          Time.at(0)
	        end

	      # open through the parsed URI, never Kernel#open: the string comes
	      # from a scraped page (a leading "|" would run a command) and
	      # Kernel#open stopped handling urls in ruby 3.0
	      URI.parse(src.to_s).open do |remote|
	        # no Last-Modified header: treat the remote copy as brand new
	        last_modified = remote.last_modified || Time.now

	        if last_modified > mtime
	          File.open(dst, "wb"){|local| local.write remote.read}
	          File.utime last_modified, last_modified, dst
	          puts "#{ src } -> #{ dst }"
	        else
	          puts "#{ src } == #{ dst }"
	        end
	      end
	    rescue StandardError => e
	      # StandardError only, so exit and interrupt requests still propagate
	      STDERR.puts "#{ e.message } (#{ e.class })"
	    end
	  end
	end

	# Map each source url to the absolute local path it will be saved to.
	#
	# srcs        - list of source urls
	# destination - when given, every file goes directly into this directory
	#               under its cleaned basename
	# basedir     - root for the mirrored layout basedir/host/path...; when
	#               omitted the value of the "basedir" option is used, so that
	#               --basedir actually takes effect
	#
	# Returns an array of expanded paths, in the same order as +srcs+.
	def destinations_for srcs, destination = nil, basedir = nil
	  srcs.map do |src|
	    path =
	      if destination
	        File.join(destination, clean(File.basename(src)))
	      else
	        # looked up lazily: only needed when mirroring under the base dir
	        basedir ||= params["basedir"].value
	        uri = URI.parse(src.to_s)
	        segments = uri.path.to_s.split("/").map{|segment| clean segment}
	        # a nil host is dropped by compact; File.join absorbs the empty
	        # leading segment produced by the path's initial "/"
	        File.join(*[ basedir, uri.host, segments ].flatten.compact)
	      end
	    File.expand_path(path)
	  end
	end

	# Turn one url path component into a safe file name: url-decode it, replace
	# every character outside [0-9a-zA-Z_@)(~.-] with "_" and collapse runs of
	# "_".  nil becomes "".
	def clean basename
	  name = CGI.unescape(basename.to_s)
	  # decoding can produce bytes that are not valid in the string's encoding
	  # (e.g. "%E9"); gsub would raise on those, so replace them first
	  name = name.scrub("_") if name.respond_to?(:scrub)
	  name = name.gsub(%r/[^0-9a-zA-Z_@)(~.-]/, '_').gsub(%r/_+/, '_')
	  # "." and ".." are directory references, not names; letting them through
	  # would allow a scraped url to point outside the download directory
	  %w[. ..].include?(name) ? '_' : name
	end
	}

	# Bootstrap: BEGIN runs before the rest of the file, so every library is
	# loaded before the Main block above is evaluated.
	BEGIN {
	  # standard library
	  %w[ yaml uri open-uri fileutils cgi ].each{|lib| require lib}

	  # rubygems is optional
	  begin
	    require "rubygems"
	  rescue LoadError
	    nil
	  end

	  # required gems: print an install hint and give up when one is missing
	  %w[ main threadify ].each do |gem_name|
	    begin
	      require gem_name
	    rescue LoadError
	      STDERR.puts "gem install #{ gem_name }"
	      exit 1
	    end
	  end

	  # unbuffered output
	  STDERR.sync = STDOUT.sync = true

	  # ctrl-c terminates immediately
	  trap("INT"){ exit! }
	}