Skip to content

Instantly share code, notes, and snippets.

@michaeldv
Forked from ahoward/gist:192968
Created September 24, 2009 23:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save michaeldv/193122 to your computer and use it in GitHub Desktop.
Save michaeldv/193122 to your computer and use it in GitHub Desktop.
#! /usr/bin/env ruby
Main {
# Home directory of the current user: $HOME, then %USERPROFILE%, then
# whatever "~" expands to.
Home = File.expand_path(ENV["HOME"] || ENV["USERPROFILE"] || "~")
# Default root directory that downloads are mirrored under.
Basedir = File.join(Home, "mp3")
# Default number of parallel download threads.
Threads = 8

# Help text shown by the main gem (description / usage / example sections).
description <<-txt
mp3scrape will scour any url for its mp3 content - the script mirrors,
never downloading the same file twice. it does not, however, crawl a
website for links, it simply scrapes all the songs from a single page.
txt

usage['INSTALL'] = 'sudo gem install mp3scrape'
usage['URI'] = 'http://codeforpeople.com'

example <<-txt
1) get a bunch of xmas tunes
mp3scrape http://fuelfriends.blogspot.com/2007/12/christmas-mixery.html
2) get a bunch of tunes
mp3scrape http://troubledsoulsunite.blogspot.com/
txt

# Positional argument: the page to scrape.
argument("uri"){
  description "the uri to scrape"
  cast :uri
}

# Regular expression source used to pull mp3 urls out of the page; the first
# capture group is the url. The default accepts quoted http and https urls
# that end in a literal ".mp3".
option("pattern", "p"){
  description "specify the mp3 pattern"
  argument_required
  default %|['"](https?://[^\\s]+[^/\\s]+\\.mp3)["']|
}

option("basedir", "b"){
  description "specify the base download dir - default(#{ Basedir })"
  argument_required
  default Basedir
}

# No default: when omitted the destination is derived from each url.
option("destination", "d"){
  description "specify the absolute download dir - default(#{ File.join Basedir, 'auto-based-on-uri' })"
  argument_required
}

option("list"){
  description "only list the mp3s that would be scraped"
}

option("threads", "t"){
  description "specify the number of threads to download with in parallel - default(#{ Threads })"
  argument_required
  default Threads
  cast :integer
}

option("noop", "n"){
  description "show the downloads that would be performed"
}
# Program entry point.
#
# Fetches the page named by the "uri" argument, extracts every match of the
# "pattern" option from it, and then either
# * prints the matches and exits            (--list),
# * prints the planned "src -> dst" pairs   (--noop), or
# * downloads them via #mirror.
def run
  uri = params["uri"].value
  pattern = %r/#{ params["pattern"].value }/

  # Fetch through open-uri's URI#read instead of Kernel#open: Kernel#open no
  # longer opens urls on ruby 3.0+, and it treats a leading "|" as a command
  # to run. Anything that is not a fetchable url is read as a local file,
  # which is what Kernel#open did with such a string.
  location = URI.parse(uri.to_s)
  page = location.respond_to?(:read) ? location.read : File.read(uri.to_s)

  # uniq: a page frequently links the same file more than once, and
  # duplicates would make two threads download into the same destination.
  srcs = page.scan(pattern).flatten.compact.uniq

  if params["list"].given?
    puts srcs
    exit
  end

  dsts = destinations_for srcs, params["destination"].value
  spec = srcs.zip dsts

  if params["noop"].given?
    spec.each{|src, dst| puts "#{ src } -> #{ dst }"}
  else
    mirror spec
  end
end
# Download every [src, dst] pair in spec, in parallel.
#
# spec - Array of [source url String, destination path String] pairs.
#
# A file is only (re)written when the remote Last-Modified time (or the
# current time, if the server sends none) is newer than the local file's
# mtime; after a download the local mtime is set to that remote time.
# A failure on one pair is reported on STDERR and does not stop the others.
def mirror spec
  spec.threadify(params["threads"].value) do |src, dst|
    begin
      FileUtils.mkdir_p(File.dirname(dst))

      # A destination that cannot be stat'ed counts as infinitely old.
      mtime =
        begin
          File.mtime(dst)
        rescue SystemCallError
          Time.at(0)
        end

      # URI#open (open-uri) rather than Kernel#open: src comes from a scraped
      # page, and Kernel#open would run a "|"-prefixed string as a command
      # and no longer opens urls at all on ruby 3.0+.
      URI.parse(src.to_s).open do |remote|
        last_modified = remote.last_modified || Time.now

        if last_modified > mtime
          data = remote.read
          File.open(dst, "wb"){|file| file.write data}
          File.utime last_modified, last_modified, dst
          # one print call including the newline keeps each line in a single
          # write, so concurrent threads do not split each other's lines
          print "#{ src } -> #{ dst }\n"
        else
          print "#{ src } == #{ dst }\n"
        end
      end
    rescue StandardError => e
      # StandardError only: exit requests and signals must not be swallowed.
      STDERR.puts "#{ e.message } (#{ e.class })"
    end
  end
end
# Map each source url to the absolute local path it will be saved to.
#
# srcs        - Array of url Strings.
# destination - optional directory. When given, every file is placed
#               directly inside it under its cleaned basename. When nil, the
#               path is <basedir>/<host>/<cleaned url path>, where basedir is
#               the value of the "basedir" option.
#
# Returns an Array of absolute path Strings, in the same order as srcs.
def destinations_for srcs, destination = nil
  # honour --basedir instead of always using the built-in default
  basedir = params["basedir"].value

  srcs.map do |src|
    File.expand_path(
      if destination
        File.join destination, clean(File.basename(src))
      else
        uri = URI.parse src.to_s
        segments = uri.path.split("/").map{|segment| clean segment}
        # Urls are scraped from an arbitrary page: drop empty, "." and ".."
        # segments (checked after decoding) so expand_path cannot climb out
        # of the download root.
        segments = segments.reject{|segment| ["", ".", ".."].include?(segment)}
        [ basedir, uri.host, segments ].flatten.compact.join(File::SEPARATOR)
      end
    )
  end
end
# Turn a (possibly percent-encoded) url path segment into a safe file name.
#
# The segment is url-decoded, every character other than
# 0-9 a-z A-Z _ @ ) ( ~ . - is replaced with "_", and runs of "_" are
# collapsed into one.
#
# basename - anything responding to to_s (nil becomes "").
#
# Returns the sanitized String.
def clean basename
  name = CGI.unescape(basename.to_s)

  # Decoding can produce bytes that are invalid in the string's encoding
  # (e.g. "%E9" from a latin-1 url), and substituting on such a string raises
  # ArgumentError. Substituting bytewise avoids that. Valid input gives the
  # same result as before: every byte of a multi-byte character becomes "_"
  # and the run is collapsed below.
  if name.respond_to?(:force_encoding)
    encoding = name.encoding
    name = name.dup.force_encoding("BINARY")
  end

  name = name.gsub(%r/[^0-9a-zA-Z_@)(~.-]/, '_').gsub(%r/_+/, '_')

  # the result is pure ascii, so it can safely carry the original encoding
  encoding ? name.force_encoding(encoding) : name
end
}
BEGIN {
  # Standard library pieces the script relies on.
  require "yaml"
  require "uri"
  require "open-uri"
  require "fileutils"
  require "cgi"

  # rubygems is optional: a failure to load it is ignored.
  begin
    require "rubygems"
  rescue LoadError
    nil
  end

  # The gems below are mandatory; if one cannot be loaded, print the
  # install hint on STDERR and exit with status 1.
  load_gem = lambda do |name, hint|
    begin
      require name
    rescue LoadError
      STDERR.puts hint
      exit 1
    end
  end
  load_gem.call "main", "gem install main"
  load_gem.call "threadify", "gem install threadify"

  # Unbuffered output.
  STDOUT.sync = true
  STDERR.sync = true

  # ctrl-c terminates the process immediately via exit!.
  Signal.trap("INT"){ exit! }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment