Skip to content

Instantly share code, notes, and snippets.

@adamburmister
Created April 9, 2010 12:05
Show Gist options
  • Save adamburmister/361093 to your computer and use it in GitHub Desktop.
Save adamburmister/361093 to your computer and use it in GitHub Desktop.
Example script that crawls the music site hypem.com for MP3s.
#!/usr/bin/ruby
require "rubygems"
require "hpricot"
require "open-uri"
require "builder"
require "httpclient"
require "yaml"
require "cgi"
module Flog
class Radio
CRAWL_CONFIG = YAML.load_file(File.join(File.dirname(__FILE__), '/flog_radio_config.yml'))
# Runs the whole pipeline as a side effect of construction: announces the
# configured site, fetches its page, extracts the track links, downloads
# them, and finally pushes the downloaded files to the WebDAV share.
def initialize
  site_url = CRAWL_CONFIG['website_url']
  puts "Downloading MP3s from #{site_url}"

  page = grab_page(site_url)
  track_urls = scrape_mp3_urls(page)
  download_mp3s(track_urls)

  upload_to_webdav
end
# Fetches +url+ with the configured User-Agent header and parses the
# response body with Hpricot.
#
# @param url [String] address of the page to fetch
# @return the document object returned by Hpricot()
def grab_page(url)
  headers = { "User-Agent" => CRAWL_CONFIG['user_agent'] }

  # Since Ruby 3.0, Kernel#open no longer handles URLs even with open-uri
  # loaded; URI.open is the supported entry point. Fall back to the bare
  # open on interpreters whose open-uri predates URI.open.
  # The block form closes the underlying IO/Tempfile once parsing is done.
  if URI.respond_to?(:open)
    URI.open(url, headers) { |io| Hpricot(io) }
  else
    open(url, headers) { |io| Hpricot(io) }
  end
end
# Collects the href of every link under "#recently-posted" whose markup
# carries the site's "/go/track/<id>" onmousedown handler.
#
# @param html a parsed document that supports the `/` selector operator
# @return [Array] the distinct hrefs in document order; empty when no link
#   matches
def scrape_mp3_urls(html)
  track_links = (html/"#recently-posted a").select do |el|
    el.to_html =~ /onmousedown=\"this.href='\/go\/track\/\d*'; return false;\"/
  end

  # Non-destructive uniq: uniq! returns nil when nothing was removed, which
  # previously handed nil to the caller whenever the page had no duplicates.
  track_links.map { |el| el.attributes['href'] }.uniq
end
# Runs wget once per URL, saving any .mp3 files it finds into the configured
# download directory (resolved relative to this script's directory).
#
# @param urls [Array<String>, nil] page URLs to hand to wget; nil is treated
#   as an empty list
# @return [Array] the list that was iterated
def download_mp3s(urls = [])
  target_dir = File.join(File.dirname(__FILE__), CRAWL_CONFIG['download_dir'])
  user_agent = CRAWL_CONFIG['user_agent'].to_s

  Array(urls).each do |url|
    # The URLs come from scraped HTML, so they must never reach a shell.
    # The argument-list form of system execs wget directly; "--" stops a
    # URL that starts with "-" from being read as an option.
    # wget's exit status is deliberately ignored, as before.
    system('wget', '-r', '-l1', '-H', '-t1', '-nd', '-N', '-np', '-A.mp3',
           '-erobots=off', '-U', user_agent, '-P', target_dir, '--', url.to_s)
  end
end
# Uploads every downloaded .mp3 to the configured WebDAV location with an
# HTTP PUT, oldest file (by mtime) first.
#
# The remote name is the CGI-escaped basename with "+" replaced by "_",
# appended directly to webdav_url.
# NOTE(review): this presumably requires webdav_url to end in "/" - confirm
# against the config file.
#
# @return [Array<String>] local paths of the files that were uploaded
def upload_to_webdav
  username = CRAWL_CONFIG['webdav_username']
  password = CRAWL_CONFIG['webdav_password']
  url = URI.parse(CRAWL_CONFIG['webdav_url'])

  # Look where download_mp3s saves files (next to this script), not in
  # whatever directory the process happens to be started from.
  local_dir = File.join(File.dirname(__FILE__), CRAWL_CONFIG['download_dir'])

  client = HTTPClient.new
  # Credentials are the same for every file, so register them once.
  client.set_auth(url, username, password)

  # Skip anything that is not a regular file (e.g. a directory named
  # "x.mp3"), which could not be read and uploaded anyway.
  mp3s = Dir.glob(File.join(local_dir, '*.mp3'))
            .select { |path| File.file?(path) }
            .sort_by { |path| File.mtime(path) }

  mp3s.each do |mp3|
    dst = "#{url}#{CGI.escape(File.basename(mp3)).gsub('+', '_')}"
    puts "Uploading to #{dst}"
    # Binary mode keeps the audio bytes intact; the block form closes the
    # file handle as soon as it has been read.
    body = File.open(mp3, 'rb') { |file| file.read }
    client.put(dst, body)
  end
end
end
end
# Script entry point: constructing a Radio performs the crawl, download and
# upload, so the instance itself is not kept.
Flog::Radio.new
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment