Created
May 9, 2010 05:51
-
-
Save jtprince/394967 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'mechanize' | |
# Scrapes the lds.org General Conference table of contents to find the
# download page of each conference, and extracts the mp3 links from a
# conference's download page.
class LDSGeneralConferenceURLFinder
  # Conference month name => month number.
  MONTH_TO_NUM = {
    'April' => 4,
    'October' => 10,
  }.freeze
  LDS_ORG = "http://www.lds.org"
  TOC_URL = "http://www.lds.org/conference/display/0,5234,23-1,00.html"
  # Patterns for mp3 hrefs that mp3_links drops by default (anything that
  # is not an individual talk).
  REJECT = {
    complete_session: /-general-session|Complete_GeneralYoungWomen|Complete.*Session|Complete.*Meeting|Complete_.*ReliefSociety|6000-general-young-women-meeting|3000-priesthood-session|[1-6]0_000\.mp3$/,
    highlights: /general-conference-highlights|Complete_ConferenceHighlights/,
    auditing: /auditing/i,
    statistics: /statistical/i,
    sustaining: /sustaining-?of-?church-?officers/i,
  }.freeze

  # a hash in the format [month_num, year_num] => url
  # (e.g. [4, 2010] => "http://www.lds.org/...")
  attr_accessor :conf_hash

  def initialize
    @agent = Mechanize.new
    @conf_hash = get_conf_hash
  end

  # Fetches +base+ and returns a hash of all conference download pages.
  # Keys are [month_num, year_num] arrays (month_num is 4 or 10) and values
  # are +lds_org_base+ prepended to the link's href.
  #
  # Only links whose text ends (after the last ", ") in 'April YYYY' or
  # 'October YYYY' are used; links that do not parse or that have no href
  # are skipped rather than aborting the whole scrape.
  def get_conf_hash(base=TOC_URL, lds_org_base=LDS_ORG)
    page = @agent.get(base)
    page.links.each_with_object({}) do |link, hash|
      key = conference_key(link.text)
      next unless key && link.href
      hash[key] = lds_org_base + link.href
    end
  end

  # takes month number (4 or 10) and the year and returns a list of mp3 urls
  # (href strings) found on that conference's download page.
  # reject is an array of regexps; any href matching one of them is dropped.
  # month and year may be given as string or integer.
  #
  # Raises ArgumentError when no download page is known for that conference.
  def mp3_links(month_num, year, reject=REJECT.values)
    url = @conf_hash[[Integer(month_num), Integer(year)]]
    raise ArgumentError, "Can't find url for: [#{month_num},#{year}]" unless url && !url.empty?
    page = @agent.get(url)
    hrefs = page.links.map { |link| link.href }
    hrefs.select do |href|
      # Links without an href (e.g. bare anchors) can never be mp3 files.
      href && href =~ /\.mp3$/ && reject.none? { |pattern| href =~ pattern }
    end
  end

  private

  # Parses link text such as 'General Conference, April 2010' into
  # [month_num, year_num]; returns nil when the segment after the last ", "
  # does not start with a known month followed by an all-digit year.
  def conference_key(text)
    label = text.to_s.strip.split(", ").last # eg 'April 2010'
    return nil unless label
    month, year = label.split(/\s+/)
    month_num = MONTH_TO_NUM[month]
    return nil unless month_num && year.to_s =~ /\A\d+\z/
    # Base 10 is explicit so a zero-padded year is never read as octal.
    [month_num, Integer(year, 10)]
  end
end
###### Example usage ######
# For every conference from April 2006 up to the current month, creates a
# "YYYY-MM" directory containing a links.txt file with one wget command per
# mp3 url returned by LDSGeneralConferenceURLFinder#mp3_links.
require 'fileutils'
require 'shellwords'

gc = LDSGeneralConferenceURLFinder.new
now = Time.now
(2006..now.year).each do |year|
  [4, 10].each do |month|
    # Skip a conference month that is still in the future this year.
    next if year == now.year && now.month < month
    dir = "%d-%0.2d" % [year, month]
    FileUtils.mkdir_p(dir)
    # Fetch before opening the file so a failed lookup does not leave an
    # empty links.txt behind.
    urls = gc.mp3_links(month, year)
    File.open(File.join(dir, "links.txt"), 'w') do |out|
      urls.each do |url|
        # The urls come from a scraped page; escape them so a quote or other
        # shell metacharacter cannot break out of the generated command.
        out.puts "wget #{Shellwords.escape(url)}"
      end
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment