Skip to content

Instantly share code, notes, and snippets.

@arjunvenkat
Created December 12, 2012 19:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arjunvenkat/4270589 to your computer and use it in GitHub Desktop.
Save arjunvenkat/4270589 to your computer and use it in GitHub Desktop.
Scraper that saves chord information for songs on Ultimate Guitar.
require 'open-uri'
require 'csv'
require 'mechanize' # the gem's entry file is lowercase; 'Mechanize' only loads on case-insensitive filesystems
require 'awesome_print'

# Scrapes ultimate-guitar.com for chord data.
#
# For every letter in +letter_array+ the script walks the paginated band
# listing, follows each band link, walks the band's paginated song listing and,
# for every well-rated "Chords" entry, opens the chord page and collects the
# distinct chord names. One CSV file is written per band-listing page, named
# "<letter>_bands_pg<page>.csv".

# Column titles for the generated CSV files (one title per field of a song row).
CSV_HEADER = ['Band Name', 'Song Title', 'Song Chords', 'Rating',
              'Number of Ratings', 'Chord Page Link'].freeze

# Builds the URL of a band-listing page for a letter.
# Page 1 has no numeric suffix ("b.htm"); later pages append the page number
# ("b8.htm"), so the URL can never drift out of sync with the page counter.
def letter_listing_url(letter, page_number)
  suffix = page_number > 1 ? page_number.to_s : ''
  "http://www.ultimate-guitar.com/bands/#{letter}#{suffix}.htm"
end

# Clicks the "Next" pagination link of +page+.
# Returns the next Mechanize page, or nil when +page+ is the last one.
def follow_next_link(page)
  next_link = page.link_with(:text => /^Next/)
  next_link ? next_link.click : nil
end

# Reads the star rating and the number of ratings from a song row.
# Returns [rating, num_ratings]; both are 0 when the row has no reviews
# (no "[ n ]" marker in the second cell).
def parse_rating(song_row)
  rating_text = song_row.css('td:nth-child(2)').text
  return [0, 0] unless rating_text.include?('[')

  # The star count is encoded in the image file name, e.g. ".../r5.gif".
  star_image = song_row.css('td:nth-child(2) img').first
  star_src = star_image ? star_image['src'].to_s : ''
  rating = star_src.rpartition('/r')[2].rpartition('.gif')[0].to_i
  # The number of ratings is shown as "[ n ]".
  num_ratings = rating_text.rpartition('[ ')[2].rpartition(' ]')[0].to_i
  [rating, num_ratings]
end

# Fetches one chord page and returns the CSV row for that song:
# [band name, song title, distinct chords, rating, number of ratings, url].
def scrape_song(agent, band_name, chord_page_link, rating, num_ratings)
  song_page = agent.get(chord_page_link)
  # Strip the trailing " Chords" from the page heading to get the song title.
  song_title = song_page.search('td.fs-10 h1').text.rpartition(' Chords')[0]
  # Chords are wrapped in <span> tags inside the <pre> block. Other content
  # may be wrapped in spans as well, so treat the result with some care.
  chords = song_page.search('pre').css('span').map(&:text).uniq
  [band_name, song_title, chords, rating, num_ratings, chord_page_link]
end

# Walks every page of a band's song listing and returns the CSV rows for all
# "Chords" entries rated above 3 stars by more than 2 people.
def scrape_band(agent, band_name, band_page_url)
  band_songs = []
  band_page = agent.get(band_page_url)
  while band_page
    song_table = band_page.search('table[cellspacing="0"][cellpadding="2"]')
    song_table.css('tr').each do |song_row|
      # Only rows whose third cell reads "Chords" link to a chord page.
      next unless song_row.css('td:nth-child(3)').text == 'Chords'

      rating, num_ratings = parse_rating(song_row)
      next unless rating > 3 && num_ratings > 2

      song_link = song_row.css('td:nth-child(1) > a').first
      chord_page_link = song_link && song_link['href']
      next unless chord_page_link # row without a usable link: nothing to fetch

      puts chord_page_link
      next if chord_page_link.include?('lyricsmode')

      song = scrape_song(agent, band_name, chord_page_link, rating, num_ratings)
      band_songs << song
      puts "#{song[1]} saved"
    end
    band_page = follow_next_link(band_page)
  end
  band_songs
end

# Writes the header plus one line per song row to +path+.
def write_songs_csv(path, songs)
  CSV.open(path, 'wb') do |csv|
    csv << CSV_HEADER
    songs.each { |song| csv << song }
  end
end

# Full set of listing letters; swap it in below to scrape the whole site.
# letter_array = ['0-9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
# 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
# 'u', 'v', 'w', 'x', 'y', 'z']
letter_array = ['b']

# Band-listing page to start from for each letter. Currently 8 (resuming a
# partial run); set to 1 to start from the beginning of a letter.
start_page = 8

letter_array.each do |letter|
  bands_done = 0
  page_number = start_page
  agent = Mechanize.new
  letter_page = agent.get(letter_listing_url(letter, page_number))

  while letter_page
    # Table holding the band names/links for this listing page.
    band_table = letter_page.search('td[style="padding:8px"] table[cellpadding="2"]')
    puts "page: #{page_number}"
    songs = [] # rows collected for this listing page only

    band_table.css('tr').each_with_index do |band_row, row_index|
      next if row_index.zero? # the first row in the table is blank

      band_links = band_row.css('td:nth-child(2) > a')
      next if band_links.empty? # row without a band link

      puts bands_done
      # next if bands_done > 10 # uncomment to limit data while testing
      band_name = band_links.text.rpartition(' Tabs')[0] # drops the trailing " Tabs"
      band_page_url = band_links.first['href']
      next unless band_page_url
      next if band_page_url.include?('tp.ultimate') || band_page_url.include?('lyricsmode')

      puts "Checking #{band_name} at #{band_page_url}"
      songs.concat(scrape_band(agent, band_name, band_page_url))
      puts "#{band_name} completed \n "
      bands_done += 1
    end

    write_songs_csv("#{letter}_bands_pg#{page_number}.csv", songs)
    letter_page = follow_next_link(letter_page)
    page_number += 1
  end

  puts "CSV saved for #{letter} bands"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment