Skip to content

Instantly share code, notes, and snippets.

@ganine
Created October 22, 2012 03:44
Show Gist options
  • Save ganine/3929520 to your computer and use it in GitHub Desktop.
Save ganine/3929520 to your computer and use it in GitHub Desktop.
require 'fileutils'
require 'json'
require 'open-uri'

require 'nokogiri'
# Scraper for the artist directory served under DOMAIN.
#
# Output is written as JSON files below an "artists/" directory that is
# resolved relative to the current working directory:
#
# * PsyCrawler.get_artists writes one listing file per index page.
# * PsyCrawler.get_data writes one detail file per artist.
module PsyCrawler
  # Base URL that index-page paths and scraped artist links are appended to.
  DOMAIN = "http://www.psydb.net".freeze

  # Detail-table keys (already downcased) that get_data never copies into
  # the artist hash.
  IGNORED_DATA = %w[name active tracks remixes updated].freeze

  # Index pages requested by get_artists: "0" followed by "a".."z".
  INDEXES = ['0', *('a'..'z')].freeze

  # Captures (1) the site-relative artist page path and (2) the artist name
  # from the serialized markup of a listing cell.
  ARTIST_LINK = %r{<a href="(/artists/[0a-z]/.+\.php)">(.+)</a>}

  # Cells of an index page that are searched for an artist link.
  ARTIST_CELLS_XPATH = '//*[@id="mainDiv"]/table/tr[2]/td[2]/table[2]//tr//td[3]'.freeze

  # Rows of an artist page that are searched for key/value cells.
  DETAIL_ROWS_XPATH = '//*[@id="mainDiv"]/table/tr[2]/td[2]/table[2]/tr[2]/td/table//tr'.freeze

  private_constant :INDEXES, :ARTIST_LINK, :ARTIST_CELLS_XPATH, :DETAIL_ROWS_XPATH

  # Fetches every index page and writes the artists found on it to
  # "artists/<index>/<index>_<count>.json" as a JSON array of
  # {name, link} objects.
  #
  # @return [Array<String>] the list of index identifiers that were crawled
  def self.get_artists
    INDEXES.each do |i|
      page = fetch_page("#{DOMAIN}/artists/#{i}/")

      # Only the first link in each cell is used; cells without a matching
      # link yield nil and are dropped by compact.
      artists = page.xpath(ARTIST_CELLS_XPATH).map do |td|
        td.to_s.match(ARTIST_LINK) { |m| { name: m[2], link: "#{DOMAIN}#{m[1]}" } }
      end.compact

      write_json("artists/#{i}", "#{i}_#{artists.size}", artists)
    end
  end

  # Fetches the page at artist[:link], merges its key/value rows into
  # +artist+ (mutating it) and writes the result to
  # "artists/<index>/<file name>.json".
  #
  # The file name is the downcased artist name with whitespace and "/"
  # replaced by "_", so a name can never escape the target directory.
  #
  # @param index [String] sub-directory of "artists/" to write into
  # @param artist [Hash] must provide :name and :link; receives one extra
  #   symbol key per scraped row whose key is not listed in IGNORED_DATA
  # @return [Integer] number of bytes written to the JSON file
  def self.get_data(index, artist)
    fetch_page(artist[:link]).xpath(DETAIL_ROWS_XPATH).each do |tr|
      key_cell = tr.at_css("td.TBB")
      next if key_cell.nil?

      key = key_cell.content.downcase
      next if IGNORED_DATA.include?(key)

      # A row with a key cell but no value cell is stored as nil.
      artist[key.to_sym] = tr.at_css("td.TB")&.content
    end

    file_name = artist[:name].gsub(%r{[\s/]}, '_').downcase
    write_json("artists/#{index}", file_name, artist)
  end

  # Downloads +url+ and parses it as HTML. URI.open is used instead of the
  # bare Kernel#open so that the string can only ever be treated as a URL,
  # and the block form closes the download handle once parsing is done.
  def self.fetch_page(url)
    URI.open(url) { |io| Nokogiri::HTML(io) }
  end

  # Serializes +data+ as JSON into "<dir_name>/<file_name>.json", creating
  # the directory and any missing parents first. Returns the byte count
  # reported by the write.
  def self.write_json(dir_name, file_name, data)
    FileUtils.mkdir_p(dir_name)
    File.open(File.join(dir_name, "#{file_name}.json"), "w+") do |file|
      puts "writing: #{file_name}"
      file.write(data.to_json)
    end
  end

  private_class_method :fetch_page, :write_json
end
# Script entry point: crawl every artist index page as soon as this file is loaded.
PsyCrawler.get_artists
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment