Created
January 29, 2012 20:01
-
-
Save the55/1700387 to your computer and use it in GitHub Desktop.
scraper and word analysis tools for the somerville open studios site
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# Info at: http://the55.net/_12/sketch/open_studios
#
# Run SomervilleScraper.new.get_artist_ids first, then get_artist_data. Then have fun.
#
# Alphabetical artist listing page; SomervilleScraper#get_artist_ids scrapes it for artist IDs.
URL = 'http://www.somervilleopenstudios.org/artists/artist_list.php?searchType=alpha'.freeze
# Directory containing this script; the YAML cache files are read from and written to it.
DATA_PATH = File.dirname(__FILE__).freeze
require 'rubygems' | |
require 'nokogiri' | |
require 'open-uri' | |
require 'yaml' | |
# Mixin with small helpers for caching scraped data as YAML files inside DATA_PATH.
module GenericScraperTools
  # Serializes +data+ to YAML and writes it to the cache file for +name+,
  # overwriting any previous content.
  #
  # @param name [String, Symbol] cache name, with or without a ".yml" extension
  # @param data [Object] anything that responds to #to_yaml
  def write_yaml(name, data)
    File.open(yaml_file_path(name), 'w') { |f| f << data.to_yaml }
  end

  # Loads the cache file for +name+ and returns the deserialized object.
  #
  # The file is read in one go so no file handle is left open.
  #
  # @param name [String, Symbol] cache name, with or without a ".yml" extension
  # @return [Object] whatever was stored with #write_yaml
  def read_yaml(name)
    text = File.read(yaml_file_path(name))
    # NOTE(review): the unrestricted loader is used on purpose: the cache is
    # written by #write_yaml and contains Symbol keys, which the restricted
    # YAML.load of newer Psych versions rejects. Do not point this at files
    # from an untrusted source.
    YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(text) : YAML.load(text)
  end

  # Builds the absolute path of the cache file for +name+.
  #
  # A trailing ".yml" on +name+ is dropped before the extension is appended,
  # so "ids" and "ids.yml" resolve to the same file.
  #
  # @param name [String, Symbol]
  # @return [String]
  def yaml_file_path(name)
    base = name.to_s.sub(/\.yml\z/, '')
    File.join(DATA_PATH, "#{base}.yml")
  end
end
# Scrapes the Somerville Open Studios artist listing and profile pages, caches
# the results as YAML (via GenericScraperTools) and offers word-level helpers
# over the cached data.
#
# Typical use: call #get_artist_ids once, then #get_artist_data, then explore
# with #one_liners, #statement_words, #average_name, ...
class SomervilleScraper
  include GenericScraperTools

  def initialize
  end

  # Downloads the artist listing at URL, extracts the distinct artistID values
  # from the links inside ".ArtistRow2" elements and stores them in the
  # "artist_ids" cache file.
  #
  # Network errors are not rescued here and propagate to the caller.
  #
  # @return [Array<String>] the artist IDs, in page order, without duplicates
  def get_artist_ids
    page = Nokogiri::HTML(open_url(URL))
    ids = page.css('.ArtistRow2 a').map do |a|
      a.attribute('href').to_s[/artistID=(\d+)/, 1]
    end
    ids = ids.compact.uniq
    # Pass the array itself: write_yaml takes exactly (name, data).
    write_yaml('artist_ids', ids)
    @artist_ids = ids
  end

  # @return [Object] the cached artist IDs, loaded from "artist_ids.yml" on first use
  def artist_ids
    @artist_ids ||= read_yaml('artist_ids')
  end

  # Downloads and parses the profile page of one artist.
  #
  # Best effort: any StandardError while fetching or parsing is reported on
  # stdout and turned into nil so a long scraping run keeps going.
  #
  # @param id [String, Integer] artist ID as found by #get_artist_ids
  # @return [Nokogiri::HTML::Document, nil]
  def get_artist_page(id)
    url = "http://www.somervilleopenstudios.org/artists/artist_profile.php?artistID=#{id}"
    Nokogiri::HTML(open_url(url))
  rescue StandardError
    puts "PAGE ERROR: #{url}"
    nil
  end

  # @return [Object] the cached artist records, loaded from "artist_data.yml" on first use
  def artist_data
    @artist_data ||= read_yaml('artist_data')
  end

  # Fetches every artist profile listed in #artist_ids, extracts name,
  # one-liner and statement, prints progress, and writes the collected
  # records to the "artist_data" cache file.
  #
  # Pages that cannot be fetched or that lack the expected markup are
  # reported and skipped instead of aborting the run.
  #
  # @return [Object] the return value of #write_yaml
  def get_artist_data
    puts "\n --- Artists to go: #{artist_ids.size} ---\n"
    collected = []
    artist_ids.each_with_index do |id, i|
      page = get_artist_page(id)
      next unless page

      record = parse_artist(page)
      unless record
        puts "PARSE ERROR: artist #{id}"
        next
      end
      collected << record
      puts "#{i} #{record[:name]}: #{record[:one_liner]}"
    end
    @artist_data = collected
    write_yaml 'artist_data', collected
  end

  # @return [Array] the :one_liner of every cached artist record
  def one_liners
    all_the :one_liner
  end

  # @return [Array<String>] all one-liners, lower-cased and split on whitespace
  def one_liner_words
    one_liners.join(' ').downcase.split
  end

  # @return [Array] the :name of every cached artist record
  def names
    all_the :name
  end

  # @return [Array] the :statement of every cached artist record
  def statements
    all_the :statement
  end

  # Splits all statements into lower-cased words, or into overlapping word
  # groups when +in_bits+ is positive.
  #
  # Newlines, double quotes, periods, slashes, commas and parentheses are
  # treated as separators. Curly double quotes and non-breaking spaces are
  # also separators, and curly apostrophes become "'".
  # NOTE(review): the original matched the raw bytes \302, \223, \224 and \222,
  # presumably the UTF-8 / Windows-1252 encodings of those characters. The
  # code points below are the closest equivalent that is valid in a UTF-8
  # regex; confirm against real scraped data.
  #
  # @param in_bits [Integer] words per group; 0 (default) returns single words
  # @return [Array<String>] words, or one space-joined group per start
  #   position (groups near the end are shorter)
  def statement_words(in_bits = 0)
    text = statements.join(' ')
    text = text.gsub(%r{[\n"./,\r()\u00A0\u0093\u0094\u201C\u201D]}, ' ')
    text = text.gsub(/[\u0092\u2019]/, "'")
    words = text.downcase.split

    group_size = in_bits.to_i
    return words unless group_size > 0

    (0...words.size).map { |i| words[i, group_size].join(' ') }
  end

  # @param param [Symbol] record key
  # @return [Array] the value stored under +param+ in every cached artist record
  def all_the(param)
    artist_data.map { |artist| artist[param] }
  end

  # Counts, for every character position, how often each lower-cased letter
  # occurs at that position across all artist names.
  #
  # @return [Array<Hash{String=>Integer}>] index = character position
  def average_name
    names.map(&:downcase).each_with_object([]) do |name, letters|
      name.each_char.with_index do |letter, i|
        counts = (letters[i] ||= {})
        counts[letter] = counts.fetch(letter, 0) + 1
      end
    end
  end

  private

  # Opens +url+ for reading through open-uri. URI.open is used where it exists
  # because Kernel#open stopped handling URLs in Ruby 3.0.
  def open_url(url)
    URI.respond_to?(:open) ? URI.open(url) : open(url)
  end

  # Extracts one artist record from a parsed profile page.
  # Returns nil when the heading, subtitle or statement node is missing.
  def parse_artist(page)
    heading = page.css('#artist_content h1').first
    subtitle = page.css('#artist_content .ArtistSubTl').first
    statement = page.css('#artist_content td[width="285"] table tr:nth-child(2) td').first
    return nil unless heading && subtitle && statement

    # The one-liner is the text between the subtitle's opening tag and its
    # first <br>; it is taken from the serialized HTML, so entities are still
    # escaped at this point.
    one_liner = subtitle.to_s.split(/<br>/)[0].to_s.split(/>/)[1].to_s.strip

    # Drop nested tables and turn <br> into newlines before reading the text.
    statement.search('table').remove
    statement.search('br').each { |br| br.replace("\n") }

    {
      :name => heading.content.strip,
      :one_liner => one_liner.gsub('&amp;', '&'),
      :statement => statement.content.strip
    }
  end
end
# Word-statistics helpers added to Array (the text helpers expect an array of strings).
class Array
  # Builds the "average" word: for every character position, the lower-cased
  # letter that occurs most often at that position across all elements.
  # Ties are broken arbitrarily.
  #
  # @return [String] as long as the longest element; "" for an empty array
  def average_of_words
    letters = []
    map(&:downcase).each do |word|
      word.each_char.with_index do |letter, i|
        counts = (letters[i] ||= Hash.new(0))
        counts[letter] += 1
      end
    end
    letters.map { |counts| counts.max_by { |_, count| count }.first }.join
  end

  # @return [Integer] mean of the elements' #length, rounded; 0 for an empty array
  def average_length
    return 0 if empty?

    (inject(0) { |sum, el| sum + el.length }.to_f / size).round
  end

  # @return [Float] arithmetic mean of the elements (NaN for an empty array)
  def average
    inject(0) { |sum, el| sum + el }.to_f / size
  end

  # Counts case-insensitive occurrences of each element.
  #
  # @return [Array<Array(String, Integer)>] [lower-cased element, count] pairs,
  #   most frequent first
  def with_frequency
    counts = each_with_object(Hash.new(0)) { |word, tally| tally[word.downcase] += 1 }
    counts.sort_by { |_, count| count }.reverse
  end

  # Fallback for Rubies without a built-in Array#shuffle!. It is only defined
  # when the core method is missing, so the core version (and its +random:+
  # option) is never overridden.
  unless method_defined?(:shuffle!)
    def shuffle!
      # Repeatedly move a random element of the not-yet-shuffled prefix to the end.
      size.downto(1) { |n| push delete_at(rand(n)) }
      self
    end
  end
end
class String
  # Yields every character of the string together with its zero-based index.
  #
  # @yieldparam char [String] a single character
  # @yieldparam index [Integer] position of +char+ in the string
  # @return [Array<String>] the characters when a block is given;
  #   an Enumerator over [char, index] pairs when no block is given
  def each_char_with_index
    return enum_for(:each_char_with_index) unless block_given?

    each_char.to_a.each_with_index { |char, index| yield char, index }
  end
end
# Chains phrases into one string, domino style: starting from a random phrase,
# it keeps appending a random remaining phrase whose first word equals the
# current last word (that shared word is not repeated), until no remaining
# phrase fits.
#
# The argument is not modified. When a phrase is used, every element equal to
# it is removed from the pool of remaining phrases.
#
# @param array [Array<String>] phrases of whitespace-separated words
# @return [String, nil] the chained text; nil when +array+ is empty
def string_em_together(array)
  pool = array.dup
  return nil if pool.empty?

  str = pool.delete_at(rand(pool.size))
  loop do
    tail = str.split.last
    # One scan per round; the pick below supplies the randomness.
    candidates = pool.select { |phrase| phrase.split.first == tail }
    break if candidates.empty?

    add = candidates[rand(candidates.size)]
    pool.delete(add)
    str = str + ' ' + add.split.drop(1).join(' ')
  end
  str
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment