Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Scraper and word-analysis tools for the Somerville Open Studios site
#
# info at: http://the55.net/_12/sketch/open_studios
#
# Usage: run SomervilleScraper.new.get_artist_ids first, then get_artist_data, then have fun.
#
# Artist listing page (queried with searchType=alpha); get_artist_ids pulls
# the artistID values out of the links on this page.
URL = 'http://www.somervilleopenstudios.org/artists/artist_list.php?searchType=alpha'
# Directory containing this script; yaml_file_path joins YAML file names onto it.
DATA_PATH = File.dirname(__FILE__)
require 'rubygems'
require 'nokogiri'
require 'open-uri'
require 'yaml'
# YAML persistence helpers for scrapers. Every file lives directly inside
# DATA_PATH and is addressed by a short name ("artist_ids" -> "artist_ids.yml").
module GenericScraperTools
  # Serializes +data+ as YAML into the file for +name+, replacing any
  # previous content.
  #
  # @param name [String, Symbol] short file name, with or without ".yml"
  # @param data [Object] anything that responds to #to_yaml
  # @return [File] the (already closed) file handle, as before
  def write_yaml(name, data)
    File.open(yaml_file_path(name), 'w') { |f| f << data.to_yaml }
  end

  # Loads and returns the YAML document stored for +name+.
  #
  # The block form of File.open closes the handle once parsing finishes;
  # the file used to stay open until garbage collection.
  #
  # NOTE(review): YAML.load is kept so previously written files (including
  # symbol-keyed hashes) still load; only use it on files this tool wrote.
  #
  # @param name [String, Symbol] short file name, with or without ".yml"
  # @return [Object] the deserialized data
  # @raise [Errno::ENOENT] when the file has not been written yet
  def read_yaml(name)
    File.open(yaml_file_path(name)) { |f| YAML.load(f) }
  end

  # Builds the absolute path of the YAML file for +name+.
  #
  # A trailing ".yml" is stripped before the extension is appended, so both
  # "artist_ids" and "artist_ids.yml" map to the same file. (The previous
  # pattern contained literal quote characters and therefore never matched.)
  #
  # @param name [String, Symbol]
  # @return [String]
  def yaml_file_path(name)
    File.join(DATA_PATH, "#{name.to_s.sub(/\.yml\z/, '')}.yml")
  end
end
# Scrapes the Somerville Open Studios artist pages and exposes simple
# word-level views over the scraped data.
#
# Workflow: #get_artist_ids writes artist_ids.yml, #get_artist_data reads
# those ids and writes artist_data.yml, and the remaining readers (#names,
# #one_liners, #statement_words, ...) work from artist_data.yml.
class SomervilleScraper
  include GenericScraperTools

  # Characters turned into word separators by #statement_words.
  # U+0093/U+0094 and U+201C/U+201D are curly double quotes; presumably the
  # first pair comes from Windows-1252 text decoded as Latin-1 (TODO confirm
  # against freshly scraped data). The original pattern listed the raw bytes
  # (\302, \223, \224), which Ruby 1.9+ rejects as invalid multibyte escapes.
  STATEMENT_SEPARATORS = /[\n"\.\/,\r()\u0093\u0094\u201C\u201D]/

  # Curly apostrophes that #statement_words normalizes to a plain "'".
  STATEMENT_APOSTROPHES = /[\u0092\u2019]/

  def initialize
  end

  # Downloads the artist listing at URL, collects the distinct artistID
  # values found in the '.ArtistRow2 a' links (first-seen order) and stores
  # them in artist_ids.yml.
  #
  # @return [Array<String>] the scraped ids
  def get_artist_ids
    # URI.open is required on Ruby 3.0+, where Kernel#open no longer
    # follows URLs.
    page = Nokogiri::HTML(URI.open(URL))
    ids = page.css('.ArtistRow2 a')
              .map { |a| a.attribute('href').to_s[/artistID\=(\d+)/, 1] }
              .compact
              .uniq
    # Pass the array itself: splatting it made write_yaml raise
    # ArgumentError whenever more than one id was found.
    write_yaml('artist_ids', ids)
    @artist_ids = ids
  end

  # Ids stored by #get_artist_ids (read once from artist_ids.yml, then cached).
  def artist_ids
    @artist_ids ||= read_yaml('artist_ids')
  end

  # Fetches and parses one artist profile page.
  #
  # @param id [String, Integer] artist id
  # @return [Nokogiri::HTML::Document, nil] nil (after printing a
  #   "PAGE ERROR" line) when the page could not be fetched or parsed
  def get_artist_page(id)
    url = "http://www.somervilleopenstudios.org/artists/artist_profile.php?artistID=#{id}"
    begin
      Nokogiri::HTML(URI.open(url))
    rescue StandardError
      puts "PAGE ERROR: #{url}"
      nil
    end
  end

  # Records stored by #get_artist_data (read once from artist_data.yml,
  # then cached).
  def artist_data
    @artist_data ||= read_yaml('artist_data')
  end

  # Scrapes every artist listed in #artist_ids and writes the resulting
  # records ({:name, :one_liner, :statement}) to artist_data.yml.
  # Pages that cannot be fetched or that lack an artist name are reported
  # and skipped instead of aborting the whole run.
  #
  # @return [File] whatever write_yaml returns, as before
  def get_artist_data
    ids = artist_ids
    puts "\n --- Artists to go: #{ids.size} ---\n"
    data = []
    ids.each_with_index do |id, i|
      page = get_artist_page(id)
      next unless page

      record = extract_artist(page)
      unless record
        puts "PARSE ERROR: artist #{id}"
        next
      end
      data << record
      puts "#{i} #{record[:name]}: #{record[:one_liner]}"
    end
    # Keep the in-memory cache in step with what is written to disk.
    @artist_data = data
    write_yaml 'artist_data', data
  end

  # @return [Array<String>] the one-line description of every artist
  def one_liners
    all_the :one_liner
  end

  # @return [Array<String>] all one-liner words, lower-cased
  def one_liner_words
    one_liners.join(' ').downcase.split
  end

  # @return [Array<String>] every artist name
  def names
    all_the :name
  end

  # @return [Array<String>] every artist statement
  def statements
    all_the :statement
  end

  # Splits all statements into lower-cased words, treating punctuation
  # listed in STATEMENT_SEPARATORS as whitespace.
  #
  # @param in_bits [Integer] when positive, return one entry per word
  #   holding that word plus the following in_bits-1 words (entries near
  #   the end are shorter); otherwise return the plain word list
  # @return [Array<String>]
  def statement_words(in_bits = 0)
    words = statements.join(' ')
                      .gsub(STATEMENT_SEPARATORS, ' ')
                      .gsub(STATEMENT_APOSTROPHES, "'")
                      .downcase
                      .split
    span = in_bits.to_i
    return words unless span > 0

    # Slicing from the word itself avoids the trailing space the old
    # "#{word} #{rest}" interpolation left on short entries.
    words.each_index.map { |i| words[i, span].join(' ') }
  end

  # Collects one attribute from every stored artist record.
  #
  # @param param [Symbol] :name, :one_liner or :statement
  # @return [Array]
  def all_the(param)
    artist_data.map { |artist| artist[param] }
  end

  # Counts, per character position, how often each (lower-cased) letter
  # occurs across all artist names.
  #
  # @return [Array<Hash{String => Integer}>] element i maps letter => count
  #   at position i
  def average_name
    letters = []
    # Downcase into a local: #names builds a new array on every call, so the
    # earlier names.map!(&:downcase) was discarded before the loop ran.
    names.map(&:downcase).each do |name|
      name.each_char.with_index do |letter, i|
        counts = (letters[i] ||= {})
        counts[letter] = counts.fetch(letter, 0) + 1
      end
    end
    letters
  end

  private

  # Builds the record for one profile page, or nil when no artist name
  # heading is present. The name is read last, after the statement node has
  # been cleaned up, matching the original evaluation order.
  def extract_artist(page)
    one_liner = extract_one_liner(page)
    statement = extract_statement(page)
    heading = page.css('#artist_content h1').first
    return nil unless heading

    {
      :name => heading.content.strip,
      :one_liner => one_liner,
      :statement => statement
    }
  end

  # Text between the opening tag of the first '.ArtistSubTl' element and
  # its first <br>, with '&amp;' decoded; '' when the element is missing.
  def extract_one_liner(page)
    html = page.css('#artist_content .ArtistSubTl').first.to_s
    text = html.split(/<br>/).first.to_s.split(/>/)[1]
    text.to_s.strip.gsub('&amp;', '&')
  end

  # Statement cell content with nested tables removed and <br> turned into
  # newlines; '' when the cell is missing.
  def extract_statement(page)
    cell = page.css('#artist_content td[width="285"] table tr:nth-child(2) td').first
    return '' unless cell

    cell.search('table').remove
    cell.search('br').each { |br| br.replace("\n") }
    cell.content.strip
  end
end
# Word/number statistics added to every Array (core-class extension).
class Array
  # Builds a "typical" word: for each character position, the letter that
  # occurs most often at that position across all (lower-cased) elements.
  # On a tie the letter seen first wins.
  #
  # @return [String] '' for an empty array
  def average_of_words
    letters = []
    each do |word|
      word.downcase.each_char.with_index do |letter, i|
        (letters[i] ||= Hash.new(0))[letter] += 1
      end
    end
    letters.map { |counts| counts.max_by { |_, count| count }.first }.join
  end

  # Mean element length, rounded to the nearest integer.
  #
  # @return [Integer] 0 for an empty array (0.0 / 0 is NaN, and rounding NaN
  #   raises FloatDomainError)
  def average_length
    return 0 if empty?

    (inject(0) { |total, el| total + el.length }.to_f / size).round
  end

  # Arithmetic mean of the elements.
  #
  # @return [Float] NaN for an empty array
  def average
    inject(0) { |total, el| total + el }.to_f / size
  end

  # Case-insensitive frequency table.
  #
  # @return [Array<Array(String, Integer)>] [lower-cased word, count] pairs,
  #   most frequent first
  def with_frequency
    counts = each_with_object(Hash.new(0)) { |word, tally| tally[word.downcase] += 1 }
    counts.sort_by { |_, count| count }.reverse
  end

  # Fallback for interpreters without a native Array#shuffle!. When the core
  # method exists it is left alone: redefining it replaced a linear shuffle
  # with a quadratic one and broke calls such as shuffle!(random: rng).
  unless method_defined?(:shuffle!)
    # Shuffles the array in place and returns self.
    def shuffle!
      size.downto(1) { |n| push delete_at(rand(n)) }
      self
    end
  end
end
class String
  # Yields every character of the string together with its zero-based
  # position, and returns the array of characters.
  def each_char_with_index
    split(//).each_with_index do |char, position|
      yield char, position
    end
  end
end
# Chains phrases into one string: starts from a randomly chosen phrase and
# keeps appending a random remaining phrase whose first word equals the
# current last word (the shared word is not repeated), until none fits.
# The argument itself is not modified.
#
# @param array [Array<String>] candidate phrases
# @return [String] the chained phrase; '' when +array+ is empty (this used
#   to raise NoMethodError on nil)
def string_em_together(array)
  pool = array.dup
  return '' if pool.empty?

  pool.shuffle!
  str = pool.shift
  loop do
    last_word = str.split.last
    # The pool is freshly shuffled, so the first match is a random match;
    # one scan replaces the former select-then-select-again.
    add = pool.find { |phrase| phrase.split.first == last_word }
    break unless add

    # Array#delete drops every phrase equal to the one just used, as before.
    pool.delete(add)
    str = str + ' ' + add.split.drop(1).join(' ')
    pool.shuffle!
  end
  str
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.