Created
March 13, 2022 14:45
-
-
Save doriantaylor/80cf11f8fd6b9ef1a3dcec8dd232341b to your computer and use it in GitHub Desktop.
scrape university of toronto pool schedule
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'date' | |
require 'time' | |
require 'mechanize' | |
# do this or it complains | |
require 'active_support/isolated_execution_state' | |
# wah wah wah crybaby | |
require 'icalendar' | |
require 'icalendar/tzinfo' | |
require 'icalendar-recurrence' | |
# Scrapes the schedule tables out of a web page and converts them into
# an iCalendar document of weekly-recurring events.
#
# Usage:
#   ical = Scrapinator.new('America/Toronto').run(Date.new(2022, 1, 31), url)
#   print ical.to_ical
class Scrapinator
  # Characters accepted between the two halves of a time range:
  # ASCII hyphen, en dash and em dash.
  RANGE_SEPARATOR = /[-\u2013\u2014]/.freeze
  # An am/pm marker, in any letter case.
  MERIDIEM = /[ap]m/i.freeze
  # Hour and optional minutes of a clock time such as "7" or "7:30".
  CLOCK = /(\d{1,2})(?::(\d{2}))?/.freeze
  private_constant :RANGE_SEPARATOR, :MERIDIEM, :CLOCK

  # @param tz [String] time zone identifier handed to TZInfo when the
  #   calendar is built (the script passes 'America/Toronto')
  def initialize(tz)
    @tzid = tz
    @mech = Mechanize.new
  end

  # Fetches +url+, scrapes every table on the page and returns the
  # resulting calendar.
  #
  # @param basis [Date] first day of the seven-day window used to give
  #   each weekday in the schedule a concrete date
  # @param url [String] address of the page holding the schedule tables
  # @return [Icalendar::Calendar]
  def run(basis, url)
    page = @mech.get(url)

    tables = page.search('//table').map { |table| scrape_table(table) }
    # diagnostic dump of the raw scrape on stderr
    warn tables.inspect

    make_ical(basis, convert_arrays(tables))
  end

  private

  # Turns a table node into an array of rows, each row an array of cell
  # texts. Non-breaking spaces are collapsed to plain spaces and empty
  # cells become nil.
  def scrape_table(table)
    table.xpath('tbody/tr').map do |tr|
      tr.elements.map do |td|
        text = td.text.gsub(/\u{a0}+/, ' ').strip
        text unless text.empty?
      end
    end
  end

  # Converts scraped tables into
  #   { "Weekday" => [[start, finish, description, location], ...] }
  #
  # Only tables whose first row has exactly four cells are considered.
  # Rows that cannot yield an event (no weekday seen yet, or no usable
  # time range) are skipped rather than raising.
  def convert_arrays(tables)
    out = {}

    tables.each do |rows|
      next unless rows.first && rows.first.length == 4

      today = nil
      rows.each do |row|
        # a row of all nils is only a visual separator between days
        next if row.all?(&:nil?)

        day, time, desc, location = row
        if day
          # The very first weekday cell sits on the header row, whose
          # other cells are column labels we can discard. Subsequent
          # weekday cells sit next to that day's first real entry.
          header = today.nil?
          today = day
          out[today] ||= []
          next if header
        end

        # a data row before any weekday label has nowhere to go
        next unless today

        span = parse_time_range(time)
        next unless span

        out[today] << [*span, desc, location]
      end
    end

    out
  end

  # Splits a cell such as "7:00 - 9:00am" into ["7:00am", "9:00am"].
  # Returns nil when the cell is empty or is not a two-sided range.
  def parse_time_range(time)
    return unless time

    start, finish = time.split(RANGE_SEPARATOR).map(&:strip)
    return if start.nil? || start.empty? || finish.nil? || finish.empty?

    unless MERIDIEM.match?(start)
      meridiem = finish[MERIDIEM]
      start += inherited_meridiem(start, finish, meridiem) if meridiem
    end

    [start, finish]
  end

  # Picks the am/pm marker for a start time that has none of its own.
  # Normally that is the finish time's marker, but when the start would
  # then fall after the finish (e.g. "11:00 - 1:00pm") the range must
  # cross noon or midnight, so the opposite marker is used.
  def inherited_meridiem(start, finish, meridiem)
    from = clock_minutes(start)
    to = clock_minutes(finish)
    return meridiem unless from && to && from > to

    # swap a <-> p while keeping the original letter case
    meridiem.tr('aApP', 'pPaA')
  end

  # Minutes past the 12 o'clock mark on a 12-hour dial, or nil when no
  # clock time can be found in +text+.
  def clock_minutes(text)
    clock = CLOCK.match(text)
    return unless clock

    (clock[1].to_i % 12) * 60 + clock[2].to_i
  end

  # Builds the calendar: for each of the seven days starting at +basis+,
  # every entry recorded under that day's English weekday name becomes
  # an event repeating weekly twelve times.
  #
  # NOTE(review): Time.parse yields times in the process's local zone
  # and the events are given no explicit TZID, so the output presumably
  # relies on the machine's zone matching @tzid -- confirm.
  def make_ical(basis, struct)
    ical = Icalendar::Calendar.new
    ical.add_timezone TZInfo::Timezone.get(@tzid).ical_timezone(DateTime.now)

    7.times do |offset|
      date = basis + offset
      entries = struct[date.strftime('%A')] || []

      entries.each do |start, finish, desc, location|
        # the date supplies the year/month/day missing from "7:00am"
        starts_at, ends_at = [start, finish].map { |t| Time.parse(t, date) }

        ical.event do |e|
          e.dtstart  = starts_at
          e.dtend    = ends_at
          e.summary  = desc
          e.location = location
          e.rrule    = 'FREQ=WEEKLY;COUNT=12'
        end
      end
    end

    ical
  end
end
# Script entry point: runs only when this file is executed directly,
# not when it is loaded by another program.
if __FILE__ == $PROGRAM_NAME
  # first day of the seven-day window that anchors the generated events
  week_start = Date.new(2022, 1, 31)
  schedule_url = 'https://kpe.utoronto.ca/sport-recreationrecreational-workouts-activitiesdrop-sports-activities/lane-swimming'

  calendar = Scrapinator.new('America/Toronto').run(week_start, schedule_url)
  calendar.publish

  # emit the serialized calendar on stdout
  print calendar.to_ical
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment