Created
February 15, 2019 03:30
-
-
Save JohnMertz/080347a57a4880700170ea8ecfa241e9 to your computer and use it in GitHub Desktop.
OCTranspo Alert Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
#
# OCTranspo alert scraper.
#
# Fetches the 'today' and 'week' service-update pages from octranspo.com,
# walks the HTML line by line, and prints the alerts it finds as pretty JSON
# shaped as:  { period => { alert type => { route heading => notice text } } }
#
# The parser is a small state machine driven by three kinds of lines:
#   1. an <h2> "... Icon" heading, which sets the current alert type;
#   2. an <h3 class="accordion_header ..."> line, which sets the current route;
#   3. a line ending in </p>, which is taken as the notice for that route.
use strict;
use warnings;
use WWW::Mechanize;
use Data::Dump qw/ dump /;    # NOTE: not used by this script; kept so the dependency list is unchanged.
use JSON::Any;

# Define the routes that I actually care about.
# Leave the list empty to report every route found on the page.
my @routes = qw| |;

# One precompiled pattern per wanted route. quotemeta keeps a list entry from
# being interpreted as regex syntax; for plain route numbers it is a no-op.
# The optional "NN, " groups let a route match anywhere in a heading such as
# "Route 12, 95".
my @route_patterns = map {
    my $number = quotemeta $_;
    qr/Route (?:[0-9]+, )*${number}\b/
} @routes;

# Set up the Mechanize object. autocheck => 1 makes a failed request die
# instead of silently handing back an error page to the parser.
my $mech = WWW::Mechanize->new(autocheck => 1, cookie_jar => {}, agent => 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; MDDRJS; rv:11.0) like Gecko');

my %alerts;

# Fetch both the 'today' and 'week' updates
foreach my $period ( qw| today week | ) {
    $alerts{$period} = {};
    $mech->get('http://www.octranspo.com/updates-' . $period);

    # Parser state. Both are scoped to one page so nothing leaks from the
    # 'today' page into the 'week' page.
    my $type  = '';    # current alert type; empty until the first heading
    my $route = '';    # route heading awaiting its notice; empty when none

    # Accept CRLF as well as LF so a stray "\r" cannot defeat the
    # end-of-line anchors used below.
    LINE:
    foreach my $line ( split /\r?\n/, $mech->content() ) {

        # A heading always (re)sets the alert type, whether or not one has
        # been seen already.
        if ( $line =~ m/^<h2>.*Icon/ ) {
            # If the alt text is not in the expected form the substitution
            # does nothing and the raw line is used as the type.
            ( $type = $line ) =~ s/^.*alt="([^TI]*) (Trips )?Icon".*$/${1}/;

            # A route left over from the previous section must not pick up
            # a notice that belongs to this one.
            $route = '';
            next LINE;
        }

        # Prior to hitting the first heading, headings are the only thing
        # of interest.
        next LINE unless $type;

        # A route heading: remember it, unless a route filter excludes it.
        if ( $line =~ m/h3 class="accordion_header/ ) {
            ( $route = $line ) =~ s/.*class="detail">([^<]*)<.*/${1}/;

            # If routes have been listed, only keep those.
            if (@route_patterns) {
                my $wanted = grep { $route =~ $_ } @route_patterns;
                $route = '' unless $wanted;
            }
            next LINE;
        }

        # With a route stored, the next line that closes a paragraph is
        # taken as that route's details.
        next LINE unless $route && $line =~ m/<\/p>$/;

        # Clear the HTML out of the line (leading tabs, one optional link,
        # and the closing </p>).
        my $notice = $line;
        if ( $notice =~ m/<a href="/ ) {
            $notice =~ s/^\t+([^<]*)<a href="[^"]*">([^<]*)<\/a>([^<]*)<\/p>$/${1}${2}${3}/;
        }
        else {
            $notice =~ s/^\t+([^<]*)<\/p>$/${1}/;
        }

        # Record the alert, then reset the route so that no further details
        # are looked for where there shouldn't be any.
        $alerts{$period}{$type}{$route} = $notice;
        $route = '';
    }
}

my $json = JSON::Any->new(pretty => 1);
print $json->encode(\%alerts);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Scrapes for route cancellations, delays and detours. Prints the results as pretty JSON. To restrict results to select routes, simply add the desired route numbers between the pipes of the `@routes` definition (`qw| |`) near the top of the script.
Note: The Data::Dump dependency is not actually necessary, I just forgot to remove it.