Last active
August 29, 2015 14:01
-
-
Save RogerDodger/4c6300696129dff8dbf5 to your computer and use it in GitHub Desktop.
Grabs stories from writeoff.me and saves them to disk
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl
# Scrape writeoff.me events for stories and write them to files.
#
# For every event ID in @events the script fetches the event's fic gallery
# page, creates a directory for the event under $root, and saves the response
# body of every link whose text is "txt" into that directory.
use Mojo::Base -strict;
use Mojo::UserAgent;
use Mojo::URL;
use File::Spec;

# IDs of events to scrape
my @events = (1..23);

# Directory to write stories to
my $root = 'writeoff';
unless (-d $root) {
    mkdir $root or die "Cannot create directory '$root': $!\n";
}

my $ua = Mojo::UserAgent->new(max_redirects => 2);

# Turn the hash ref returned by Mojo::Transaction->error into readable text.
# A 'code' key is only present when the server actually answered (4xx/5xx);
# otherwise the failure happened at the connection level.
sub _error_text {
    my ($err) = @_;
    return $err->{code}
        ? "$err->{code} response: $err->{message}"
        : "Connection error: $err->{message}";
}

EVENT:
for my $eid (@events) {
    my $gallery_url = "http://writeoff.me/event/$eid/fic/gallery";
    my $tx          = $ua->get($gallery_url);

    if (my $err = $tx->error) {
        warn "Event $eid ($gallery_url): ", _error_text($err), "\n";
        next EVENT;
    }
    my $res = $tx->res;

    # Event exists, make sure dir exists to write stories to.
    #
    # The URL is different from the one we requested because there was a
    # redirect, e.g., /event/23/... -> /event/23-One-Little-Mistake/...
    # We want to extract the "23-One-Little-Mistake" to use as the dir name.
    # If no such segment is present (no redirect happened), fall back to the
    # bare event ID so stories never end up directly in $root.
    my $final_url = $tx->req->url;
    my ($id_uri) = "$final_url" =~ m{/ (\d+ - [^/]+) /}x;
    $id_uri //= $eid;

    my $dir = File::Spec->catdir($root, $id_uri);
    unless (-d $dir) {
        unless (mkdir $dir) {
            warn "Cannot create directory '$dir': $!\n";
            next EVENT;
        }
    }

    say "Downloading stories from $id_uri...";

    # Iterate through all the "txt" links (there's one for each story) and
    # write their responses to files.
    LINK:
    for my $link ($res->dom->find('a')->each) {
        # Tolerate whitespace around the link text.
        next LINK unless $link->text =~ m{\A \s* txt \s* \z}x;

        my $href = $link->attr('href');
        next LINK unless defined $href && length $href;

        # File name is whatever follows the last slash of the href.
        my $filename = $href =~ s{.+/}{}r;
        next LINK unless length $filename;

        # Resolve relative hrefs against the (post-redirect) gallery URL;
        # absolute hrefs pass through unchanged.
        my $story_url = Mojo::URL->new($href)->to_abs($final_url);
        my $story_tx  = $ua->get($story_url);

        if (my $err = $story_tx->error) {
            warn "$story_url: ", _error_text($err), "\n";
            next LINK;
        }

        my $path = File::Spec->catfile($dir, $filename);
        my $fh;
        unless (open $fh, '>', $path) {
            warn "Cannot open '$path' for writing: $!\n";
            next LINK;
        }
        print {$fh} $story_tx->res->body;

        # Buffered write errors only surface at close time.
        close $fh or warn "Error while writing '$path': $!\n";
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment