Created
March 28, 2017 12:05
-
-
Save Ovid/2c7a7a0371f18ad6019a44874d3d7750 to your computer and use it in GitHub Desktop.
Back up your old, dead LiveJournal entries
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl
# Log in to LiveJournal, then walk the monthly calendar pages for a range of
# years and hand each page to save_entries() so the linked entries can be
# stored locally.
use 5.18.0;
use warnings;
use autodie ":all";
use LWP::UserAgent;
use HTTP::Request::Common;
use HTML::TokeParser::Simple;

my $login_url = 'https://www.livejournal.com/login.bml?ret=1';
my $base_url  = 'http://publius-ovidius.livejournal.com/';    # yeah, you want your own URL here

# I really should have put those in config files
my %formfields = (
    user        => 'YOUR USERNAME',
    password    => 'YOUR PASSWORD',
    remember_me => 1,
    ret         => 1,
);

# One user agent is shared by the login and every later request so that the
# cookies set at login are sent with the calendar and entry fetches.
my $ua = LWP::UserAgent->new;
$ua->cookie_jar( { file => "$ENV{HOME}/.lj_cookies.txt" } );
$ua->protocols_allowed( [ 'http', 'https' ] );

my $login = $ua->request( POST( $login_url, \%formfields ) );
unless ( $login->is_success ) {

    # Report the response's message; $login_url is only a string.
    die 'Could not log in: ' . $login->message;
}

# Calendar URLs have the form "<base>/<year>/<month>". Drop any trailing
# slash from the base so the joined URL does not contain "//".
my $calendar_base = $base_url =~ s{/+\z}{}r;

# Get calendar pages and do a quick scan for links.
# XXX You'll probably want to update the start and end years manually
my @years = ( 2002 .. 2013 );
for my $year (@years) {
    for my $month ( 1 .. 12 ) {

        # Pass the URL parts as sprintf arguments rather than interpolating
        # them into the format, so a "%" in the base URL cannot be
        # misread as a conversion.
        my $url = sprintf '%s/%d/%02d', $calendar_base, $year, $month;
        my $calendar = $ua->request( GET($url) );

        unless ( $calendar->is_success ) {
            warn sprintf( 'Could not fetch calendar for %d-%02d (%s): ',
                $year, $month, $url )
              . $calendar->message;
            next;
        }

        save_entries( $ua, $year, $month, $calendar->content );
    }
}
# Save every journal entry linked from one month's calendar page.
#
# Arguments:
#   $ua      - user agent object; its request() method fetches each entry
#   $year    - year of the calendar page, used as the top-level directory
#   $month   - month number, zero-padded to two digits for the subdirectory
#   $content - HTML of the calendar page; nothing is done if it is empty
#
# Every <a> start tag whose class is "j-day-subject-link" is treated as a
# link to an entry. The entry is written to "<year>/<MM>/<NNNN>-<slug>.html",
# where NNNN is the link's position on the page and the slug is built by
# make_slug() from the first text token after the tag. Files that already
# exist are left alone, so the script can be re-run to pick up where it
# stopped.
#
# Failed fetches are reported with warn and are NOT written to disk, so a
# later run will try them again.
sub save_entries {
    my ( $ua, $year, $month, $content ) = @_;
    return unless $content;

    my $dir = sprintf '%s/%02d', $year, $month;
    mkdir $year unless -d $year;

    my $p = HTML::TokeParser::Simple->new( string => $content );
    $p->unbroken_text(1);

    my $link_num = 0;
  TOKEN:
    while ( my $token = $p->get_token ) {
        next TOKEN
          unless $token->is_start_tag('a')
          and ( $token->get_attr('class') // '' ) eq 'j-day-subject-link';

        # Count every matching link, even ones skipped below, so the
        # numbering of the remaining files stays stable between runs.
        $link_num++;
        my $link = $token->get_attr('href');

        # Advance to the link text. get_token returns undef at the end of
        # the document, so stop cleanly instead of calling a method on it.
        my $text_token;
        while ( defined( $text_token = $p->get_token ) ) {
            last if $text_token->is_text;
        }
        last TOKEN unless defined $text_token;

        my $slug = sprintf '%04d-%s', $link_num,
          make_slug( $text_token->as_is );

        unless ( defined $link and length $link ) {
            warn "No href on link for $slug; skipping";
            next TOKEN;
        }

        my $file = "$dir/$slug.html";
        mkdir $dir unless -d $dir;

        if ( -f $file ) {
            say "Skipping $file (already saved)";
            next TOKEN;
        }

        say "Fetching $file";
        my $entry = $ua->request( GET($link) );
        sleep 1;    # be nice to lj

        unless ( $entry->is_success ) {
            warn "Could not fetch link for $slug ($link): "
              . $entry->message;

            # Do not write the error body: an existing file would make
            # every later run skip this entry.
            next TOKEN;
        }

        # content() is raw bytes; pushing those through an encoding layer
        # would encode them a second time. Write decoded characters through
        # the UTF-8 layer, or fall back to the untouched bytes if the
        # response could not be decoded.
        my $html  = $entry->decoded_content;
        my $layer = ':encoding(UTF-8)';
        unless ( defined $html ) {
            $html  = $entry->content;
            $layer = ':raw';
        }

        open my $fh, ">$layer", $file;
        print {$fh} $html or die "Could not write $file: $!";
        close $fh;    # buffered write errors surface here
    }

    return;
}
# Turn arbitrary text into a lowercase, dash-separated slug.
#
# Argument: the text to convert; undef is treated as the empty string.
# Returns:  the slug, which may be empty if the text had no word characters.
#
# Whitespace runs and underscores become separators, every other non-word
# character (including a literal "-") is dropped, and the separators are then
# emitted as single dashes with none left at either end.
sub make_slug {
    my $text = lc( shift // '' );

    $text =~ s/\s+/_/g;    # mark word gaps with a character that \W keeps
    $text =~ s/\W//g;      # drop punctuation
    $text =~ s/_+/-/g;     # a run of separators yields exactly one dash
    $text =~ s/\A-+|-+\z//g;

    return $text;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment