Skip to content

Instantly share code, notes, and snippets.

@Ovid
Created March 28, 2017 12:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Ovid/2c7a7a0371f18ad6019a44874d3d7750 to your computer and use it in GitHub Desktop.
Save Ovid/2c7a7a0371f18ad6019a44874d3d7750 to your computer and use it in GitHub Desktop.
Backup your old, dead LiveJournal entries
#!/usr/bin/env perl
use 5.18.0;
use warnings;
use autodie ":all";
use LWP::UserAgent;
use HTTP::Request::Common;
use HTML::TokeParser::Simple;
# Where to log in, and which journal to back up.
my $login_url = 'https://www.livejournal.com/login.bml?ret=1';
my $base_url = 'http://publius-ovidius.livejournal.com/'; # yeah, you want your own URL here
# I really should have put those in config files
my %formfields = (
    user        => 'YOUR USERNAME',
    password    => 'YOUR PASSWORD',
    remember_me => 1,
    ret         => 1,
);

# A single user agent (with a cookie jar) is used for the login and for
# every later request.
my $ua = LWP::UserAgent->new;
$ua->cookie_jar( { file => "$ENV{HOME}/.lj_cookies.txt" } );
$ua->protocols_allowed( [ 'http', 'https' ] );

my $login = $ua->request( POST $login_url, \%formfields );
unless ( $login->is_success ) {
    # The error text lives on the response object, not on the URL string.
    die 'Could not log in: ' . $login->message;
}

# Strip any trailing slash so that joining path segments below never
# produces a "//" in the calendar URL.
my $journal_root = $base_url =~ s{/+\z}{}r;

# Get calendar pages and do a quick scan for links.
# XXX You'll probably want to update the start and end years manually
my @years = ( 2002 .. 2013 );
for my $year (@years) {
    for my $month ( 1 .. 12 ) {
        # The base URL is passed as an argument rather than interpolated
        # into the format, so a literal "%" in it cannot break sprintf.
        my $url = sprintf '%s/%d/%02d', $journal_root, $year, $month;
        my $calendar = $ua->request( GET $url );
        if ( $calendar->is_success ) {
            save_entries( $ua, $year, $month, $calendar->content );
        }
        else {
            warn sprintf( 'Could not fetch calendar for %d/%02d: ',
                $year, $month )
              . $calendar->message;
        }
    }
}
# save_entries( $ua, $year, $month, $content )
#
# Scans the HTML of one calendar page ($content) for <a> tags whose class
# is 'j-day-subject-link', fetches each linked page with $ua and writes it
# to "$year/MM/NNNN-slug.html" (NNNN is the link's position on the page,
# slug comes from make_slug() applied to the link text).
#
# Files that already exist are left alone, so an interrupted run can be
# resumed. A failed fetch is reported with warn and nothing is written for
# it, so the entry is retried on the next run. Returns nothing useful.
# Dies (via autodie) if a directory or file cannot be created or written.
sub save_entries {
    my ( $ua, $year, $month, $content ) = @_;
    return unless $content;

    my $dir = sprintf "$year/%02d" => $month;
    mkdir $year unless -d $year;

    my $p = HTML::TokeParser::Simple->new( string => $content );
    $p->unbroken_text(1);

    my $link_num = 0;
  TOKEN:
    while ( my $token = $p->get_token ) {
        next TOKEN
          unless $token->is_start_tag('a')
          and ( $token->get_attr('class') // '' ) eq 'j-day-subject-link';
        $link_num++;
        my $link = $token->get_attr('href');

        # Advance to the first text token after the opening tag; that text
        # becomes the slug. get_token returns nothing at end of input, so
        # stop there instead of calling a method on undef.
        do {
            $token = $p->get_token;
        } until !$token || $token->is_text;
        last TOKEN unless $token;

        my $slug = make_slug( $token->as_is );
        $slug = sprintf "%04d-%s" => $link_num, $slug;
        my $file = "$dir/$slug.html";
        mkdir $dir unless -d $dir;

        # Already saved on an earlier run: no request, no delay.
        next TOKEN if -f $file;

        say "Fetching $file";
        my $entry = $ua->request( GET $link );
        if ( $entry->is_success ) {
            open my $fh, '>:encoding(UTF-8)', $file;

            # The handle encodes to UTF-8, so it must be given characters.
            # Fall back to the raw body if the charset cannot be decoded.
            print {$fh} $entry->decoded_content() // $entry->content();

            # Explicit close so buffered write errors surface (autodie).
            close $fh;
        }
        else {
            # Do not write the error body to disk: an existing file would
            # make every later run skip this entry.
            warn "Could not fetch link for $slug ($link): "
              . $entry->message;
        }
        sleep 1;    # be nice to lj
    }
    return;
}
# make_slug( $text )
#
# Turns a piece of text into a lower-case, dash-separated slug:
# whitespace runs become separators, every other non-word character is
# dropped, and one leading and one trailing dash are trimmed.
# Returns the slug, which may be the empty string.
sub make_slug {
    my ($title) = @_;
    my $slug = lc $title;

    # Apply the clean-up steps in order to the one working copy.
    for ($slug) {
        s/\s+/_/g;      # whitespace runs -> placeholder underscore
        s/\W//g;        # drop everything that is not a word character
        s/_/-/g;        # underscores -> dashes
        s/^-|-$//g;     # trim a dash at either end
    }

    return $slug;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment