Skip to content

Instantly share code, notes, and snippets.

@sharifulin
Created August 8, 2011 09:51
Show Gist options
  • Save sharifulin/1131498 to your computer and use it in GitHub Desktop.
Save sharifulin/1131498 to your computer and use it in GitHub Desktop.
Export LJ posts to MongoDB using Mojolicious
#!/usr/bin/env perl
use common::sense;
use Mojo::IOLoop;
use Mojo::UserAgent;
use MongoDB;
use Data::Dumper;
# Export LiveJournal posts to MongoDB.
#
# Flow:
#   1. Request 24 index pages (?skip=0, 10, ... 230) without blocking.
#   2. For every '.entry' on an index page, collect link/title/tags/text and
#      request the post's own page.
#   3. From the post page take the date and the full text, then insert the
#      document into the 'posts' collection of the 'blog' database.
#
# Every request is bracketed by $t->begin / $t->end; $t->start is expected to
# return only once each begin has been matched by an end (NOTE(review):
# confirm against the installed Mojo::IOLoop trigger implementation). For that
# reason each callback runs its work inside eval {} so that a parsing or
# database error can never skip the matching $t->end and hang the script.

my $ua = Mojo::UserAgent->new;
my $t  = Mojo::IOLoop->trigger;

$ua->ioloop->connect_timeout(180);
$t->ioloop->connect_timeout(180);

my $conn = MongoDB::Connection->new(host => 'localhost', port => 27017);
my $mdb  = $conn->blog;

# Start from an empty collection on every run.
$mdb->posts->drop;

my $i     = 0;    # number of index pages received so far (arrival order)
my $count = 0;    # number of posts successfully inserted

for my $index (0 .. 23) {
    my $skip = $index * 10;

    $t->begin;
    $ua->get("http://sharifulin.livejournal.com/?skip=$skip" => sub {
        my $tx   = pop;
        my $from = $tx->req->url->to_string;
        warn ++$i . " $from\n";

        my $page = $i;    # page number shown in the per-post log line
        my $post = 0;     # posts received for this page so far

        my $ok = eval {
            my $list = $tx->res->dom->at('#alpha-inner')
                or die "no '#alpha-inner' element (request failed?)\n";

            for my $entry ($list->find('.entry')->each) {
                my $permalink = $entry->at('.permalink');
                my $subject   = $entry->at('.subj-link');
                my $body      = $entry->at('.entry-body');

                unless ($permalink && $subject && $body) {
                    warn "$from: skipping entry with unexpected markup\n";
                    next;
                }

                my $link     = $permalink->attrs->{href};
                my $tag_list = $entry->at('.ljtags');

                my $hash = {
                    from  => $from,
                    link  => $link,
                    title => $subject->text,
                    tags  => $tag_list
                        ? [ map { $_->text } $tag_list->find('a')->each ]
                        : [],
                    # Preliminary text from the index page with the tag
                    # block removed; replaced below by the post page text.
                    text  => do {
                        my $xml = $body->content_xml;
                        $xml =~ s{\s*<div class="ljtags">.*?</div>\s*}{}s;
                        $xml;
                    },
                };

                $t->begin;
                $ua->get($link => sub {
                    my $post_tx = pop;
                    warn $page . ' ' . ++$post . " $link\n";

                    my $stored = eval {
                        my $content = $post_tx->res->dom->at('#content-wrapper')
                            or die "no '#content-wrapper' element\n";

                        # The post page offers no helpful classes, so the
                        # date and text are located by structural selectors.
                        my $date_node = $content->at('font')
                            or die "no date element\n";
                        my $text_node = $content->at('div[style="margin-left: 30px"]')
                            or die "no text element\n";

                        my $date = $date_node->all_text;
                        $date =~ s/^@\s+//;
                        $date =~ s/ - /-/sg;
                        $hash->{date} = $date;

                        my $text = $text_node->content_xml;
                        $text =~ s{^.*?</b></i></font><br />}{}s;
                        # /g is required: without it only the leading
                        # alternative ever matches and the tail is kept.
                        $text =~ s/\A\s+|\s+\z//g;
                        $hash->{text} = $text;

                        $count++ if $mdb->posts->insert($hash, {safe => 1});
                        1;
                    };
                    warn "$link: " . ($@ || "unknown error\n") unless $stored;

                    $t->end;
                });
            }
            1;
        };
        warn "$from: " . ($@ || "unknown error\n") unless $ok;

        $t->end;
    });
}

$t->start;

say "Total: $count";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment