Created
November 10, 2013 14:20
-
-
Save xtetsuji/7398853 to your computer and use it in GitHub Desktop.
The Interviews article exporter and downloader.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
# Initial release by @xtetsuji at 2013/11/10 | |
# See following document for detail. | |
use strict; | |
use warnings; | |
use utf8; | |
use LWP::UserAgent; | |
use Web::Query; | |
binmode STDOUT, ':utf8'; | |
binmode STDERR, ':utf8'; | |
my $VERSION = "0.01"; | |
my $DATE_REVISION = "2013-11-10"; | |
my $username = shift; | |
if ( !$username ) { | |
die "username is required\n"; | |
} | |
if ( $username =~ /[\W-]/ ) { | |
die "username is required as alphabet and numeric and _ and -.\n"; | |
} | |
my $ua = LWP::UserAgent->new( | |
agent => "TheInterviews Exporter/$VERSION/$DATE_REVISION", | |
); | |
my $endpoind_url = "http://theinterviews.jp"; | |
my $base_url = "$endpoind_url/$username"; | |
my $document = wq($base_url); | |
my @article; | |
my $log_cb = sub { | |
my $i = shift; | |
my $data = {}; | |
#print $i . ": " . $_->find("h1")->text . "\n"; | |
$data->{id} = $_->attr("id"); | |
$data->{title} = $_->find("h1")->text; | |
$data->{photo_url} = $_->find(".photo img")->attr("src"); | |
download($data->{photo_url}, $data->{id}) if $data->{photo_url}; | |
$data->{photo_caption} = $_->find(".photo p.caption")->text; | |
$data->{publish_date} = $_->find(".publish_date")->text; | |
s/^\s+//, | |
s/\s+$// for $data->{publish_date}; | |
$_->find(".photo")->remove; | |
$_->find(".publish_date")->remove; | |
$data->{article} = $_->find(".note")->html; | |
s{<br />}{\n}g, s/…/…/g, s/^ //gm, | |
s/&#x([0-9A-F]{4});/ chr hex $1 /egi for $data->{article}; | |
push @article, $data; | |
}; | |
$document->find(".log")->each($log_cb); | |
# $next_href eq '/:username/page/:number' | |
while ( my $next_href = $document->find(q{a[rel="next"]})->attr("href") ) { | |
my $page_url = $endpoind_url . $next_href; | |
$document = wq($page_url); # previous $document having object is trashed. | |
$document->find(".log")->each($log_cb); | |
} | |
# output: plain text mode. | |
for my $data (@article) { | |
my $content = ''; | |
$content .= "Title: $data->{title}\n"; | |
$content .= "Photo-Caption: " . ($data->{photo_caption} || '') . "\n"; | |
$content .= "Photo-URL: " . ($data->{photo_url} || '') . "\n"; | |
$content .= "Publish-Date: $data->{publish_date}\n"; | |
$content .= "Article-Id: $data->{id}\n"; | |
$content .= "\n"; | |
$content .= $data->{article}; | |
writefile($data->{id} . ".txt", $content); | |
} | |
sub download { | |
my ($url, $id) = @_; | |
if ( !$url || !$id ) { | |
die "download requires url and id."; | |
} | |
my ($ext) = $url =~ /\.(\w+)$/; | |
my $res = $ua->get($url); | |
open my $fh, '>:raw', "$id.$ext"; | |
print {$fh} $res->content; | |
} | |
sub writefile { | |
my ($file, $content) = @_; | |
open my $fh, '>:utf8', $file; | |
print {$fh} $content; | |
return 1; | |
} | |
__END__ | |
=pod | |
=encoding utf-8 | |
=head1 NAME | |
ti-export - The Interviews article exporter and downloader. | |
=head1 SYNOPSIS | |
perl ti-export USERNAME | |
# downloaded *.txt and *.jpg at current directory. | |
=head1 DESCRIPTION | |
(stub) | |
=head1 COPYRIGHT AND LICENSE | |
Copyright (C) 2013 by OGATA Tetsuji | |
This library is free software; you can redistribute it and/or modify | |
it under the same terms as Perl itself. | |
=cut |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment