Skip to content

Instantly share code, notes, and snippets.

@nkmrtkhd
Forked from nkmrgk/googlehistory.pl
Created April 2, 2012 14:40
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nkmrtkhd/2283935 to your computer and use it in GitHub Desktop.
Save nkmrtkhd/2283935 to your computer and use it in GitHub Desktop.
Downloader for Google history
#!/usr/bin/perl
#
# googlehistory.pl - Downloader for Google history
#
use strict;
use warnings;
use Encode;
use utf8;
use URI;
use HTTP::Cookies;
use WWW::Mechanize;
use HTML::Entities;
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Maximum number of history pages to fetch; 0 means "no limit".
my $max_page = 0;
# Seconds to sleep between two page requests.
my $wait_sec = 1;
# Account name typed into the Google login form.
my $account = 'your-google-account';

# Ask for the password interactively, with terminal echo switched off so
# the typed characters are not displayed.
my $password;
print "input pass:";
system "stty -echo";
$password = <STDIN>;
# Echo is restored before any error handling so the terminal is never left
# in no-echo mode.
system "stty echo";
print "\n";
# <STDIN> returns undef on end-of-file; stop here instead of chomping undef
# and later submitting an undefined password to the login form.
die "No password could be read from STDIN\n" if !defined $password;
chomp $password;

# Output file; fetched history rows are appended to it as tab-separated text.
my $tsv = 'googlehistory.tsv';
# File in which the login cookies are cached between requests.
my $cache = 'googlehistory.cache_cookie';
# Age in seconds after which the cookie cache is considered stale (one hour).
my $cache_expire = 60*60;
# Timeout in seconds handed to WWW::Mechanize.
my $http_timeout = 300;
# Number of fetch/login attempts before giving up.
my $login_max_retry = 3;
# When true, every parsed timed entry is also printed to STDOUT.
my $verbose = 1;
# Encoding used for that console output.
my $console_encoding = 'cp932';
# get($uri) - fetch one history page, logging in to Google first if needed.
#
# A cookie file ($cache) is reused while it is younger than $cache_expire
# seconds; otherwise the login form is submitted with $account/$password.
# A page is accepted only when it contains the marker "bkmk_href_"; if the
# marker is missing the cookie cache is discarded and the whole sequence is
# retried, up to $login_max_retry attempts in total.
#
# Parameters:
#   $uri - address of the history page to download.
# Returns:
#   The page body as a UTF-8 encoded byte string.
# Dies:
#   - with "Login failed N times." when the last attempt still lacks the marker;
#   - with whatever WWW::Mechanize throws on a failed request.
sub get {
    my ($uri) = @_;

    my $mech = WWW::Mechanize->new(timeout => $http_timeout);
    # Constructing the jar with file => ... loads the cookies already stored
    # in $cache; autosave writes the jar back when it is destroyed.
    my $cookie = HTTP::Cookies->new(file => $cache, autosave => 1);
    $mech->agent_alias('Windows IE 6');
    $mech->cookie_jar($cookie);

    my $content;
    for (my $i = 1; $i <= $login_max_retry; $i++) {
        my $cache_is_fresh = -f $cache
            && time - (stat($cache))[9] < $cache_expire;

        if (!$cache_is_fresh) {
            # Removing the file alone is not enough: the jar loaded the old
            # cookies at construction time, so drop them from memory as well
            # before logging in again.
            unlink $cache;
            $cookie->clear;
            $mech->get('https://www.google.co.jp/accounts/Login');
            $mech->form_number(1);
            $mech->field(Email  => $account);
            $mech->field(Passwd => $password);
            $mech->click();
        }

        warn "GET $uri\n";
        # No eval wrapper: the previous one only re-threw the exception.
        my $res = $mech->get($uri);

        # decoded_content yields a character string (or undef when decoding
        # fails); the rest of the script works on UTF-8 bytes, so encode
        # explicitly rather than flipping the internal UTF-8 flag.
        my $decoded = $res->decoded_content;
        $content = defined $decoded ? Encode::encode_utf8($decoded) : '';

        if ($content !~ /bkmk_href_/) {
            die "Login failed $i times." if ($i == $login_max_retry);
            warn "Login retry $i\n";
            # Force a fresh login on the next iteration.
            unlink $cache;
            next;
        }

        # Success
        last;
    }
    return $content;
}
# parse($content) - turn one history page into TSV rows plus a next-page link.
#
# The page is cut into chunks at every closing </div>; each chunk is tested
# against four patterns, in this order:
#   1. an <h1> heading: its text becomes the current day label, after the
#      first "&nbsp;" and every parenthesised part are removed;
#   2. an anchor with id="bkmk_href_" followed by an HH:MM table cell:
#      emits the row  day, time, anchor text, entity-decoded href;
#   3. an anchor with id="bkmk_href_<digits and dashes>":
#      emits the row  day, id suffix, anchor text, entity-decoded title attribute;
#   4. the "kd-button" pager anchor: gives the next page and stops the scan.
#
# When $verbose is true, each row of kind 2 is also printed to STDOUT in
# $console_encoding.
#
# Parameters:
#   $content - page body as UTF-8 bytes.
# Returns:
#   A two-element list: the concatenated rows (each tab-separated and
#   newline-terminated, '' when nothing matched) and the absolute URI of the
#   next page (undef when no pager link was found).
sub parse {
    my ($content) = @_;

    my $current_day = '';
    my @rows;
    my $next_uri;

    CHUNK:
    for my $chunk (split(/<\/div>/, $content)) {
        # Day heading.
        if ($chunk =~ m|<h1>(.*?)</h1>|) {
            $current_day = $1;
            $current_day =~ s/&nbsp;//;
            $current_day =~ s/\(.*?\)//g;
        }

        # Timed history entry.
        if ($chunk =~ m|href="(.*?)".*? id="bkmk_href_">(.*?)</a>.*?(\d\d:\d\d)</td>|) {
            my ($href, $text, $clock) = ($1, $2, $3);
            $href = decode_entities($href);
            push @rows, join("\t", $current_day, $clock, $text, $href) . "\n";
            if ($verbose) {
                $| = 1;    # unbuffered STDOUT so progress shows up immediately
                print encode($console_encoding,
                    decode('utf8', "$current_day $clock $text\n"));
            }
        }

        # Entry whose anchor id carries a numeric suffix.
        if ($chunk =~ m|title="(.*?)".*? id="bkmk_href_([\d\-]+)">(.*?)</a>|) {
            my ($label, $bkmk_id, $text) = ($1, $2, $3);
            $label = decode_entities($label);
            push @rows, join("\t", $current_day, $bkmk_id, $text, $label) . "\n";
        }

        # Pager link: rewrite the relative address to an absolute one and stop.
        if ($chunk =~ m|<a class="kd-button" href="(\./lookup\?hl=ja&max=\d+)">|) {
            $next_uri = $1;
            $next_uri =~ s|.*lookup|https://www.google.com/history/lookup|;
            last CHUNK;
        }
    }

    return (join('', @rows), $next_uri);
}
# crowl_history($uri) - download one history page and append its rows to $tsv.
#
# Parameters:
#   $uri - address of the history page to process.
# Returns:
#   The URI of the next page as reported by parse(), or undef when there is
#   none.
# Dies:
#   When the TSV file cannot be opened, written or closed, and with anything
#   get() throws.
sub crowl_history {
    my ($uri) = @_;

    my $content = get($uri);
    my ($lines, $nextlink) = parse($content);

    open my $add, '>>', $tsv or die "Cannot open $tsv for appending: $!";
    print {$add} $lines or die "Cannot write to $tsv: $!";
    # Buffered write errors (disk full, ...) only surface at close time, so
    # the result of close must be checked on a write handle.
    close $add or die "Cannot close $tsv: $!";

    return $nextlink;
}
# main($uri) - walk the history pages one after another.
#
# Starts at $uri (or at the default lookup address when $uri is false) and
# keeps following the next-page link returned by crowl_history() until no
# link is returned or, when $max_page is non-zero, until $max_page pages have
# been processed. Sleeps $wait_sec seconds after every page that has a
# successor.
#
# Parameters:
#   $uri - optional start address.
sub main {
    my ($uri) = @_;

    my $default_uri = 'https://www.google.com/history/lookup?hl=ja';
    $uri = $default_uri unless $uri;

    my $page = 0;
    while (1) {
        $page++;
        # A zero $max_page disables the page limit.
        last if $max_page && $page > $max_page;

        my $following = crowl_history($uri);
        last unless $following;

        $uri = $following;
        sleep($wait_sec);
    }
}
# Script entry point: the optional first command-line argument is the page
# to start from.
main(shift @ARGV);
__END__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment