Last active
December 19, 2015 01:08
-
-
Save c18t/5873155 to your computer and use it in GitHub Desktop.
crawler for twitter api v1.1. crawl_settings.pl で設定ファイル生成して crawl_insert.pl でDBに突っ込む
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
use 5.014; | |
use warnings; | |
use utf8; | |
use lib qw(lib); | |
use autobox; | |
use autobox::dump; | |
use autobox::Core; | |
use Data::Lock qw(dlock); | |
dlock my $CHARSET = ($^O eq 'MSWin32' ? 'cp932' : 'utf8'); | |
binmode STDIN => ":encoding($CHARSET)"; | |
binmode STDOUT => ":encoding($CHARSET)"; | |
use Uc::Model::Twitter; | |
use YAML qw(LoadFile); | |
use Config::Pit; | |
use Getopt::Long; | |
use DBIx::QueryLog (); | |
use Math::BigInt; | |
require "twitter_agent.pl"; | |
# ---------------------------------------------------------------------------
# Script body: parse the command line, build the Twitter client and the DB
# handle, then crawl every target listed in the settings file.
# ---------------------------------------------------------------------------

# Unbuffered STDOUT so the "crawling ..." / "wait N sec." progress output is
# visible while the script is running.
$| = 1;

my ( $file, $debug, $dbdebug, $help ) = ('') x 4;
my $result = GetOptions(
    "f|file=s"    => \$file,
    "d|debug"     => \$debug,
    "dbd|dbdebug" => \$dbdebug,
    "h|help|?"    => \$help,
);

# The first remaining argument names the Config::Pit profile ("utig.pl.<name>").
my $prof = shift @ARGV;

# Show usage when help is requested, when option parsing failed (GetOptions
# returns false and has already printed its own diagnostics), or when a
# mandatory argument is missing. Only an explicit help request exits with 0.
if ( $help or !$result or !$file or !$prof ) {
    warn <<"_HELP_";
Usage: $0 -f (setting yaml file) [-d [show debug message] -dbd [show sql debug message] -h [help]] (conf_name)
_HELP_
    exit( $help ? 0 : 1 );
}

# Application credentials shared by every profile.
my $utig = pit_get('utig.pl', require => {
    consumer_key    => '',
    consumer_secret => '',
});

# Per-profile settings. twitter_agent() comes from twitter_agent.pl; it may
# modify $config, which it signals through $nt->{config_updated}, in which
# case the profile is written back.
my $config = pit_get("utig.pl.$prof");
my $nt     = twitter_agent($utig, $config);
pit_set("utig.pl.$prof", data => $config) if $nt->{config_updated};

my $mysql = pit_get('mysql', require => {
    user => '',
    pass => '',
});

# Log every SQL statement when -dbd is given.
DBIx::QueryLog->enable if $dbdebug;

my $schema = Uc::Model::Twitter->new(
    connect_info => [
        'dbi:mysql:twitter', $mysql->{user}, $mysql->{pass},
        {
            mysql_enable_utf8 => 1,
            on_connect_do     => ['set names utf8mb4'],
        },
    ],
);

# The settings file must hold a sequence of crawl targets; each one is handed
# to crawl_insert(), which reads its 'method' and 'user_id' keys.
my $data = LoadFile($file);
die "$file: expected a YAML sequence of crawl targets\n"
    unless ref $data eq 'ARRAY';

for my $object (@$data) {
    crawl_insert($object);
}

exit;
# crawl_insert(\%object)
#
# Crawls one target from the settings file and stores every tweet it returns.
#
#   $object->{method}   name of the client method to call on $nt
#                       ('favorites' gets its own parameters and pacing;
#                       anything else is treated like a timeline call)
#   $object->{user_id}  user whose tweets are fetched
#
# The target is paged backwards: each request asks for tweets older than the
# smallest id seen so far (sent as max_id), until a page comes back empty or
# no smaller id turns up. Each page is stored inside one transaction. After
# every request, successful or not, the sub sleeps long enough to stay within
# $limit requests per 15 minutes.
#
# A failed page is retried; once the retry budget is exhausted the sub dies
# with the last error. Relies on the file-level $nt, $schema, $config and
# $debug.
sub crawl_insert {
    my $object = shift;
    my ($method, $user_id) = @{$object}{qw/method user_id/};

    # Fail fast instead of burning the whole retry/sleep cycle on a target
    # that can never work.
    die "crawl target is missing 'method'\n"
        unless defined $method && length $method;

    # $limit is the number of requests allowed per 15-minute window.
    my ($limit, $params);
    if ($method eq 'favorites') {
        $limit  = 15;
        $params = { count => 200, include_entities => 0, user_id => $user_id };
    }
    else {
        $limit  = 180;
        $params = { count => 200, include_rts => 1, user_id => $user_id };
    }

    my $wait = int(15*60 / $limit) + 1;    # seconds between requests
    my $size = length "$wait";             # field width of the countdown

    # $max_id tracks the SMALLEST tweet id seen so far (it becomes the next
    # request's max_id); $last_max_id is the cursor used by the previous
    # request. -1 means "not set yet".
    my $max_id      = -1;
    my $last_max_id = -1;
    my $retry       = 0;

    PAGE: while (1) {
        my $count = 0;

        my $ok = eval {
            # Build the cursor with BigInt arithmetic and stringify it
            # directly; going through sprintf "%d" would squeeze the id
            # through a native integer.
            $params->{max_id} = Math::BigInt->new("$last_max_id")->bdec->bstr
                if $last_max_id != -1;

            print "crawling $user_id\'s $method\:$last_max_id ... ";
            print "with ", $params->perl if $debug;

            # NOTE(review): $method comes straight from the YAML file and is
            # used as a method name on the client; keep that file trusted.
            my $res = $nt->$method($params);
            $count = scalar @$res;
            say "done. got $count tweets.";

            my $txn = $schema->txn_scope;
            while (my $q = shift @$res) {
                say "tweet->id < max_id: $q->{id} < $max_id" if $debug;
                $max_id = Math::BigInt->new($q->{id}.'')
                    if $max_id == -1 or $q->{id} < $max_id;
                $schema->find_or_create_status_from_tweet(
                    $q,
                    { user_id => $config->{user_id}, ignore_remark_disabling => 1 }
                );
            }
            $txn->commit;
            1;
        };
        # Capture the failure right away: $@ is a global that later code may
        # overwrite, and the block's own return value is the reliable signal.
        my $error = $ok ? undef : ( $@ || 'unknown error' );

        say "last_max_id != max_id: $last_max_id != $max_id" if $debug;
        say "try error: $error" if defined $error;

        # Pace the requests; this also runs after the final page so the next
        # target does not start inside the same rate-limit slot.
        for my $remaining (reverse 1 .. $wait) {
            printf "wait %${size}d sec.\r", $remaining;
            sleep 1;
        }

        if (defined $error) {
            die "giving up on $user_id\'s $method: $error" if $retry++ > 5;
            next PAGE;    # same cursor, try the page again
        }

        # Stop when the page was empty or produced no older tweet.
        last PAGE unless $count && $last_max_id != $max_id;

        $retry       = 0;
        $last_max_id = $max_id;
    }

    return;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment