Skip to content

Instantly share code, notes, and snippets.

@c18t
Last active December 19, 2015 01:08
Show Gist options
  • Save c18t/5873155 to your computer and use it in GitHub Desktop.
Save c18t/5873155 to your computer and use it in GitHub Desktop.
crawler for twitter api v1.1. crawl_settings.pl で設定ファイル生成して crawl_insert.pl でDBに突っ込む
#!/usr/local/bin/perl
use 5.014;
use warnings;
use utf8;
use lib qw(lib);
use autobox;
use autobox::dump;
use autobox::Core;
use Data::Lock qw(dlock);
# Console encoding: cp932 when running on Windows (MSWin32), utf8 everywhere
# else.  The value is passed through Data::Lock's dlock (presumably to make it
# read-only -- confirm against the Data::Lock documentation).
dlock my $CHARSET = ($^O eq 'MSWin32' ? 'cp932' : 'utf8');

# Apply that encoding layer to both standard input and standard output.
for my $handle (\*STDIN, \*STDOUT) {
    binmode $handle, ":encoding($CHARSET)";
}
use Uc::Model::Twitter;
use YAML qw(LoadFile);
use Config::Pit;
use Getopt::Long;
use DBIx::QueryLog ();
use Math::BigInt;
require "twitter_agent.pl";
# Unbuffer STDOUT so the progress and countdown output shows up immediately.
local $| = 1;

# Command-line options; every flag starts out as the empty string (false).
my ( $file, $debug, $dbdebug, $help ) = ('') x 4;
my $result = GetOptions(
    "f|file=s"    => \$file,
    "d|debug"     => \$debug,
    "dbd|dbdebug" => \$dbdebug,
    "h|help|?"    => \$help,
);

# The first remaining argument selects the "utig.pl.<name>" Config::Pit entry.
my $prof = shift;

# Print the usage text when help was requested, when option parsing failed
# (GetOptions returns false in that case), or when a required argument is
# missing.  Only an explicit help request counts as a successful exit.
if ($help or !$result or !$file or !$prof) {
    warn <<"_HELP_";
Usage: $0 -f (setting yaml file) [-d [show debug message] -dbd [show sql debug message] -h [help]] (conf_name)
_HELP_
    exit( $help ? 0 : 1 );
}

# Application credentials (consumer key/secret) stored under 'utig.pl'.
my $utig = pit_get('utig.pl', require => {
    consumer_key    => '',
    consumer_secret => '',
});

# Per-profile settings.  twitter_agent() is not defined in this file; it is
# presumably provided by the required twitter_agent.pl.
my $config = pit_get("utig.pl.$prof");
my $nt     = twitter_agent($utig, $config);

# Write the profile back when the agent flagged its configuration as updated.
pit_set("utig.pl.$prof", data => $config) if $nt->{config_updated};

# MySQL credentials.
my $mysql = pit_get('mysql', require => {
    user => '',
    pass => '',
});

# Optional SQL logging, switched on with -dbd.
DBIx::QueryLog->enable if $dbdebug;

# Model object connected to the 'twitter' database; utf8 is enabled on the
# handle and 'set names utf8mb4' is issued on connect.
my $schema = Uc::Model::Twitter->new(
    connect_info => [
        'dbi:mysql:twitter', $mysql->{user}, $mysql->{pass},
        {
            mysql_enable_utf8 => 1,
            on_connect_do     => ['set names utf8mb4'],
        },
    ],
);

# The settings file must hold a sequence of crawl targets; each entry is
# handed to crawl_insert() in file order.
my $data = LoadFile($file);
die "$file: expected a YAML sequence of crawl targets\n"
    unless ref $data eq 'ARRAY';

for my $object (@{$data}) {
    crawl_insert($object);
}
exit;
# Crawl one target page by page and store every returned tweet via $schema.
#
# Arguments:
#   $object - hashref with the keys
#               method  : name of the method to call on $nt; 'favorites' gets
#                         its own limit and parameters, any other value is
#                         handled by the second branch
#               user_id : id of the user whose tweets are fetched
#
# Uses the file-level $nt, $schema, $config and $debug.
#
# Each page is fetched and stored inside one transaction scope.  After every
# request the sub sleeps long enough to stay under the per-15-minute call
# limit.  A failed page is requested again; once the retry budget is exhausted
# the sub dies with the last error.  Paging stops when a page is empty or does
# not lower the smallest id seen so far.
sub crawl_insert {
    my $object = shift;
    my ($method, $user_id) = @{$object}{qw/method user_id/};

    # $limit is the number of calls allowed per 15-minute window.
    my ($limit, $params);
    if ($method eq 'favorites') {
        $limit  = 15;
        $params = { count => 200, include_entities => 0, user_id => $user_id };
    }
    else {
        $limit  = 180;
        $params = { count => 200, include_rts => 1, user_id => $user_id };
    }

    my $wait = int(15*60 / $limit) + 1;   # seconds to pause after each request
    my $size = length "$wait";            # field width of the countdown number

    my $max_id      = -1;   # smallest tweet id seen so far (-1: none yet)
    my $last_max_id = -1;   # $max_id as of the previous completed page
    my $retry       = 0;    # consecutive failures of the current page

    PAGE: while (1) {
        my $count = 0;

        my $ok = eval {
            # Ask only for tweets older than the ones already stored.  The id
            # is decremented and stringified by Math::BigInt itself, because
            # formatting it with "%d" would numify it and could lose digits on
            # perls without 64-bit integers.
            if ($last_max_id != -1) {
                $params->{max_id} = Math::BigInt->new("$last_max_id")->bdec->bstr;
            }

            print "crawling $user_id\'s $method\:$last_max_id ... ";
            print "with ", $params->perl if $debug;

            my $res = $nt->$method($params);
            $count = scalar @{$res};
            say "done. got $count tweets.";

            my $txn = $schema->txn_scope;
            for my $tweet (@{$res}) {
                say "tweet->id < max_id: $tweet->{id} < $max_id" if $debug;

                # Track the smallest id seen so far.
                if ($max_id == -1 or $tweet->{id} < $max_id) {
                    $max_id = Math::BigInt->new($tweet->{id}.'');
                }

                $schema->find_or_create_status_from_tweet(
                    $tweet,
                    { user_id => $config->{user_id}, ignore_remark_disabling => 1 }
                );
            }
            $txn->commit;
            1;
        };
        # Capture the error right away: $@ is a global that later code may reset.
        my $error = $ok ? '' : ($@ || 'unknown error');

        say "last_max_id != max_id: $last_max_id != $max_id" if $debug;
        say "try error: $error" unless $ok;

        # Countdown between requests, shown in place on a single line.
        for my $remaining (reverse 1 .. $wait) {
            printf "wait %${size}d sec.\r", $remaining;
            sleep 1;
        }

        if (!$ok) {
            die "giving up on $user_id\'s $method after $retry retries: $error"
                if $retry > 5;
            $retry++;
            next PAGE;
        }

        # Finished when the page was empty or did not move the lower bound.
        last PAGE unless $count && $last_max_id != $max_id;

        $retry       = 0;
        $last_max_id = $max_id;
    }

    return;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment