Created
May 10, 2020 14:21
-
-
Save mala/985e5b6404ebb75011544a75ac902d90 to your computer and use it in GitHub Desktop.
一個前の記事用に書いた Twitter Profile 収集するやつ
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use strict;
use warnings;

use boolean;
use Data::Dumper;
use Date::Parse;
use List::Util qw(uniqstr);
use MongoDB;
use Net::Twitter;

# Shared Twitter REST v1.1 client used by every API helper in this script.
#
# Each credential is read from the environment when the corresponding
# variable is set (TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET,
# TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_TOKEN_SECRET); otherwise the original
# "xxx" placeholder is used, so editing the literals in place still works.
my $nt = Net::Twitter->new(
    traits              => [qw/API::RESTv1_1/],
    consumer_key        => defined $ENV{TWITTER_CONSUMER_KEY}        ? $ENV{TWITTER_CONSUMER_KEY}        : "xxx",
    consumer_secret     => defined $ENV{TWITTER_CONSUMER_SECRET}     ? $ENV{TWITTER_CONSUMER_SECRET}     : "xxx",
    access_token        => defined $ENV{TWITTER_ACCESS_TOKEN}        ? $ENV{TWITTER_ACCESS_TOKEN}        : "xxx",
    access_token_secret => defined $ENV{TWITTER_ACCESS_TOKEN_SECRET} ? $ENV{TWITTER_ACCESS_TOKEN_SECRET} : "xxx",
);
# @normal = MongoDB->connect("mongodb://127.0.0.1")->ns("test.twitter")->find({verified => false, "status.lang" => "ja"})->all; scalar @normal; | |
# scalar map { join "\t", @{$_} } sort { $b->[0] <=> $a->[0] } grep { $tmp = $_->[1]; $tmp=~m/([\x{3000}\x{2800}_\x{ff3f}\s]+\w{1}){3,}/ } map { [$_->{followers_count}, $_->{name}, $_->{description}] } @official | |
# warn join "\n", map { join "\t", @{$_} } sort { $b->[0] <=> $a->[0] } grep { $_->[1]=~m/([\x{3000}\s]+\w{1,2}){3,}/ } map { [$_->{followers_count}, $_->{name}] } @all | |
# Fetch and store the profile of every seed id that is not stored yet.
#
# Seed ids come from get_cached_ids(), already-stored ids from
# get_crawled_ids(). The difference is handed to lookup_ids() with the
# per-id existence check disabled, since the filtering has been done here.
sub crawl_profile {
    my @seed_ids    = get_cached_ids();
    my @crawled_ids = get_crawled_ids();

    warn "seed: " . scalar @seed_ids;
    warn "crawled: " . scalar @crawled_ids;

    # Set of ids that are already stored, for O(1) membership tests.
    my %is_crawled;
    $is_crawled{$_} = 1 for @crawled_ids;

    my @pending;
    for my $seed_id (@seed_ids) {
        next if $is_crawled{$seed_id};
        push @pending, $seed_id;
    }

    lookup_ids(\@pending, 1);
}
# Collect seed ids: for every account selected by search_influencer(),
# fetch its friend ids through search_friends().
#
# Candidates are processed in descending order of their second field
# (friends_count in the rows built by search_influencer), with a 60 second
# pause after each one.
sub crawl_seed {
    my @candidates      = search_influencer();
    my @by_friends_desc = sort { $b->[1] <=> $a->[1] } @candidates;
    warn Dumper @by_friends_desc;

    foreach my $candidate (@by_friends_desc) {
        my $user_id = $candidate->[0];
        search_friends($user_id);
        sleep 60;
    }
}
# Script entry point. The seed-collection pass (crawl_seed) is left disabled
# here; only the profile crawl runs, after which the script exits.
# crawl_seed(); | |
crawl_profile(); | |
exit; | |
# Return the distinct values found in the "data.ids" arrays of the
# test.twitter_cache collection.
#
# In list context the ids are returned as a list; in scalar context the
# caller gets their count.
sub get_cached_ids {
    # Earlier variant, kept for reference:
    # my @all_ids = MongoDB->connect("mongodb://127.0.0.1")->ns("test.twitter_cache")->distinct("data.ids", {})->all;
    my $cache = MongoDB->connect("mongodb://127.0.0.1")->ns("test.twitter_cache");

    # Unwind every ids array, then group on the id so each value appears once.
    my @pipeline = (
        { '$unwind'  => '$data.ids' },
        { '$project' => { _id => 0, 'data.ids' => 1 } },
        { '$group'   => { _id => '$data.ids' } },
    );
    my $result = $cache->aggregate(\@pipeline, { allowDiskUse => true });

    my @ids;
    for my $group ($result->all) {
        push @ids, $group->{_id};
    }
    return @ids;
}
# Return the distinct "id_str" values of the documents stored in the
# test.twitter collection.
#
# In list context the ids are returned as a list; in scalar context the
# caller gets their count.
sub get_crawled_ids {
    # Earlier variant, kept for reference:
    # my @all_ids = MongoDB->connect("mongodb://127.0.0.1")->ns("test.twitter")->distinct("id_str", {})->all;
    my $profiles = MongoDB->connect("mongodb://127.0.0.1")->ns("test.twitter");

    # Keep only id_str, then group on it so each value appears once.
    my @pipeline = (
        { '$project' => { _id => 0, 'id_str' => 1 } },
        { '$group'   => { _id => '$id_str' } },
    );
    my $result = $profiles->aggregate(\@pipeline, { allowDiskUse => true });

    my @ids;
    for my $group ($result->all) {
        push @ids, $group->{_id};
    }
    return @ids;
}
# Select stored profiles that qualify as crawl seeds.
#
# Every document of test.twitter is turned into a row
#   [id_str, friends_count, status time (via str2time), followers_count,
#    status lang, screen_name]
# and a row is kept when its status time is within the last 60 days, its
# friends_count is strictly between 3000 and 10000, its followers_count is
# above 3000 and its status lang is "ja".
#
# Returns the list of matching rows (array refs).
sub search_influencer {
    my $profiles = MongoDB->connect("mongodb://127.0.0.1")->ns("test.twitter");
    my @docs     = $profiles->find({})->all;
    my $cutoff   = time - (60 * 60 * 24 * 60);    # 60 days ago, in epoch seconds

    my @rows;
    for my $doc (@docs) {
        push @rows, [
            $doc->{id_str},
            $doc->{friends_count},
            str2time($doc->{status}->{created_at}),
            $doc->{followers_count},
            $doc->{status}->{lang},
            $doc->{screen_name},
        ];
    }
    warn Dumper @rows;

    my @picked;
    for my $row (@rows) {
        next unless $row->[2] > $cutoff;
        next unless $row->[1] > 3000;
        next unless $row->[1] < 10000;
        next unless $row->[3] > 3000;
        next unless $row->[4] eq "ja";
        push @picked, $row;
    }
    return @picked;
}
# Fetch the friend ids of the fixed account "bulkneets" through the shared
# Twitter client, dump them to STDERR and return them (the "ids" field of
# the API response).
sub search_ids {
    my $response   = $nt->friends_ids({ screen_name => "bulkneets" });
    my $friend_ids = $response->{ids};
    warn Dumper $friend_ids;
    return $friend_ids;
}
# Return the friend ids of the user with the given id.
#
# The response is cached under the key "friends_<id>": a cached response is
# returned without contacting the API. Otherwise the API is queried, the
# whole response is cached, and the sub pauses 5 seconds before returning.
#
# $target_id - user id passed to friends_ids as user_id
# Returns the "ids" field of the (cached or fresh) response.
sub search_friends {
    my ($target_id) = @_;
    my $cache_key = "friends_$target_id";

    my $cached = get_cache($cache_key);
    if ($cached) {
        return $cached->{ids};
    }

    my $fresh = $nt->friends_ids({ user_id => $target_id });
    set_cache($cache_key, $fresh);
    sleep 5;
    return $fresh->{ids};
}
# Look up a cached payload in test.twitter_cache.
#
# $key - value matched against the document's cache_key field
#
# Returns the "data" field of the matching document, or nothing (undef in
# scalar context) when no document matches. Both the lookup and a hit are
# logged to STDERR.
sub get_cache {
    my ($key) = @_;
    my $db = MongoDB->connect("mongodb://127.0.0.1")->ns("test.twitter_cache");

    # Only one document is needed, so ask the server for a single match
    # instead of materialising the whole result set.
    my $data = $db->find_one({ cache_key => $key });
    warn "get cache for $key";

    # Explicit miss: do not rely on the value of a trailing "if".
    return unless $data;

    warn "hit cache for $key";
    return $data->{data};
}
# Store a payload in test.twitter_cache under the given key.
#
# The document {cache_key, data} replaces an existing document with the
# same cache_key, or is inserted when there is none (upsert).
#
# $key  - cache key
# $data - payload to store
sub set_cache {
    my $key  = shift;
    my $data = shift;

    my $cache = MongoDB->connect("mongodb://127.0.0.1")->ns("test.twitter_cache");
    warn "set cache for $key";

    my %filter   = (cache_key => $key);
    my %document = (cache_key => $key, data => $data);
    $cache->replace_one(\%filter, \%document, { upsert => 1 });
}
# Fetch and store the profiles for a list of user ids, 100 ids per request.
#
# $ids        - array ref of user ids
# $skip_check - when false, every id is first checked with lookup_profile()
#               and skipped (with an "exists" log line) if it is already
#               stored; when true, all ids are looked up
#
# Each full batch is sent through bulk_lookup() and the result handed to
# save_profiles(). A trailing partial batch is flushed the same way; when
# nothing is pending no lookup is issued at all.
#
# Returns nothing; the callers in this script ignore the result.
sub lookup_ids {
    my ($ids, $skip_check) = @_;

    my @batch;

    # Single place that sends the pending ids and stores what comes back.
    # bulk_lookup() is deliberately called in scalar context.
    my $flush = sub {
        my $res = bulk_lookup([@batch]);
        save_profiles($res);
        @batch = ();
    };

    for my $id (@$ids) {
        if (!$skip_check) {
            my $user = lookup_profile($id);
            if ($user) {
                warn "exists $id";
                next;
            }
        }

        push @batch, $id;
        $flush->() if @batch >= 100;
    }

    # Flush the remainder only when there is one.
    $flush->() if @batch;

    return;
}
# Look up the profiles for one batch of user ids through the shared client.
#
# $id_to_lookup - array ref of user ids (the caller sends at most 100)
#
# Returns the lookup_users result for the batch (evaluated in scalar
# context), or an empty array ref when the batch is empty or when the API
# answers 404, which users/lookup does when none of the requested ids can be
# resolved. Any other error is re-thrown unchanged.
sub bulk_lookup {
    my $id_to_lookup = shift;

    # Nothing to ask for: skip the throttle delay and the request.
    return [] unless $id_to_lookup && @$id_to_lookup;

    sleep 1;    # crude throttle between API calls
    warn "lookup users ..";

    my $users;
    my $ok = eval {
        $users = $nt->lookup_users({ user_id => $id_to_lookup });
        1;
    };
    return $users if $ok;

    my $err = $@;
    require Scalar::Util;
    if (Scalar::Util::blessed($err) && $err->can('code') && $err->code == 404) {
        # A batch made only of ids that no longer resolve must not abort
        # the whole crawl.
        warn "lookup users: no user found for this batch";
        return [];
    }
    die $err;
}
# Tell whether a profile with the given id is already stored in
# test.twitter.
#
# $id - user id; compared as a string against the id_str field
#
# Returns 1 when at least one matching document exists, 0 otherwise.
sub lookup_profile {
    my ($id) = @_;
    my $db = MongoDB->connect("mongodb://127.0.0.1")->ns("test.twitter");

    # Existence is all the caller needs: fetch a single match and only its
    # _id rather than every full document.
    my $doc = $db->find_one({ id_str => "$id" }, { _id => 1 });

    return $doc ? 1 : 0;
}
# Insert every profile of the given batch into test.twitter, one document
# per insert, logging each insert to STDERR.
#
# $users - array ref of profile documents
sub save_profiles {
    my ($users) = @_;
    my $profiles = MongoDB->connect("mongodb://127.0.0.1")->ns("test.twitter");

    foreach my $profile (@{$users}) {
        warn "insert to mongo";
        # warn Dumper $profile;
        $profiles->insert_one($profile);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment