Skip to content

Instantly share code, notes, and snippets.

@mala
Created May 10, 2020 14:21
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mala/985e5b6404ebb75011544a75ac902d90 to your computer and use it in GitHub Desktop.
Save mala/985e5b6404ebb75011544a75ac902d90 to your computer and use it in GitHub Desktop.
一個前の記事用に書いた Twitter Profile 収集するやつ
use strict;
use Net::Twitter;
use MongoDB;
# Twitter client using the REST v1.1 API trait; the API-calling subs below
# (search_ids, search_friends, bulk_lookup) all go through this object.
# NOTE(review): the four "xxx" values look like placeholders for real
# application credentials -- confirm before running.
my $nt = Net::Twitter->new(
    traits              => ['API::RESTv1_1'],
    consumer_key        => 'xxx',
    consumer_secret     => 'xxx',
    access_token        => 'xxx',
    access_token_secret => 'xxx',
);
use Data::Dumper;
use Date::Parse;
use List::Util qw(uniqstr);
use boolean;
# @normal = MongoDB->connect("mongodb://127.0.0.1")->ns("test.twitter")->find({verified => false, "status.lang" => "ja"})->all; scalar @normal;
# scalar map { join "\t", @{$_} } sort { $b->[0] <=> $a->[0] } grep { $tmp = $_->[1]; $tmp=~m/([\x{3000}\x{2800}_\x{ff3f}\s]+\w{1}){3,}/ } map { [$_->{followers_count}, $_->{name}, $_->{description}] } @official
# warn join "\n", map { join "\t", @{$_} } sort { $b->[0] <=> $a->[0] } grep { $_->[1]=~m/([\x{3000}\s]+\w{1,2}){3,}/ } map { [$_->{followers_count}, $_->{name}] } @all
# Fetch and store profiles for every cached id that has not been stored yet.
#
# Takes the distinct ids from get_cached_ids(), removes the ones reported by
# get_crawled_ids(), and passes the remainder to lookup_ids() with the
# per-id existence check switched off (second argument true).
# Returns whatever lookup_ids() returns.
sub crawl_profile {
    my @seed_ids    = get_cached_ids();
    my @crawled_ids = get_crawled_ids();

    warn "seed: " . scalar @seed_ids;
    warn "crawled: " . scalar @crawled_ids;

    # Set of already-stored ids for constant-time membership tests.
    my %is_crawled;
    @is_crawled{@crawled_ids} = ();

    my @pending = grep { !exists $is_crawled{$_} } @seed_ids;

    lookup_ids([@pending], 1);
}
# Collect the friend lists of the accounts chosen by search_influencer().
#
# Candidates are processed in descending order of their second field
# (friends_count as built by search_influencer). search_friends() is called
# for its caching side effect; its return value is ignored here.
sub crawl_seed {
    my @candidates      = search_influencer();
    my @by_friends_desc = sort { $b->[1] <=> $a->[1] } @candidates;
    warn Dumper @by_friends_desc;

    foreach my $candidate (@by_friends_desc) {
        my $user_id = $candidate->[0];
        search_friends($user_id);

        # Pause a full minute between accounts, even on a cache hit.
        # NOTE(review): presumably to stay under the API rate limit -- confirm.
        sleep 60;
    }
}
# Script entry point. The seeding pass is currently disabled; only the
# profile crawl runs. Re-enable the next line to collect friend lists first.
# crawl_seed();
crawl_profile();

# Finish with a success status; everything below is sub definitions only.
exit 0;
# Return the distinct values found in the "data.ids" arrays of every
# document in the test.twitter_cache collection.
#
# In list context the ids themselves are returned; in scalar context the
# result is the number of distinct ids (map in scalar context).
sub get_cached_ids {
    my $cache = MongoDB->connect("mongodb://127.0.0.1")->ns("test.twitter_cache");

    # Earlier variant kept for reference:
    # my @all_ids = MongoDB->connect("mongodb://127.0.0.1")->ns("test.twitter_cache")->distinct("data.ids", {})->all;
    my @pipeline = (
        { '$unwind'  => '$data.ids' },                      # one document per id
        { '$project' => { _id => 0, 'data.ids' => 1 } },    # keep only the id
        { '$group'   => { _id => '$data.ids' } },           # de-duplicate
    );

    my $cursor = $cache->aggregate(\@pipeline, { allowDiskUse => true });

    return map { $_->{_id} } $cursor->all;
}
# Return the distinct "id_str" values of the documents stored in the
# test.twitter collection.
#
# In list context the id strings are returned; in scalar context the result
# is the number of distinct values (map in scalar context).
sub get_crawled_ids {
    my $profiles = MongoDB->connect("mongodb://127.0.0.1")->ns("test.twitter");

    # Earlier variant kept for reference:
    # my @all_ids = MongoDB->connect("mongodb://127.0.0.1")->ns("test.twitter")->distinct("id_str", {})->all;
    my @pipeline = (
        { '$project' => { _id => 0, 'id_str' => 1 } },    # keep only id_str
        { '$group'   => { _id => '$id_str' } },           # de-duplicate
    );

    my $cursor = $profiles->aggregate(\@pipeline, { allowDiskUse => true });

    return map { $_->{_id} } $cursor->all;
}
# Pick seed accounts out of the stored profiles in test.twitter.
#
# Every stored profile is reduced to a row
#   [id_str, friends_count, last status time (epoch), followers_count,
#    status lang, screen_name]
# and a row is kept when all of the following hold:
#   - the last status is newer than 60 days ago,
#   - friends_count is strictly between 3000 and 10000,
#   - followers_count is above 3000,
#   - the status language is "ja".
# Returns the list of matching rows (array refs).
sub search_influencer {
    my $profiles = MongoDB->connect("mongodb://127.0.0.1")->ns("test.twitter");
    my @users    = $profiles->find({})->all;

    # Cut-off timestamp: 60 days before now, in seconds.
    my $cutoff = time - (60 * 60 * 24 * 60);

    my @rows;
    for my $user (@users) {
        push @rows, [
            $user->{id_str},
            $user->{friends_count},
            str2time($user->{status}->{created_at}),
            $user->{followers_count},
            $user->{status}->{lang},
            $user->{screen_name},
        ];
    }
    warn Dumper @rows;

    my @selected;
    for my $row (@rows) {
        next unless $row->[2] > $cutoff;
        next unless $row->[1] > 3000;
        next unless $row->[1] < 10000;
        next unless $row->[3] > 3000;
        next unless $row->[4] eq "ja";
        push @selected, $row;
    }

    return @selected;
}
# Fetch the friend ids of one account by screen name.
#
# Parameters:
#   $screen_name - optional; the account to query. Defaults to "bulkneets",
#                  the value that used to be hard-coded, so existing
#                  zero-argument calls behave as before.
# Returns the "ids" entry of the API response (also dumped to STDERR).
sub search_ids {
    my ($screen_name) = @_;
    $screen_name = "bulkneets" unless defined $screen_name;

    my $res = $nt->friends_ids({ screen_name => $screen_name });
    warn Dumper $res->{ids};
    return $res->{ids};
}
# Return the friend ids of the user with the given numeric id, going
# through the cache first.
#
# On a cache hit (key "friends_<id>") the cached "ids" entry is returned
# immediately. Otherwise the ids are fetched from the API, the whole
# response is written to the cache, and the sub sleeps 5 seconds before
# returning the "ids" entry.
sub search_friends {
    my ($target_id) = @_;
    my $cache_key = "friends_$target_id";

    my $cached = get_cache($cache_key);
    if ($cached) {
        return $cached->{ids};
    }

    my $fetched = $nt->friends_ids({ user_id => $target_id });
    set_cache($cache_key, $fetched);

    # Pause only after a real API call; cache hits return without waiting.
    sleep 5;

    return $fetched->{ids};
}
# Look up a cached value in test.twitter_cache.
#
# Parameters:
#   $key - value matched against the "cache_key" field.
# Returns the "data" field of the matching document, or nothing (undef in
# scalar context) when no document matches.
sub get_cache {
    my ($key) = @_;
    my $db = MongoDB->connect("mongodb://127.0.0.1")->ns("test.twitter_cache");

    warn "get cache for $key";

    # find_one fetches a single document instead of materialising every
    # match just to use the first one.
    my $data = $db->find_one({ cache_key => $key });

    # Explicit miss: do not rely on the value of a fallen-through `if`.
    return unless $data;

    warn "hit cache for $key";
    return $data->{data};
}
# Store a value in test.twitter_cache under the given key.
#
# Parameters:
#   $key  - stored in the "cache_key" field and used as the match filter.
#   $data - stored in the "data" field.
# An existing document with the same cache_key is replaced; otherwise a new
# one is inserted (upsert). Returns the result of replace_one.
sub set_cache {
    my ($key, $data) = @_;

    my $cache = MongoDB->connect("mongodb://127.0.0.1")->ns("test.twitter_cache");
    my %entry = (cache_key => $key, data => $data);

    warn "set cache for $key";
    $cache->replace_one({ cache_key => $key }, \%entry, { upsert => 1 });
}
# Look up and store profiles for a list of user ids, in batches of 100.
#
# Parameters:
#   $ids        - array ref of user ids.
#   $skip_check - when false, each id is first checked with lookup_profile()
#                 and skipped (with an "exists" warning) if already stored;
#                 when true, every id is looked up.
# Each full batch is sent to bulk_lookup() and the result handed to
# save_profiles(); whatever remains after the loop (possibly nothing) is
# flushed the same way. Returns an empty array ref.
sub lookup_ids {
    my ($ids, $skip_check) = @_;
    my @batch;

    foreach my $id (@{$ids}) {
        unless ($skip_check) {
            if (lookup_profile($id)) {
                warn "exists $id";
                next;
            }
        }

        push @batch, $id;
        next if @batch < 100;

        # Batch is full: look it up, persist it, start a new one.
        # bulk_lookup is deliberately called in scalar context.
        my $profiles = bulk_lookup([@batch]);
        save_profiles($profiles);
        @batch = ();
    }

    # Final flush of the partial batch.
    my $profiles = bulk_lookup([@batch]);
    save_profiles($profiles);

    return [];
}
# Look up the profiles of several users in one API call.
#
# Parameters:
#   $id_to_lookup - array ref of user ids.
# Returns the result of the lookup_users API call, or an empty array ref
# when there is nothing to look up.
#
# An empty or missing id list returns immediately: no one-second pause and
# no "lookup users .." log line for a request that is never sent.
sub bulk_lookup {
    my ($id_to_lookup) = @_;

    return [] unless $id_to_lookup && @{$id_to_lookup};

    # Throttle real requests to one per second.
    sleep 1;
    warn "lookup users ..";

    return $nt->lookup_users({ user_id => $id_to_lookup });
}
# Fetch one stored profile from test.twitter by user id.
#
# Parameters:
#   $id - user id; it is stringified before being matched against "id_str".
# Returns the matching document (hash ref), or undef when none is stored.
#
# The previous form assigned find(...)->all to a scalar, which produced a
# count of matching documents rather than the user and pulled every match
# from the server. The result is still true exactly when a profile exists,
# so callers that only test truthiness are unaffected.
sub lookup_profile {
    my ($id) = @_;
    my $db = MongoDB->connect("mongodb://127.0.0.1")->ns("test.twitter");

    my $user = $db->find_one({ id_str => "$id" });
    return $user;
}
# Insert each profile into the test.twitter collection, one document per
# insert, logging "insert to mongo" before every write.
#
# Parameters:
#   $users - array ref of profile documents.
sub save_profiles {
    my ($users) = @_;
    my $profiles = MongoDB->connect("mongodb://127.0.0.1")->ns("test.twitter");

    foreach my $profile (@{$users}) {
        warn "insert to mongo";
        # warn Dumper $profile;
        $profiles->insert_one($profile);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment