Created
June 3, 2010 02:20
-
-
Save peterkeen/423346 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use strict; | |
use warnings; | |
use Try::Tiny; | |
use IO::File; | |
use JSON::XS qw/ decode_json /; | |
use File::Slurp qw/ read_file /; | |
use constant READ_TIMEOUT_SECONDS => 5; | |
# see http://apiwiki.twitter.com/Streaming-API-Documentation | |
# Command-line arguments, in order:
#   capture_hours - how long to capture, in hours
#   max_mb        - cap on the amount of captured data, in megabytes
#   output_file   - file that captured stream data is appended to
#   keywords      - value placed in the "track" parameter of the filter URL
die "usage: $0 <capture_hours> <max_mb> <output_file> <keywords>\n"
    unless @ARGV >= 4;

my $capture_hours = shift;
my $max_mb        = shift;
my $output_file   = shift;
my $keywords      = shift;

my $capture_seconds = $capture_hours * 3600;
my $end_time        = time() + $capture_seconds;

# The stream is read in chunks of up to 4096 bytes, so the size cap is
# expressed as a number of read rounds.
my $max_num_rounds = int(($max_mb * 1024 * 1024) / 4096);
my $current_round  = 0;

my $output_fh = IO::File->new($output_file, 'a+')
    or die "Cannot open $output_file for append: $!";

# Credentials are read from a JSON file in the home directory with a
# "default_username" entry and a "passwords" map keyed by username.
my $config_contents = read_file("$ENV{HOME}/.twpass");
my $config          = decode_json($config_contents);
my $username        = $config->{default_username};
die "No default_username found in $ENV{HOME}/.twpass\n"
    unless defined $username;
my $password = $config->{passwords}->{$username};
die "No password found for $username in $ENV{HOME}/.twpass\n"
    unless defined $password;

print STDERR "\nstarted at " . time() . " ending at $end_time, max rounds $max_num_rounds, tracking keywords $keywords\n";

# The command is run through the shell (it needs the 2>/dev/null redirect),
# so every interpolated value is single-quoted. Unquoted, the "?" in the URL
# is a glob character and the keywords/credentials could be word-split or
# interpreted by the shell. Keywords are NOT URL-encoded here.
my $shell_quote = sub {
    my $arg = shift;
    $arg =~ s/'/'\\''/g;
    return "'$arg'";
};
my $command = join(
    ' ',
    'curl', '-q',
    $shell_quote->("http://stream.twitter.com/1/statuses/filter.json?track=${keywords}"),
    $shell_quote->("-u${username}:${password}"),
    '2>/dev/null',
);
# Reads one chunk of up to 4096 bytes from $fh, giving up after
# READ_TIMEOUT_SECONDS seconds.
#
# Arguments:
#   $fh - filehandle to read from
#
# Returns the data read. The result is empty (false) at end of stream, and
# also after a read error, which is reported on STDERR via warn.
#
# Dies with a message starting "read timeout at " when the read does not
# complete in time.
sub capture_sample
{
    my $fh = shift;
    my $buf;
    my $bytes_read;
    my $read_error;

    my $completed = eval {
        local $SIG{ALRM} = sub { die "read timeout at " . time() . "\n" };
        alarm(READ_TIMEOUT_SECONDS);
        $bytes_read = sysread($fh, $buf, 4096);
        # Capture errno text immediately, before anything else can reset it.
        $read_error = "$!" unless defined $bytes_read;
        alarm 0;
        1;
    };

    if (!$completed) {
        my $error = $@ || "unknown error in capture_sample\n";
        # Make sure no alarm is left pending, whatever aborted the eval.
        alarm 0;
        die $error;
    }

    # sysread returns undef on error; surface it instead of letting it look
    # exactly like a normal end of stream.
    warn "read error on capture stream: $read_error\n"
        unless defined $bytes_read;

    return $buf;
}
# Runs $command and appends everything it outputs to $output_fh, one chunk
# at a time, until the stream ends, $end_time has passed, or
# $max_num_rounds chunks have been written in total (the round counter is
# shared across calls, so retries do not reset the size cap).
#
# Dies if the command cannot be started; a read timeout raised by
# capture_sample propagates to the caller.
sub capture_twitter_feed
{
    # Three-argument pipe open: the mode can never be taken from $command.
    # The command text is left out of the error because it holds credentials.
    open(my $fh, '-|', $command)
        or die "Cannot start capture command: $!\n";

    while (1) {
        my $buf = capture_sample($fh);

        # Stop on end of stream; a plain truth test would also stop on a
        # chunk that happens to be the single character "0".
        last unless defined $buf && length $buf;

        $output_fh->print($buf);

        # Count the chunk just written, so that at most $max_num_rounds
        # chunks (and never fewer than one) are written.
        last if $end_time < time() || ++$current_round >= $max_num_rounds;
    }

    close $fh;
}
# Main loop: capture until capture_twitter_feed returns normally, restarting
# the capture whenever a read times out. Any other error is fatal.
while (1) {
    try {
        capture_twitter_feed();
        print STDERR "finished\n";
        exit 0;
    }
    catch {
        # Only read timeouts are retried; everything else is re-thrown.
        die $_ unless $_ =~ "read timeout";
    };

    # Only reached after a read timeout. Without this check a stream that
    # stays silent would be retried forever, long past the capture window.
    if ($end_time < time()) {
        print STDERR "finished\n";
        exit 0;
    }

    print STDERR "read timeout. Retrying...\n";
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use strict; | |
use warnings; | |
use JSON::XS qw/ decode_json /; | |
use Try::Tiny; | |
# Mapper: reads carriage-return-terminated JSON records, and for the "text"
# field of each one emits every 1-, 2- and 3-word sequence as "words\t1".

# Records are separated by a carriage return.
$/ = "\r";

# decode_json() expects undecoded UTF-8 octets, so the input must be read
# raw; with a :utf8 input layer any record containing non-ASCII characters
# fails to decode and is silently dropped. Only the output is encoded.
binmode(STDIN, ':raw');
binmode(STDOUT, ':utf8');

while (my $record = <>) {
    # Best effort: fragments that are not valid JSON are skipped on purpose.
    my $obj = try { decode_json($record) } catch { undef };
    next unless ref $obj eq 'HASH';

    my $text = $obj->{text};
    next unless defined $text && length $text;

    # Keep only word characters, "#" and whitespace.
    $text =~ s/[^\w#\s]//g;

    my @w = split(/\s+/, lc $text);
    for my $i ( 0 .. $#w ) {
        # Slices that run past the end of @w contain undef entries;
        # print_if_all ignores incomplete n-grams.
        print_if_all(1, $w[$i]);
        print_if_all(2, @w[$i..$i+1]);
        print_if_all(3, @w[$i..$i+2]);
    }
}
# Prints one n-gram record, "word1 word2 ...\t1\n", to the currently
# selected output handle, but only when exactly $n usable words were given.
#
# Arguments:
#   $n    - required number of words
#   @rest - candidate words; undef and empty-string entries are discarded
#           before counting
#
# Words are tested for being defined and non-empty rather than for truth, so
# the word "0" is kept.
sub print_if_all
{
    my ($n, @candidates) = @_;
    my @words = grep { defined($_) && length($_) } @candidates;
    print join(' ', @words) . "\t1\n" if @words == $n;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use strict; | |
use warnings; | |
# Reducer: adds up the counts of tab-separated "key<TAB>count" input lines
# and prints one "key<TAB>total" line per distinct key.
my %sum;

binmode(STDIN, ':utf8');
binmode(STDOUT, ':utf8');

while (defined(my $line = <>)) {
    chomp $line;
    my ($word, $count) = split(/\t/, $line);
    $sum{$word} += $count;
}

# Keys are printed in whatever order the hash yields them.
for my $word (keys %sum) {
    print "$word\t$sum{$word}\n";
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment