Skip to content

Instantly share code, notes, and snippets.

@evandhoffman
Created March 8, 2012 22:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save evandhoffman/2003988 to your computer and use it in GitHub Desktop.
Save evandhoffman/2003988 to your computer and use it in GitHub Desktop.
spidercounter.pl
# The nginx log format fed into the script is this:
log_format cachemiss '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for" "$upstream_cache_status"';
#!/usr/bin/perl
# Tally the upstream cache status of access-log lines per crawler ("engine").
#
# Lines are read via <> (files named on the command line, or STDIN).  A
# report is printed every $interval lines and once more at end of input.
#
# Each line is split on single spaces with a CSV parser so that quoted
# fields stay whole.  Resulting field positions for the expected log format:
#   0 remote_addr    1 "-"    2 remote_user    3-4 [time_local]
#   5 request        6 status 7 body_bytes_sent 8 referer
#   9 user_agent    10 x_forwarded_for         11 upstream_cache_status
use strict;
use warnings;
use Text::CSV_XS;
use Time::HiRes qw/gettimeofday/;

my $csv = Text::CSV_XS->new({ binary => 1, sep_char => ' ' })
    or die "Cannot use CSV: " . Text::CSV_XS->error_diag();

# Field positions actually used by this script (see layout above).
my $user_agent   = 9;
my $cache_status = 11;

# Number of parsed lines between two reports.
my $interval = 10000;

# Cache status values that get a column in the report.  Values not listed
# here are added the first time they are seen in the input.
my %cache_statuses = map { ($_ => 1) } qw/HIT MISS EXPIRED - BYPASS/;

# Substrings that identify a crawler in the User-Agent header.  Order
# matters: when two names could match at the same position the earlier
# one wins.
my @engine_names = map { lc $_ } (
    qw( Searcharoo spinn3r majestic12.co.uk ezooms Gigabot baidu Googlebot
        msnbot slurp exabot pipl 80legs bingbot ahrefs yandexbot discobot
        pingdom websitepulse wget curl mediapartners-google sosospider sogou
        bingpreview ),
    'google web preview',
);

# Human-readable form, printed once at start-up.
my $engine_regex = join '|', @engine_names;

# Compiled matcher.  Names are escaped so that e.g. the dots in
# "majestic12.co.uk" only match literal dots.
my $engine_re = do {
    my $alternation = join '|', map { quotemeta $_ } @engine_names;
    qr/($alternation)/i;
};

my %bots;            # engine => { count => N, by_status => { status => N } }
my %status_total;    # status => N over all engines
my $line                   = 0;    # successfully parsed lines
my $skipped                = 0;    # lines the CSV parser rejected
my $max_engine_name_length = 0;    # widest engine name, for column layout
my $last_time              = gettimeofday();

# Print the per-engine table followed by the overall cache-status totals
# for everything counted so far.
sub print_report {
    print "------- RESULTS -----------\n";
    print "Total: $line\n";
    print "Skipped (unparseable): $skipped\n" if $skipped;

    my @statuses = sort keys %cache_statuses;

    # The header cell is as wide as the name + count + percentage prefix
    # of a data row (name width + 1 + 8 + 7 + "% ").
    printf('%' . ($max_engine_name_length + 18) . 's', 'header');
    printf('| %18s', $_) for @statuses;
    print "\n";

    for my $bot (sort keys %bots) {
        my $cnt = $bots{$bot}{count};
        printf('%' . ($max_engine_name_length + 1) . "s%8d%7.2f%% ",
            $bot, $cnt, $cnt * 100 / $line);
        for my $status (@statuses) {
            # An engine may never have produced this status.
            my $hitcnt = $bots{$bot}{by_status}{$status} || 0;
            printf('|%8d (%6.2f%%) ', $hitcnt, $hitcnt * 100 / $cnt);
        }
        print "\n";
    }

    print "--------- Cache Stats --------------\n";
    for my $status (sort keys %status_total) {
        my $total = $status_total{$status};
        printf("%10s\t%7d\t%8.2f%%\n", $status, $total, $total * 100 / $line);
    }
    return;
}

print "Engine regex: $engine_regex\n";

while (my $record = <>) {
    chomp $record;

    # Do not count lines the parser cannot split; fields() would not
    # describe this line.
    unless ($csv->parse($record)) {
        $skipped++;
        next;
    }
    my @rows = $csv->fields();

    # Short lines have no such columns; treat them as empty strings.
    my $agent = defined $rows[$user_agent]   ? $rows[$user_agent]   : '';
    my $cache = defined $rows[$cache_status] ? $rows[$cache_status] : '';

    my $engine = $agent =~ $engine_re ? lc($1) : '(no engine)';
    if (length($engine) > $max_engine_name_length) {
        $max_engine_name_length = length($engine);
    }

    # Every line counts exactly once towards the engine total, the
    # engine's per-status counter and the overall status counter, so the
    # status columns of a row always add up to its total.
    $cache_statuses{$cache} = 1;
    $bots{$engine}{count}++;
    $bots{$engine}{by_status}{$cache}++;
    $status_total{$cache}++;

    $line++;
    if ($line % $interval == 0) {
        my $now     = gettimeofday();
        my $elapsed = $now - $last_time;
        $last_time = $now;
        # Guard against a zero interval on coarse clocks.
        my $rate = $elapsed > 0 ? $interval / $elapsed : 0;
        print "line $line, rate: $rate\n";
        print_report();
    }
}

# Finite input: report the tail that did not fill a whole interval.
print_report() if $line && $line % $interval;
@evandhoffman
Copy link
Author

tail -f /var/log/nginx/access_log | perl spidercounter.pl

... will show running totals of spiders as the data appear.

@evandhoffman
Copy link
Author

I know it looks beastly, but this lets you get stats on cache hits/misses for each engine. Below you can see that there's an overall 75% miss rate. But if you look at the "(no engine)" row, there's actually a 77% hit rate, meaning actual users are benefiting from the cache even though the crawlers are still taxing the origin.

line 5320000, rate: 24517.4979935572
------- RESULTS -----------
Total: 5320000
                                header|                  -|             BYPASS|            EXPIRED|                HIT|               MISS
          (no engine)  655614  12.32% |   24603 (  3.75%) |      29 (  0.00%) |    5835 (  0.89%) |  554490 ( 84.58%) |   70656 ( 10.78%) 
               80legs  130979   2.46% |       0 (  0.00%) |       0 (  0.00%) |       0 (  0.00%) |   31239 ( 23.85%) |   99739 ( 76.15%) 
               ahrefs   23412   0.44% |       0 (  0.00%) |       0 (  0.00%) |       0 (  0.00%) |      31 (  0.13%) |   23380 ( 99.86%) 
                baidu  292963   5.51% |       0 (  0.00%) |       0 (  0.00%) |      19 (  0.01%) |     449 (  0.15%) |  292494 ( 99.84%) 
              bingbot   96163   1.81% |       9 (  0.01%) |       0 (  0.00%) |       0 (  0.00%) |    1729 (  1.80%) |   94424 ( 98.19%) 
          bingpreview   22901   0.43% |       1 (  0.00%) |       0 (  0.00%) |       0 (  0.00%) |   18461 ( 80.61%) |    4438 ( 19.38%) 
                 curl      13   0.00% |       0 (  0.00%) |       0 (  0.00%) |       0 (  0.00%) |       8 ( 61.54%) |       4 ( 30.77%) 
             discobot     162   0.00% |       0 (  0.00%) |       0 (  0.00%) |       0 (  0.00%) |       0 (  0.00%) |     161 ( 99.38%) 
               exabot    3664   0.07% |       0 (  0.00%) |       0 (  0.00%) |       0 (  0.00%) |      10 (  0.27%) |    3653 ( 99.70%) 
               ezooms    1351   0.03% |       0 (  0.00%) |       0 (  0.00%) |       0 (  0.00%) |      33 (  2.44%) |    1317 ( 97.48%) 
              gigabot       3   0.00% |       0 (  0.00%) |       0 (  0.00%) |       0 (  0.00%) |       0 (  0.00%) |       2 ( 66.67%) 
   google web preview   10676   0.20% |       0 (  0.00%) |       0 (  0.00%) |       8 (  0.07%) |    3216 ( 30.12%) |    7451 ( 69.79%) 
            googlebot 1587595  29.84% |       0 (  0.00%) |       0 (  0.00%) |      10 (  0.00%) |    1071 (  0.07%) | 1586513 ( 99.93%) 
     majestic12.co.uk    1179   0.02% |       3 (  0.25%) |       0 (  0.00%) |       0 (  0.00%) |      52 (  4.41%) |    1123 ( 95.25%) 
 mediapartners-google   23915   0.45% |       0 (  0.00%) |       0 (  0.00%) |       0 (  0.00%) |   12719 ( 53.18%) |   11195 ( 46.81%) 
               msnbot 2341213  44.01% |       0 (  0.00%) |       0 (  0.00%) |       2 (  0.00%) |    1410 (  0.06%) | 2339800 ( 99.94%) 
              pingdom    1437   0.03% |       0 (  0.00%) |       0 (  0.00%) |       1 (  0.07%) |    1435 ( 99.86%) |       0 (  0.00%) 
                 pipl   40437   0.76% |       0 (  0.00%) |       0 (  0.00%) |       0 (  0.00%) |     458 (  1.13%) |   39978 ( 98.86%) 
           searcharoo      39   0.00% |       0 (  0.00%) |       0 (  0.00%) |       0 (  0.00%) |       2 (  5.13%) |      36 ( 92.31%) 
                slurp    2376   0.04% |       0 (  0.00%) |       0 (  0.00%) |      77 (  3.24%) |    2081 ( 87.58%) |     217 (  9.13%) 
                sogou    7479   0.14% |       0 (  0.00%) |       0 (  0.00%) |       2 (  0.03%) |      23 (  0.31%) |    7453 ( 99.65%) 
           sosospider    1091   0.02% |       0 (  0.00%) |       0 (  0.00%) |       0 (  0.00%) |      36 (  3.30%) |    1054 ( 96.61%) 
              spinn3r      94   0.00% |       0 (  0.00%) |       0 (  0.00%) |       0 (  0.00%) |      47 ( 50.00%) |      46 ( 48.94%) 
         websitepulse    3841   0.07% |       0 (  0.00%) |       0 (  0.00%) |       4 (  0.10%) |    3836 ( 99.87%) |       0 (  0.00%) 
                 wget   56497   1.06% |       0 (  0.00%) |       0 (  0.00%) |       0 (  0.00%) |     311 (  0.55%) |   56185 ( 99.45%) 
            yandexbot   14906   0.28% |       0 (  0.00%) |       0 (  0.00%) |       0 (  0.00%) |      57 (  0.38%) |   14848 ( 99.61%) 
--------- Cache Stats --------------
         -        24616    0.46%
    BYPASS           29    0.00%
   EXPIRED         5958    0.11%
       HIT       633208   11.90%
      MISS      4656189   87.52%

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment