Created
March 8, 2012 22:55
-
-
Save evandhoffman/2003988 to your computer and use it in GitHub Desktop.
spidercounter.pl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Logformat fed into the script is this:
log_format cachemiss '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" "$http_x_forwarded_for" "$upstream_cache_status"';
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
#
# spidercounter.pl - tally cache status per crawler from an access log.
#
# Reads log lines from the files named on the command line (or STDIN),
# splits each line on spaces (double-quoted strings stay together), and
# counts, per recognised crawler, how often each cache status occurred.
# A running report is printed every $interval lines, and once more at end
# of input for any remaining lines.
#
# Expected field layout (zero-based, after the space split):
#   0 remote_addr, 1 "-", 2 remote_user, 3-4 [time_local], 5 request,
#   6 status, 7 body_bytes_sent, 8 referer, 9 user agent,
#   10 x_forwarded_for, 11 cache status
use strict;
use warnings;

use Text::CSV_XS;
use Time::HiRes qw/gettimeofday/;

my $csv = Text::CSV_XS->new({ binary => 1, sep_char => ' ' })
    or die "Cannot use CSV: " . Text::CSV_XS->error_diag();

# Positions of the only two fields the script actually reads.
my $user_agent   = 9;
my $cache_status = 11;

my %total_for    = ();    # engine => number of requests seen
my %hits_for     = ();    # engine => { cache status => number of requests }
my %cache_status = ();    # cache status => number of requests (all engines)

# Columns shown in the per-engine table. Seeded so that the usual statuses
# always appear; any other status found in the log is added on the fly.
my %cache_statuses = qw/HIT 1 MISS 1 EXPIRED 1 - 1 BYPASS 1/;

my $line      = 0;                 # lines processed so far
my $last_time = gettimeofday();    # time of the previous report
my $interval  = 10000;             # lines between running reports

# Substrings (matched case-insensitively) that identify a crawler in the
# user-agent field. Each name is escaped so that characters such as "."
# are matched literally.
my @engine_names = (
    qw( Searcharoo spinn3r majestic12.co.uk ezooms Gigabot baidu Googlebot
        msnbot slurp exabot pipl 80legs bingbot ahrefs yandexbot discobot
        pingdom websitepulse wget curl mediapartners-google sosospider sogou
        bingpreview ),
    'google web preview',
);
my $engine_regex = lc join '|', map { quotemeta($_) } @engine_names;
my $engine_re    = qr/($engine_regex)/i;

my $max_engine_name_length = 0;

print "Engine regex: $engine_regex\n";

while (my $entry = <>) {
    chomp $entry;

    # A line that cannot be parsed yields no fields; it is still counted,
    # under "(no engine)" with an empty cache status.
    my @rows = $csv->parse($entry) ? $csv->fields() : ();

    my $agent = defined $rows[$user_agent]   ? $rows[$user_agent]   : '';
    my $cache = defined $rows[$cache_status] ? $rows[$cache_status] : '';

    my $engine = $agent =~ $engine_re ? lc($1) : '(no engine)';
    if (length($engine) > $max_engine_name_length) {
        $max_engine_name_length = length($engine);
    }

    # Every request counts towards its engine's total AND that engine's
    # bucket for this cache status, including the first request of an engine.
    $total_for{$engine}++;
    $hits_for{$engine}{$cache}++;
    $cache_statuses{$cache} = 1;    # make sure the status gets a column
    $cache_status{$cache}++;

    $line++;
    print_report($interval) if $line % $interval == 0;
}

# Report whatever arrived after the last full interval (or the whole input
# when it was shorter than one interval).
print_report($line % $interval) if $line % $interval;

# Print the running totals: throughput since the previous report, one row
# per engine with a column per cache status, then overall cache statistics.
# $lines_in_batch is the number of lines processed since the last report.
sub print_report {
    my ($lines_in_batch) = @_;

    my $now     = gettimeofday();
    my $elapsed = $now - $last_time;
    $last_time = $now;

    # Guard against a zero interval on very fast or very short input.
    my $rate = $elapsed > 0 ? $lines_in_batch / $elapsed : 0;

    print "line $line, rate: $rate\n";
    print "------- RESULTS -----------\n";
    print "Total: $line\n";

    my @statuses = sort keys %cache_statuses;

    # The header cell is as wide as the name/count/percent prefix of a row.
    my $header_format = '%' . ($max_engine_name_length + 18) . 's';
    printf $header_format, 'header';
    for my $status (@statuses) {
        printf '| %18s', $status;
    }
    print "\n";

    my $row_format = '%' . ($max_engine_name_length + 1) . "s%8d%7.2f%% ";
    for my $bot (sort keys %total_for) {
        my $cnt = $total_for{$bot};
        printf $row_format, $bot, $cnt, $cnt * 100 / $line;
        for my $status (@statuses) {
            # An engine may never have produced this status.
            my $hitcnt = $hits_for{$bot}{$status} || 0;
            printf '|%8d (%6.2f%%) ', $hitcnt, $hitcnt * 100 / $cnt;
        }
        print "\n";
    }

    print "--------- Cache Stats --------------\n";
    for my $status (sort keys %cache_status) {
        my $pct = $cache_status{$status} * 100 / $line;
        printf "%10s\t%7d\t%8.2f%%\n", $status, $cache_status{$status}, $pct;
    }
    return;
}
I know it looks beastly, but this lets you get stats on cache hits/misses for each engine. Below you can see that there's an overall 75% miss rate. But if you look at the "(no engine)" row, there's actually a 77% hit rate, meaning actual users are benefiting from the cache even though the crawlers are still taxing the origin.
line 5320000, rate: 24517.4979935572 ------- RESULTS ----------- Total: 5320000 header| -| BYPASS| EXPIRED| HIT| MISS (no engine) 655614 12.32% | 24603 ( 3.75%) | 29 ( 0.00%) | 5835 ( 0.89%) | 554490 ( 84.58%) | 70656 ( 10.78%) 80legs 130979 2.46% | 0 ( 0.00%) | 0 ( 0.00%) | 0 ( 0.00%) | 31239 ( 23.85%) | 99739 ( 76.15%) ahrefs 23412 0.44% | 0 ( 0.00%) | 0 ( 0.00%) | 0 ( 0.00%) | 31 ( 0.13%) | 23380 ( 99.86%) baidu 292963 5.51% | 0 ( 0.00%) | 0 ( 0.00%) | 19 ( 0.01%) | 449 ( 0.15%) | 292494 ( 99.84%) bingbot 96163 1.81% | 9 ( 0.01%) | 0 ( 0.00%) | 0 ( 0.00%) | 1729 ( 1.80%) | 94424 ( 98.19%) bingpreview 22901 0.43% | 1 ( 0.00%) | 0 ( 0.00%) | 0 ( 0.00%) | 18461 ( 80.61%) | 4438 ( 19.38%) curl 13 0.00% | 0 ( 0.00%) | 0 ( 0.00%) | 0 ( 0.00%) | 8 ( 61.54%) | 4 ( 30.77%) discobot 162 0.00% | 0 ( 0.00%) | 0 ( 0.00%) | 0 ( 0.00%) | 0 ( 0.00%) | 161 ( 99.38%) exabot 3664 0.07% | 0 ( 0.00%) | 0 ( 0.00%) | 0 ( 0.00%) | 10 ( 0.27%) | 3653 ( 99.70%) ezooms 1351 0.03% | 0 ( 0.00%) | 0 ( 0.00%) | 0 ( 0.00%) | 33 ( 2.44%) | 1317 ( 97.48%) gigabot 3 0.00% | 0 ( 0.00%) | 0 ( 0.00%) | 0 ( 0.00%) | 0 ( 0.00%) | 2 ( 66.67%) google web preview 10676 0.20% | 0 ( 0.00%) | 0 ( 0.00%) | 8 ( 0.07%) | 3216 ( 30.12%) | 7451 ( 69.79%) googlebot 1587595 29.84% | 0 ( 0.00%) | 0 ( 0.00%) | 10 ( 0.00%) | 1071 ( 0.07%) | 1586513 ( 99.93%) majestic12.co.uk 1179 0.02% | 3 ( 0.25%) | 0 ( 0.00%) | 0 ( 0.00%) | 52 ( 4.41%) | 1123 ( 95.25%) mediapartners-google 23915 0.45% | 0 ( 0.00%) | 0 ( 0.00%) | 0 ( 0.00%) | 12719 ( 53.18%) | 11195 ( 46.81%) msnbot 2341213 44.01% | 0 ( 0.00%) | 0 ( 0.00%) | 2 ( 0.00%) | 1410 ( 0.06%) | 2339800 ( 99.94%) pingdom 1437 0.03% | 0 ( 0.00%) | 0 ( 0.00%) | 1 ( 0.07%) | 1435 ( 99.86%) | 0 ( 0.00%) pipl 40437 0.76% | 0 ( 0.00%) | 0 ( 0.00%) | 0 ( 0.00%) | 458 ( 1.13%) | 39978 ( 98.86%) searcharoo 39 0.00% | 0 ( 0.00%) | 0 ( 0.00%) | 0 ( 0.00%) | 2 ( 5.13%) | 36 ( 92.31%) slurp 2376 0.04% | 0 ( 0.00%) | 0 ( 0.00%) | 77 ( 3.24%) | 2081 ( 87.58%) | 217 ( 9.13%) sogou 7479 
0.14% | 0 ( 0.00%) | 0 ( 0.00%) | 2 ( 0.03%) | 23 ( 0.31%) | 7453 ( 99.65%) sosospider 1091 0.02% | 0 ( 0.00%) | 0 ( 0.00%) | 0 ( 0.00%) | 36 ( 3.30%) | 1054 ( 96.61%) spinn3r 94 0.00% | 0 ( 0.00%) | 0 ( 0.00%) | 0 ( 0.00%) | 47 ( 50.00%) | 46 ( 48.94%) websitepulse 3841 0.07% | 0 ( 0.00%) | 0 ( 0.00%) | 4 ( 0.10%) | 3836 ( 99.87%) | 0 ( 0.00%) wget 56497 1.06% | 0 ( 0.00%) | 0 ( 0.00%) | 0 ( 0.00%) | 311 ( 0.55%) | 56185 ( 99.45%) yandexbot 14906 0.28% | 0 ( 0.00%) | 0 ( 0.00%) | 0 ( 0.00%) | 57 ( 0.38%) | 14848 ( 99.61%) --------- Cache Stats -------------- - 24616 0.46% BYPASS 29 0.00% EXPIRED 5958 0.11% HIT 633208 11.90% MISS 4656189 87.52%
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
... will show running totals of spiders as the data appear.