June 1, 2001 - After getting comfortable with Perl by mid-2001, I tried my hand at some CGI programming (didn't someone say Perl was the glue of the web?). This utility basically aggregates statistics generated by http-analyze in one nice central place. This was part of some work I was doing at an internet provider that did, among other things, a …
#!/usr/bin/perl -w
# written from scratch by Alex Koralewski
# time spent (so far): approximately 12 Hours (whew!)
# this program collects domain statistics created by http-analyze
# and summarizes them into one pretty page.
use CGI qw/:standard/;
use CGI::Carp qw/fatalsToBrowser/;
use Time::HiRes qw/gettimeofday tv_interval/;
## use POSIX qw(strftime);
# new instance of the CGI class
my $query = new CGI;
# initialize date/time parameters and defaults #
# set up "firsts" parameters for dates: the earliest date the report offers.
# The year is an offset from 1900 and the month is zero-based (9 = October),
# the same units that localtime() returns below.
my $first_year = 99;
my $first_month = 9;
my $first_day = 1;
# calculate current date/time
# (localtime fields: [5] = years since 1900, [4] = month 0-11, [3] = day of month)
my @now = localtime(time);
my $curr_year = $now[5];
my $curr_month = $now[4];
my $curr_day = $now[3];
# localtime in scalar context yields a printable timestamp string
my $now_string = localtime(time);
## $now_string = strftime "%a %b %e %H:%M:%S %Y", localtime;
# initialize default parameters if not set #
my $target_mode; my $target_year; my $target_month;
my $target_day; my $target_top_num;
# when the request carries parameters, take every setting from the request...
if ($query->param()) {
$target_mode = $query->param('target_mode'); $target_year = $query->param('target_year');
$target_month = $query->param('target_month'); $target_day = $query->param('target_day');
$target_top_num = $query->param('target_top_num');
} else {
# ...otherwise default to the current date, mode 0 and a limit of 10
$target_mode = 0; $target_year = $curr_year;
$target_month = $curr_month; $target_day = $curr_day;
$target_top_num = 10;
# fall back to day 1 when no day value is defined
# NOTE(review): no closing brace for the else-branch appears in this copy of
# the file -- confirm against the complete source
if (!defined($target_day)) { $target_day = 1; }
# what we're looking for in the stats pages
# (default: the first line containing "Total")
my $target = "Total";
# a true $target_mode (labelled "Daily" below) switches the pattern to the
# table row whose bold cell holds the requested day number
# NOTE(review): the closing braces of this and several later if/foreach
# bodies are not present in this copy of the file -- confirm against the
# complete source
if ($target_mode) {
$target = "^<TR.*><TD ALIGN.+><B>$target_day</B>.+\$";
# save year as nnnn format, and month as nn format;
# ($target_year is an offset from 1900, $target_month is zero-based)
my $the_year = 1900 + $target_year;
my $the_month = sprintf("%.2u",($target_month + 1));
# set up month-processing data
my %month_names = (0=>"January", 1=>"February", 2=>"March", 3=>"April",
4=>"May", 5=>"June", 6=>"July", 7=>"August",
8=>"September", 9=>"October", 10=>"November", 11=>"December");
# days per month, indexed like %month_names; February is fixed at 28, so
# leap years are not accounted for
@month_days = (31,28,31,30,31,30,31,31,30,31,30,31);
my $target_month_name = $month_names{$target_month};
$target_num_days = $month_days[$target_month];
# establish parameter ranges with hashes #
# each hash maps a form value to the label associated with it
my %target_mode_hash; my %target_year_hash; my %target_month_hash;
my %target_day_hash; my %target_top_num_hash;
%target_mode_hash = (0=>"Monthly", 1=>"Daily");
# years: every year from the first one up to the current one,
# keyed by the 1900-based offset with the four-digit year as value
my @target_year_array;
foreach $this_year ($first_year .. $curr_year) {
push(@target_year_array, $this_year); push(@target_year_array, ($this_year + 1900));
%target_year_hash = @target_year_array; undef(@target_year_array);
# months: the whole year, clipped to the first month in the first year
# and to the current month in the current year
my $from_month = 0, my $to_month = 11;
if ($target_year == $first_year) { $from_month = $first_month; }
if ($target_year == $curr_year) { $to_month = $curr_month; }
my @target_month_array;
foreach $this_month ($from_month .. $to_month) {
push(@target_month_array, $this_month); push(@target_month_array, $month_names{$this_month});
%target_month_hash = @target_month_array;
undef(@target_month_array); undef($from_month); undef($to_month);
# days: the whole target month, clipped the same way at both ends
my $from_day = 1; my $to_day = $month_days[$target_month];
if ($target_month == $first_month && $target_year == $first_year) { $from_day = $first_day; }
if ($target_month == $curr_month && $target_year == $curr_year) { $to_day = $curr_day; }
my @target_day_array;
foreach $this_day ($from_day .. $to_day) {
push(@target_day_array, $this_day); push(@target_day_array, $this_day);
%target_day_hash = @target_day_array;
undef(@target_day_array); undef($from_day); undef($to_day);
# list sizes on offer; 9999 is the value labelled "All"
%target_top_num_hash = (5=>"Top 5", 10=>"Top 10",
20=>"Top 20", 50=>"Top 50",
100=>"Top 100", 9999=>"All");
# establish parsing and traversing parameters #
# which line we'll want after our target is found
# (number of further lines read once the target line has matched)
my $pos_pageviews = 3;
# directory where domains' files reside
my $dir = "/usr/local/etc/httpd/sites";
# file containing stats with data at time requested
# (path relative to each domain directory: www<yyyy>/days<mm><yy>.html,
# e.g. htdocs/stats/www2001/days0601.html for June 2001)
my $file = "htdocs/stats/www${the_year}/days${the_month}".
sprintf("%.2u",($target_year % 100)). ".html";
# what the titles will be depends on the mode we're using
my $title_string = "$month_names{$target_month} $the_year";
if ($target_mode) { $title_string = "$month_names{$target_month} $target_day, $the_year"; }
$title_string = "Domain Statistics Summary for $target_top_num_hash{$target_top_num} of $title_string";
# other declarations
my @pageviews; # will be array (of arrays) to contain the pageviews information
my $sum = 0; # will hold the total number of pageviews found
my $total_domains = 0; # will hold the total number of domains found
my $processed_domains = 0; # will hold the number of processed domains
my $untouched_domains; # will hold the difference of the above two
my $line; # will hold the current read line
my $i; # will be a simple counter
# set up page and such.. and print out the results..
# Emits the HTTP header and the page heading, then walks every domain
# directory, pulls the pageviews figure out of its stats file and
# accumulates the totals.
# NOTE(review): parts of this section (the rest of the header call, the
# END_HEAD terminator, loop closing braces) are not present in this copy of
# the file -- confirm against the complete source
print $query->header,
-author=>'Alex Koralewski',
print <<END_HEAD;
<table width="50%" border="2" cellspacing="1" cellpadding="1">
<td height="24" bgcolor="#CCCCCC" align="center"><font color="#000000"><b>$title_string</b></font></td>
print "<font size=\"-1\">Now Processing Request<blink><b>...</b></blink></font><br>\n";
# set up a before-time..
my $time0 = [gettimeofday];
chdir($dir); # to the directory we go.
# "*.*" only matches entries whose name contains a dot
@dirs = glob("*.*"); # get list of *.* dirs
foreach $current_domain (@dirs) { # cycle through each directory
# a domain whose stats file cannot be opened is skipped
open(STATS,"<$current_domain/$file") # open stats file in this dir
|| # otherwise..
next; # close file and skip rest
while (<STATS>) { # start processing file..
$line = $_; # and reading lines..
if ($line =~ m/$target/) { last; } # until we found our target
# read $pos_pageviews further lines; the last one read stays in $line
$i = $pos_pageviews; # set up counter to skip
while($i) { $line = <STATS>; $i--; }; # certain amount of lines
close(STATS); # then close file we opened
# NOTE(review): $1 is used without checking that the match succeeded
$line =~ m#^<TD.+<B>(.+)</B>.+</TD>$#; # parse html via regex
$total_pageviews = $1; # retrieve the proper term
$sum += $total_pageviews; # add to sum these pageviews
# only domains with a positive figure count as processed
if ($total_pageviews > 0) { $processed_domains++; }
push(@pageviews, [ $current_domain, $total_pageviews ]); # and save into the array
# numeric sort on the pageviews column, reversed so the largest comes first
@pageviews = reverse sort { $a->[1] <=> $b->[1] } @pageviews; # now sort the array
# calculate processing time..
$elapsed_time = sprintf("%.4f",(tv_interval ($time0)));
# calculate the domains not touched..
$untouched_domains = $total_domains - $processed_domains;
# NOTE(review): $total_domains is never incremented in the lines visible
# here, so these divisions would divide by zero -- confirm against the
# complete source
$processed_domains_pcntg = sprintf("%.2f",(($processed_domains / $total_domains)*100));
$untouched_domains_pcntg = sprintf("%.2f",(($untouched_domains / $total_domains)*100));
# format this big integer nicely with commas
$sum = &commify($sum);
# initialize counter
# (reused below as the rank number of each listed domain)
$i = 1;
# set up page and such.. and print out the results..
# Prints the totals table, then one ranked row per domain, then the footer.
# NOTE(review): the END_INTRO / END_OUTRO terminators and several HTML lines
# are not present in this copy of the file -- confirm against the complete
# source
print <<END_INTRO;
<font size="+1"><b>done!</b></font>
<table width="45%" border="2" cellspacing="1" cellpadding="1">
<td align="center" colspan="2" bgcolor="#00CC00"><font color="#FFFFFF"><b>Totals</font></b></td>
<td align="left">Total Domains:</td>
<td align="center"><b>$total_domains</b></td>
<td align="left">Domains Shown:</td>
<td align="center"><b>$target_top_num_hash{$target_top_num}</b></td>
<td align="left">Domains Processed:</td>
<td align="center"><b>$processed_domains</b> ($processed_domains_pcntg %)</td>
<td align="left">Domains Unaccounted For:</td>
<td align="center"><b>$untouched_domains</b> ($untouched_domains_pcntg %)</td>
<td align="left">Total Pageviews:</td>
<td align="center"><b>$sum</b></td>
<td align="left">Processing Time:</b></td>
<td align="center"><b>&nbsp;$elapsed_time seconds&nbsp;</td>
<table width="45%" border="2" cellspacing="1" cellpadding="1">
<td align="center" bgcolor="#0000FF"><b><font color="#FFFFFF">&nbsp;Rank&nbsp;</font></b></td>
<td align="center" bgcolor="#9900FF"><b><font color="#FFFFFF">&nbsp;Domain&nbsp;</font></b></td>
<td align="center" bgcolor="#FF0000"><b><font color="#FFFFFF">&nbsp;Pageviews&nbsp;</font></b></td>
# cycle through each array in the array @pageviews and process/print out each
# (each entry is [ domain, pageviews ]; the list is already sorted, largest first)
foreach $listing (@pageviews) {
print "<tr>\n";
print " <td align=\"center\">$i.</td>\n";
# NOTE(review): the remainder of this link-printing statement is missing
# from this copy of the file
print " <td align=\"center\"><a href=\"http://$listing->[0]/stats/www${the_year}/days${the_month}".
sprintf("%.2u",($target_year % 100)).
print " <td align=\"center\"><b>".&commify($listing->[1])."</b></td>\n";
print "</tr>\n";
# stop once the requested number of rows has been printed
if (++$i > $target_top_num) { last; }
# print out the footers and finish..
print "</table>\n</p>\n";
# &print_choice_menu;
print <<END_OUTRO;
<table border="3" width="75%" cellpadding="0" cellspacing="0">
<td><table width="100%" cellpadding="0" cellspacing="0">
<td nowrap align="left"><a href="/domstatsum2.txt">Domain Statistics Summary</a></td>
<td nowrap align="center">Copyright &#169; 2000 by <a href="mailto:cynikal\ Statistics Summary">Alex Koralewski</a></td>
<td nowrap align="right">$now_string</td>
# print_choice_menu
# Prints the report-selection form: a POST form holding a table of popup
# menus (Mode, Year, Month and Amount, plus Day when $target_mode is true)
# followed by a submit button.
# Takes no arguments; reads the file-level $query, $target_mode and the
# %target_*_hash tables.
# The only call visible in this file is commented out.
# NOTE(review): several continuation lines (closing parentheses, some of the
# popup_menu calls, the closing braces) are not present in this copy of the
# file -- confirm against the complete source
sub print_choice_menu {
print $query->startform(-method=>'post',
if ($target_mode) {
# $target_mode true: the table includes the Day selector
print $query->table({-border=>undef,-width=>'35%',-cellspacing=>1,-cellpadding=>1},
td({-align=>"center"} ,[ 'Mode' ,
$query->popup_menu(-name=>'target_mode',-values=>[ keys(%target_mode_hash) ],
td({-align=>"center"} ,[ 'Year' ,
$query->popup_menu(-name=>'target_year',-values=>[ keys(%target_year_hash) ],
td({-align=>"center"} ,[ 'Month' ,
sort { $a <=> $b } (keys(%target_month_hash)) ],
td({-align=>"center"} ,[ 'Day' ,
sort { $a <=> $b } (keys(%target_day_hash)) ],
td({-align=>"center"} ,[ 'Amount' ,
sort { $a <=> $b } (keys(%target_top_num_hash)) ],
} else {
# $target_mode false: same table without the Day selector
print $query->table({-border=>undef,-width=>'35%',-cellspacing=>1,-cellpadding=>1},
td({-align=>"center"} ,[ 'Mode' ,
$query->popup_menu(-name=>'target_mode',-values=>[ keys(%target_mode_hash) ],
td({-align=>"center"} ,[ 'Year' ,
sort { $a <=> $b } (keys(%target_year_hash)) ],
td({-align=>"center"} ,[ 'Month' ,
sort { $a <=> $b } (keys(%target_month_hash)) ],
td({-align=>"center"} ,[ 'Amount' ,
sort { $a <=> $b } (keys(%target_top_num_hash)) ],
print $query->submit(-name=>'Re-request Report');
print $query->end_form;
# commify - insert thousands separators into a number
# (technique taken from perldoc perlfaq5).
#   argument: a number or numeric string, optionally signed
#   returns:  the same text with a comma before every group of three digits
#             in the leading run of digits; anything after that run (for
#             example a decimal part) is left untouched
sub commify {
    my ($text) = @_;
    # Each successful pass splits the last three digits off the leading run
    # of digits; keep going until fewer than four digits remain there.
    while ($text =~ s/^([-+]?\d+)(\d{3})/$1,$2/) {
        # all the work happens in the substitution itself
    }
    return $text;
}
