Skip to content

Instantly share code, notes, and snippets.

@markcode
Created February 9, 2012 07:19
Show Gist options
  • Save markcode/1778074 to your computer and use it in GitHub Desktop.
Save markcode/1778074 to your computer and use it in GitHub Desktop.
Jot - Perl pixel server stored to Scribe
package Analumic::Jotr;
# *
# * Analumic - Analytics Statistics Logger "Jot".
# * Perl (mod_perl module) pixel server stored to Scribe and enhanced by GeoIP.
# * See web server config info (web_server_config.conf).
# * http://analumic.com
# *
use strict;
#these modules are now loaded in apache config
# use Apache2::RequestRec ();
# use Apache2::RequestIO ();
use Apache2::Const -compile => qw(OK);
# use Apache2::RequestUtil; #YAAH!!! this passes all the cgi environment headers
# use APR::Table;
use POSIX qw(strftime);
use URI::Escape;
use Digest::MD5 qw(md5_hex);
use Data::Validate::URI qw(is_uri);
#these modules are now loaded in apache config
# use Scribe::Thrift::scribe;
# use Thrift::Socket;
# use Thrift::FramedTransport;
# use Thrift::BinaryProtocol;
# TO DO: what about tainted inputs like dangerous ua or referals etc, need a way to clean them!???
#use Data::Dumper;
# --------------------------------------------------- SUBS
my $r; $r = '';
sub handler {
$r = shift;
#$r->content_type('text/plain');
#my $rh = Apache2::RequestUtil->request();
#$rh->content_type('text/plain');
#print Dumper(\%ENV);
run_jotr();
}
sub display_pic {
$r->content_type('image/gif');
$r->headers_out->set("Expires" => "Mon, 26 Jul 2005 05:00:00 GMT");
$r->headers_out->set("Cache-Control" => "no-store, no-cache, must-revalidate");
$r->headers_out->set("Pragma" => "no-cache");
if ( $ENV{'DNT'} eq '1' ) {
$r->headers_out->set("DNT" => "1");
}
$r->headers_out->set("Content-Length" => "43");
printf "GIF89a\1\0\1\0%c\0\0%c%c%c\0\0\0%s,\0\0\0\0\1\0\1\0\0%c%c%c\1\0;",
144,255,0,0,1?pack("C8",33,249,4,5,16,0,0,0):"",2,2,4;
+0;
return Apache2::Const::OK;
}
sub trim {
my $string;
$string = $_[0];
$string =~ s/^\s+//;
$string =~ s/\s+$//;
return $string;
}
sub quote_escape {
my $string;
$string = $_[0];
$string =~ s/"/\\"/g; #replace " with \" cause " is the log format delimiator
return $string;
}
sub truncate_ip {
my $ip;
$ip = $_[0];
if ( index($ip, '.') != -1 ) {
#ipv4 or mapped to ipv6
return substr($ip, 0, rindex($ip, '.')) . '.0';
} else {
#ipv6
return substr($ip, 0, rindex($ip, ':')) . ':0';
}
}
sub request_IP {
if ( $ENV{'GEOIP_ADDR'} ne '' ) { return $ENV{'GEOIP_ADDR'}; }
if ( $ENV{'HTTP_X_FORWARDED_FOR'} ne '' ) { return $ENV{'HTTP_X_FORWARDED_FOR'}; }
if ( $ENV{'REMOTE_ADDR'} ne '' ) { return $ENV{'REMOTE_ADDR'}; }
if ( $ENV{'HTTP_CLIENT_IP'} ne '' ) { return $ENV{'HTTP_CLIENT_IP'}; }
display_pic(); #else did fail
return Apache2::Const::OK;
}
sub clean_url {
my $url;
my $url = $_[0];
my $https = 0;
if ( index($url, '/', 9) == -1 ) { $url = $url . '/'; } #make sure domain has trailing /
if (substr($url, 0, 8) eq 'https://') {
$https = 1;
}
#make sure domains http:// is not listed within the string so mysql can read first domain in string.
$url =~ s/http\:\/\///gi;
$url =~ s/https\:\/\///gi;
$url = substr($url, 0, 255); #max 255 chars
if ( $https == 1 ) {
return 'https://'.$url;
} else {
return 'http://'.$url;
}
}
sub check_url {
my $url;
my $url = $_[0];
if(is_uri($url)){
return 1;
} else {
return 0;
}
}
sub md5_to_int {
my $md5;
$md5 = $_[0];
my $md5_int = '';
my $i = 0;
my $md5_val; $md5_val = '';
my @characters = ("a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","!");
for ($i=0; $i<=32; $i++) {
$md5_val = substr($md5, $i, 1);
if ( length($md5_int) == 18 ) {
#cant have 0 as first cause mysql will trim length to 17 chars.
if ( substr($md5_int, 0, 1) eq '0' ) {
$md5_int = '9'.substr($md5_int, 1, length($md5_int)); #replace 0 with 9
}
return $md5_int;
}
if ($md5_val =~ /^\d/) { # is a number
$md5_int .= $md5_val;
next;
}
my ($k, $r, $i);
for ($k = 0; $k < @characters; $k++) {
if ( $md5_val eq $characters[$k] ) {
$r = $k + 1;
$md5_int .= $r;
last;
}
}
}
if ( length($md5_int) < 18 ) {
my $len; $len = 0;
$len = 18 - length($md5_int);
my $roundn = ''; my $n;
for ($n = 0; $n < $len; $n++) {
$roundn .= '0';
}
$md5_int = $md5_int . $roundn;
}
@characters = ();
return substr($md5_int,0,9); #32 bit
#return substr($md5_int,0,18); #64 bit
}
sub error_log {
my $bulog; my $aid;
$aid = $_[0];
$bulog = $_[1];
use Apache2::Log ();
Apache2::ServerRec::warn("[jot] [${aid}] ${bulog}");
#writes to /var/log/apache2/error.log format (search for "[jot]") EG:
#[Mon Apr 04 08:31:20 2011] [warn] [jot] 2011-04-04 07:31:20 229254151336211252 JP 693a36b5b8edb232cb38edd5fa2ef844 "219.118.26.0" "Koshigaya" "Saitama" "AS" "35.883301 139.783295" "en" "http://blog.barcodestore.nl/" "http://www.moretechtips.net/2009/09/realtime-related-tweets-bar-another.html" "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7"
display_pic();
return Apache2::Const::OK;
}
sub run_jotr() {
# Do Not Track
if ( $ENV{'DNT'} eq '1' ) {
display_pic(); return Apache2::Const::OK;
}
#referal
my $referal_url = ''; #reset
$referal_url = $ENV{'HTTP_REFERER'};
if ( $referal_url eq '' ) { display_pic(); return Apache2::Const::OK; }
if ( substr($referal_url, 0, 4) ne 'http' ) { display_pic(); return Apache2::Const::OK; }
#$referal_url = uri_unescape($referal_url);
$referal_url = clean_url($referal_url);
#$referal_url = quote_escape($referal_url);
#print "referal_url: ${referal_url}\n";
#referrer (js generated)
my $referrer_url = ''; #reset
$referrer_url = $ENV{'QUERY_STRING'};
my $pos;
$pos = '';
$pos = index($referrer_url, 'ref=');
if ( $pos > -1 ) {
$referrer_url = substr($referrer_url, $pos + 4);
$referrer_url = uri_unescape($referrer_url);
#was stripping out search term so skip trim url
#my $pos2;
#$pos2 = '';
#$pos2 = index($referrer_url, '&');
#if ( $pos2 > -1 ) {
# $referrer_url = substr($referrer_url, 0, $pos2);
#}
$referrer_url = clean_url($referrer_url);
if ( check_url($referrer_url) == 0 ) { display_pic(); return Apache2::Const::OK; }
$referrer_url = substr($referrer_url, 0, 255); #max 255 chars
#$referrer_url = quote_escape($referrer_url);
#print "referrer_url: ${referrer_url}\n";
}
#valide urls are good format
if ( check_url($referal_url) == 0 ) { display_pic(); return Apache2::Const::OK; }
$referal_url = substr($referal_url, 0, 255); #max 255 chars
#user agent
my $referal_ua = trim($ENV{'HTTP_USER_AGENT'});
if ( $referal_ua eq '' ) { display_pic(); return Apache2::Const::OK; }
$referal_ua = quote_escape($referal_ua);
$referal_ua = substr($referal_ua, 0, 512); #max 512 chars
#browser language if set (code, will convert to proper name in post-processing).
my $referal_lng = trim($ENV{'HTTP_ACCEPT_LANGUAGE'});
if ( $referal_lng ne '' ) {
$pos = 0;
$pos = index($referal_lng, ',');
if ($pos > -1) {
$referal_lng = substr($referal_lng, 0, $pos);
}
$referal_lng = lc $referal_lng;
$referal_lng = substr($referal_lng, 0, 7);
$referal_lng = quote_escape($referal_lng);
}
#print "referal_lng: ${referal_lng}\n";
#
my $cat = 'foo'; # scribe category
my $ip = request_IP();
#print "ip: ${ip}\n";
my $referal_3166_1 = $ENV{'GEOIP_COUNTRY_CODE'};
if ( $referal_3166_1 eq '' ) { display_pic(); return Apache2::Const::OK; }
if ( $referal_3166_1 eq 'A1' ) { display_pic(); return Apache2::Const::OK; } #"Anonymous Proxy"
if ( $referal_3166_1 eq 'A2' ) { display_pic(); return Apache2::Const::OK; } #"Satellite Provider"
if ( $referal_3166_1 eq 'O1' ) { display_pic(); return Apache2::Const::OK; } #"Other Country"
if ( $referal_3166_1 eq 'EU' ) { display_pic(); return Apache2::Const::OK; } #"Europe" country yet to be given an ISO 3166
if ( $referal_3166_1 eq 'AP' ) { display_pic(); return Apache2::Const::OK; } #"Asia/Pacific Region" country yet to be given an ISO 3166
my $referal_city = $ENV{'GEOIP_CITY'};
my $referal_region = $ENV{'GEOIP_REGION_NAME'};
my $referal_contnt = $ENV{'GEOIP_CONTINENT_CODE'};
my $referal_coords = $ENV{'GEOIP_LATITUDE'} . ' ' . $ENV{'GEOIP_LONGITUDE'};
my $referal_ip_md5 = md5_hex($ip);
my $ip_tunc = truncate_ip($ip);
#print "ip_tunc: ${ip_tunc}\n";
#2011-03-25 01:02:53
my $gmt_dt = strftime "%Y-%m-%d %H:%M:%S", gmtime;
#print "gmt_dt: ${gmt_dt}\n";
my $gmt_toH_dt = strftime "%Y-%m-%d %H", gmtime; #0000-00-00 00 (hourly for poll)
my $md5 = md5_hex($gmt_toH_dt.$referal_url.$ip.$referal_ua); #added ua cause networks could share same ip
my $uid = md5_to_int($md5);
#print "uid: ${uid}\n";
if ( $referal_lng eq '' ) { $referal_lng = '-'; }
if ( $referal_city eq '' ) { $referal_city = '-'; }
if ( $referal_region eq '' ) { $referal_region = '-'; }
if ( $referal_contnt eq '' ) { $referal_contnt = '-'; }
if ( $referal_coords eq '' ) { $referal_coords = '-'; }
if ( $referal_coords eq ' ' ) { $referal_coords = '-'; }
if ( $referrer_url eq '' ) { $referrer_url = '-'; }
# --------------------------------------------------- LOG
my $log;
eval{
$log = '';
$log = "$gmt_dt $uid $referal_3166_1 $referal_ip_md5 \"$ip_tunc\" \"$referal_city\" \"$referal_region\" \"$referal_contnt\" \"$referal_coords\" \"$referal_lng\" \"$referal_url\" \"$referrer_url\" \"$referal_ua\"";
#print "log: ${log}\n";
my $host = 'localhost';
my $port = 1463;
my $socket = Thrift::Socket->new($host, $port);
my $transport = Thrift::FramedTransport->new($socket);
my $proto = Thrift::BinaryProtocol->new($transport);
my $client = Scribe::Thrift::scribeClient->new($proto, $proto);
$transport->open();
#log message to cat
my $le = Scribe::Thrift::LogEntry->new({ category => $cat });
$le->message($log);
$result = -1;
$result = $client->Log([ $le ]);
#print Dumper($result);
if ( $result != 0 ) {
error_log($cat, $log); #err log to error log
}
#these return generate perl error
#if ($result == Scribe::Thrift::ResultCode::TRY_LATER) {
# print STDERR "TRY_LATER\n";
#} elsif ($result != Scribe::Thrift::ResultCode::OK) {
# print STDERR "Unknown result code: $result\n";
#}
#close scribe
$transport->close();
} or do {
### catch block
#print "FAILED to connect";
error_log($cat, $log);
};
display_pic();
return Apache2::Const::OK;
}
1;
# web server config in eg: /etc/apache2/sites-available/default
<IfModule mod_perl.c>
<Files ~ "\.gif$">
SetHandler perl-script
PerlHandler Analumic::Jotr
PerlSendHeader On
</Files>
PerlSwitches -wT
PerlModule Scribe::Thrift::scribe
PerlModule Thrift::Socket
PerlModule Thrift::FramedTransport
PerlModule Thrift::BinaryProtocol
PerlModule Apache2::RequestRec
PerlModule Apache2::RequestIO
PerlModule Apache2::RequestUtil
PerlModule APR::Table
</IfModule>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment