Created
February 27, 2014 05:20
-
-
Save greenlion/9244835 to your computer and use it in GitHub Desktop.
A sample data generator for web traffic time series log file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/*
 * A sample of traffic data from the real website, deduplicated and with REST URLs digested.
 * The generator reads $traffic['digest'] and $traffic['distribution'] from it.
 *
 * Failures are reported on STDERR with a non-zero exit status: STDOUT carries the
 * generated rows, so an error message must not end up inside the data file.
 */
if (!is_readable('seed.dat')) {
    fwrite(STDERR, "Could not read seed.dat\n");
    exit(1);
}
$traffic = unserialize(file_get_contents('seed.dat'));
if (!is_array($traffic) || !isset($traffic['digest'], $traffic['distribution'])) {
    fwrite(STDERR, "Could not unserialize seed.dat\n");
    exit(1);
}
/*
 * Draw one random integer in [$bottom, $top] and report whether it landed at or
 * below $greater_than. With the defaults this is true for 20 of the 100 possible draws.
 */
function chance($greater_than=20, $bottom=1, $top = 100) {
    return mt_rand($bottom, $top) <= $greater_than;
}
/* Build a random dotted-quad address; each of the four octets is drawn from 1..255. */
function gen_ip() {
    $octets = array();
    for ($n = 0; $n < 4; ++$n) {
        $octets[] = mt_rand(1, 255);
    }
    return implode('.', $octets);
}
/**
 * For a given page in the data sample, generate one row of traffic.
 *
 * The global $traffic['digest'] entry keyed by page + method + status code + protocol
 * supplies the template. Its first three elements are taken as method, status code and
 * protocol; the following elements are taken, in order, as bytes received, bytes sent,
 * apache service time, three CPU times and memory usage. Any remaining elements are
 * appended to the row unchanged in position.
 *
 * A speed factor is drawn per request: 2.50-10.00 for a stampede, 1.25-3.00 for slow
 * traffic, 0.80-1.25 otherwise. Keys containing "_t" or "cpu" are scaled by it; keys
 * containing "_c", "bytes" or "memory" are scaled and floored. apache_service_time_us
 * is scaled exactly once and floored so it stays a whole number of microseconds.
 *
 * @param string $page            URL path; used untrimmed for the digest lookup, trimmed for output
 * @param array  $info            must contain 'method', 'status_code' and 'protocol'
 * @param mixed  $traffic_is_slow truthy when the request falls in a slow period
 * @param mixed  $is_stampede     truthy when the request is part of a cache stampede (takes precedence)
 * @param int    $year            timestamp parts; month/day/hour/min/sec are zero-padded to two digits
 * @param int    $month
 * @param int    $day
 * @param int    $hour
 * @param int    $min
 * @param int    $sec
 * @param string $remote_ip       client address written to the row
 * @param mixed  $skip            truthy to consume the random draw but produce no row
 * @return string|false a pipe separated string of data to be appended to a flat file for
 *                      loading with LOAD DATA INFILE, or false when $skip is truthy
 */
function gen_page_hit($page, $info, $traffic_is_slow, $is_stampede, $year, $month, $day, $hour, $min, $sec,$remote_ip, $skip) {
    global $traffic;

    // Get the template data for this request
    $data = $traffic['digest'][$page . $info['method'] . $info['status_code'] . $info['protocol']];
    $page = trim($page);

    /* these are not affected by slowdown and need to be popped off for output */
    $method      = array_shift($data);
    $status_code = array_shift($data);
    $protocol    = array_shift($data);

    /* how fast is the request compared to baseline from $traffic */
    if ($is_stampede) {
        $speeddiff = mt_rand(250, 1000) / 100; // stampedes always slow things down extra
    } elseif ($traffic_is_slow) {
        $speeddiff = mt_rand(125, 300) / 100;
    } else {
        /* random variation */
        $speeddiff = mt_rand(80, 125) / 100;
    }

    /* The draw above is made even for skipped rows so that the random sequence,
       and therefore every later row, does not depend on where output starts. */
    if ($skip) return false;

    foreach ($data as $k => $v) {
        $key = (string)$k;
        if ($key === 'status_code') continue;

        if ($key === 'apache_service_time_us') {
            /* microseconds: scale once and keep it integral */
            $data[$k] = floor($v * $speeddiff);
        } elseif (strpos($key, '_t') !== false || strpos($key, 'cpu') !== false) {
            /* times go up in slow periods and may be fractional */
            $data[$k] = $v * $speeddiff;
        } elseif (strpos($key, '_c') !== false || strpos($key, 'bytes') !== false || strpos($key, 'memory') !== false) {
            /* counters and sizes go up too but are not fractional */
            $data[$k] = floor($v * $speeddiff);
        }
    }

    /* these were affected by randomization but need to be popped off */
    $bytes_recv = array_shift($data);
    $bytes_sent = array_shift($data);
    $apache     = array_shift($data);

    /* NOTE(review): the original comments label these three values system, user, total,
       yet they are emitted under total, user, system. Only the position matters in the
       joined row; confirm the order against seed.dat and the table definition. */
    $cpu1 = array_shift($data);
    $cpu2 = array_shift($data);
    $cpu3 = array_shift($data);
    $mem  = array_shift($data);

    /* output: the order of this array is the column order of the row */
    $request = array(
        'request_order'          => 0,
        'request_time'           => sprintf('"%s-%02d-%02d %02d:%02d:%02d"', $year, $month, $day, $hour, $min, $sec),
        'remote_ip'              => '"' . $remote_ip . '"',
        'bytes_received'         => $bytes_recv,
        'bytes_sent'             => $bytes_sent,
        'apache_service_time_us' => $apache,
        'filename'               => '"app.php"',
        'url_path'               => '"' . $page . '"',
        'protocol'               => '"' . $protocol . '"',
        'method'                 => '"' . $method . '"',
        'query_string'           => '""',
        'status_code'            => $status_code,
        'server_name'            => '"test.server.com"',
        'cpu_time_total'         => $cpu1,
        'cpu_time_user'          => $cpu2,
        'cpu_time_system'        => $cpu3,
        'memory_usage'           => $mem,
        /* NOTE(review): derived from the wall clock, so this column differs between runs
           even though the rest of the data set is seeded deterministically. */
        'request_id'             => '"' . md5((string)microtime(true)) . '"',
    );

    /* push the rest of the metrics onto the request */
    foreach ($data as $k => $v) {
        $request[$k] = $v;
    }

    return join("|", $request);
}
# The first argument is a config file with settings to use; it is a PHP file that
# defines the constants referenced below. Without an argument 2G.settings is loaded.
if (count($argv) > 1) {
    require_once($argv[1]);
} else {
    require_once('2G.settings');
}

# Deterministically seed the generator so that it always generates the same data set
# for the same parameters, but a different data set for different parameters.
define('DATE_SEED', START_YEAR + START_MONTH + END_MONTH + END_DAY);
define('CUST_SEED', CUST_COUNT + MULT_MIN + MULT_MAX);
define('SLOW_SEED', SLOW_5MIN_PCT + HOUR_ALWAYS_SLOW + CACHE_STAMPEDES + SLOW_HOUR_MAX_ADD_US);
mt_srand(DATE_SEED + CUST_SEED + SLOW_SEED);

/* RANDOM GENERATION STARTS HERE */

// for simplicity each customer comes from a single IP
$ips = array();
for ($i = 1; $i <= CUST_COUNT; ++$i) {
    $ips[$i] = gen_ip();
}

/* like mysql limit, generate all values, but only output those past here */
define('SKIP_TO_YEAR', 0);
define('SKIP_TO_MONTH', 0);
define('SKIP_TO_DAY', 0);
define('SKIP_TO_HOUR', 0);

$skip = true;
$cur_year = START_YEAR;

/* actual data generation loop: month -> day -> hour -> customer -> quarter-hour slot -> page */
for ($cur_month = START_MONTH; $cur_month <= END_MONTH; ++$cur_month) {
    for ($cur_day = 1; $cur_day <= END_DAY; ++$cur_day) {
        /* skip days that do not exist in this month (also handles Feb 29 in leap years) */
        if (!checkdate($cur_month, $cur_day, $cur_year)) continue;

        for ($cur_hour = 0; $cur_hour < 24; ++$cur_hour) {
            /* once the skip-to point is reached, output stays on for the rest of the run */
            if ($skip && $cur_year >= SKIP_TO_YEAR && $cur_month >= SKIP_TO_MONTH && $cur_day >= SKIP_TO_DAY && $cur_hour >= SKIP_TO_HOUR) {
                $skip = false;
            }

            $hour_is_slow = (HOUR_ALWAYS_SLOW === $cur_hour);

            for ($cust = 1; $cust <= CUST_COUNT; ++$cust) {
                $traffic_multiplier = mt_rand(MULT_MIN, MULT_MAX);

                for ($cur_min = 0; $cur_min <= 59; $cur_min += 15) {
                    /* always draw, so the random sequence does not depend on the hour,
                       then let the always-slow hour force the flag on */
                    $slot_is_slow    = chance(SLOW_5MIN_PCT);
                    $traffic_is_slow = $hour_is_slow || $slot_is_slow;

                    /* stampedes are short lived events: only the slot at the top of the hour */
                    $is_stampede = ($cur_min === 0 && CACHE_STAMPEDES === TRUE);

                    foreach ($traffic['distribution'] as $page => $info) {
                        for ($z = 0; $z < $traffic_multiplier; ++$z) {
                            /* .5% chance that any random request will be slow; not drawn
                               when the whole slot is already slow */
                            $this_traffic_is_slow = $traffic_is_slow ? false : chance(5, 1, 1000);

                            $row = gen_page_hit($page, $info, $traffic_is_slow || $this_traffic_is_slow, $is_stampede, $cur_year, $cur_month, $cur_day, $cur_hour, $cur_min + mt_rand(0,4), mt_rand(0,59), $ips[$cust], $skip);
                            if ($row !== false) echo $row . "\n";
                        }
                    }
                }
            }
        }
    }
}
#Contents of 2G.settings (it is a PHP file): | |
<?php | |
/* Total number of customers viewing the site. */
const CUST_COUNT = 300;

/* Each customer gets between MULT_MIN and MULT_MAX times the sample traffic in any one hour interval. */
const MULT_MIN = 1;
const MULT_MAX = 5;

/* Percent chance that any given period will be slow. */
const SLOW_5MIN_PCT = 2;

/* Hour of the day that is always slow (simulates a backup). */
const HOUR_ALWAYS_SLOW = 0;

/* When true, put a spike of response time and database queries at the start of each hour. */
const CACHE_STAMPEDES = true;

/* At most 1.543210 seconds (in microseconds) added to each request. */
const SLOW_HOUR_MAX_ADD_US = 1543210;

/* Date range to generate: months START_MONTH..END_MONTH of START_YEAR,
   days 1..END_DAY of each month. The values below cover January 1st only. */
const START_YEAR = 2013;
const START_MONTH = 1;
const END_MONTH = 1;
const END_DAY = 1;
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment