Skip to content

Instantly share code, notes, and snippets.

@greenlion
Created February 27, 2014 05:20
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save greenlion/9244835 to your computer and use it in GitHub Desktop.
Save greenlion/9244835 to your computer and use it in GitHub Desktop.
A sample data generator for a web-traffic time-series log file
<?php
/* Load the traffic template: a de-duplicated sample of real site traffic with REST URLs digested.
   The script cannot do anything useful without it, so abort when it is missing, unreadable or empty. */
$traffic = unserialize(file_get_contents('seed.dat'));
if (!$traffic) {
    die('Could not unserialize seed.dat');
}
/* Roll once with mt_rand($bottom, $top) and report whether the roll landed at or below $greater_than.
   With the defaults (20 out of 1..100) this is true roughly 20% of the time. */
function chance($greater_than=20, $bottom=1, $top = 100) {
    return mt_rand($bottom, $top) <= $greater_than;
}
/* Build a random dotted-quad address; each of the four octets is drawn from 1..255 with mt_rand. */
function gen_ip() {
    $octets = array();
    for ($n = 0; $n < 4; ++$n) {
        $octets[] = mt_rand(1, 255);
    }
    return implode('.', $octets);
}
/**
 * Generate one synthetic request row for a page in the data sample.
 *
 * The template row is read from the global $traffic['digest'], keyed by the
 * concatenation of page, method, status code and protocol. Its leading entries
 * are consumed positionally with array_shift(): method, status code, protocol,
 * bytes received, bytes sent, apache service time, three cpu times and memory
 * usage; whatever remains is appended to the output unchanged in order.
 *
 * Exactly one mt_rand() draw is made per call, before the $skip check, so
 * skipped rows still advance the random sequence.
 *
 * @param string $page            URL path (looked up untrimmed, emitted trimmed)
 * @param array  $info            must hold 'method', 'status_code' and 'protocol'
 * @param bool   $traffic_is_slow scale timings by 1.25x - 3.00x
 * @param bool   $is_stampede     scale timings by 2.50x - 10.00x (wins over slow)
 * @param int    $year            request timestamp: year (emitted as given)
 * @param int    $month           request timestamp: month (zero padded to 2 digits)
 * @param int    $day             request timestamp: day (zero padded)
 * @param int    $hour            request timestamp: hour (zero padded)
 * @param int    $min             request timestamp: minute (zero padded)
 * @param int    $sec             request timestamp: second (zero padded)
 * @param string $remote_ip       client address to emit
 * @param bool   $skip            when true, do the random draw but emit nothing
 * @return string|false a pipe separated string of data to be appended to a flat
 *                      file for loading with LOAD DATA INFILE, or false when $skip is set
 */
function gen_page_hit($page, $info, $traffic_is_slow, $is_stampede, $year, $month, $day, $hour, $min, $sec,$remote_ip, $skip) {
    global $traffic;
    /* Per-process sequence number. microtime(true) alone loses resolution when
       cast to a string, so back-to-back calls used to share a request id. */
    static $request_seq = 0;

    // Get the template data for this request
    $data = $traffic['digest'][$page . $info['method'] . $info['status_code'] . $info['protocol']];
    $page = trim($page);

    /* these are not affected by slowdown and need to be popped off for output */
    $method = array_shift($data);
    $status_code = array_shift($data);
    $protocol = array_shift($data);

    /* how fast is the request compared to baseline from $traffic */
    if ($is_stampede) {
        $speeddiff = mt_rand(250, 1000) / 100; // stampedes always slow things down extra
    } elseif ($traffic_is_slow) {
        $speeddiff = mt_rand(125, 300) / 100;
    } else {
        /* random variation */
        $speeddiff = mt_rand(80, 125) / 100;
    }

    /* whole microseconds; scaled exactly once here and excluded from the loop below */
    $data['apache_service_time_us'] = floor($data['apache_service_time_us'] * $speeddiff);

    foreach ($data as $k => $v) {
        /* 'apache_service_time_us' contains '_t' and would otherwise be scaled a second time */
        if ($k == 'status_code' || $k === 'apache_service_time_us') continue;

        /* times and some other counters go up in slow periods but some are not fractional */
        if (strpos($k, '_t') !== false || strpos($k, 'cpu') !== false) {
            $data[$k] = $v * $speeddiff;
        } elseif (strpos($k, '_c') !== false || strpos($k, 'bytes') !== false || strpos($k, 'memory') !== false) {
            $data[$k] = floor($v * $speeddiff);
        }
    }

    /* the random draw above has happened; rows before the skip point are simply not emitted */
    if ($skip) return false;

    /* these were affected by randomization but need to be popped off */
    $bytes_recv = array_shift($data);
    $bytes_sent = array_shift($data);
    $apache = array_shift($data);

    /* NOTE(review): the original comments label these three system, user, total,
       while the request keys below read total, user, system. Only the position
       matters for the emitted row (keys are never written), so the order is kept
       as-is - confirm against the column order of the table being loaded. */
    $cpu1 = array_shift($data);
    $cpu2 = array_shift($data);
    $cpu3 = array_shift($data);
    $mem = array_shift($data);

    /* output: values are joined in insertion order, so the order below is the column order */
    $request = array();
    $request['request_order'] = 0;
    /* zero-pad every date portion except the year */
    $request['request_time'] = sprintf('"%s-%02d-%02d %02d:%02d:%02d"', $year, $month, $day, $hour, $min, $sec);
    $request['remote_ip'] = '"' . $remote_ip . '"';
    $request['bytes_received'] = $bytes_recv;
    $request['bytes_sent'] = $bytes_sent;
    $request['apache_service_time_us'] = $apache;
    $request['filename'] = '"app.php"';
    $request['url_path'] = '"' . $page . '"';
    $request['protocol'] = '"' . $protocol . '"';
    $request['method'] = '"' . $method . '"';
    $request['query_string'] = '""';
    $request['status_code'] = $status_code;
    $request['server_name'] = '"test.server.com"';
    $request['cpu_time_total'] = $cpu1;
    $request['cpu_time_user'] = $cpu2;
    $request['cpu_time_system'] = $cpu3;
    $request['memory_usage'] = $mem;
    /* still differs from run to run (wall clock), but is now unique within a run */
    $request['request_id'] = '"' . md5(microtime(true) . '-' . (++$request_seq)) . '"';

    /* push the rest of the metrics onto the request */
    foreach ($data as $k => $v) {
        $request[$k] = $v;
    }

    return implode("|", $request);
}
# Settings come from the file named by the first command line argument,
# falling back to 2G.settings in the current directory.
require_once(count($argv) > 1 ? $argv[1] : '2G.settings');

# Derive the RNG seed from the settings themselves: the same parameters always
# reproduce the same random sequence, while different parameters (usually) give
# a different one.
define('DATE_SEED', START_YEAR + START_MONTH + END_MONTH+ END_DAY);
define('CUST_SEED', CUST_COUNT + MULT_MIN + MULT_MAX );
define('SLOW_SEED', SLOW_5MIN_PCT + HOUR_ALWAYS_SLOW + CACHE_STAMPEDES + SLOW_HOUR_MAX_ADD_US);
mt_srand(DATE_SEED + CUST_SEED + SLOW_SEED);
/* RANDOM GENERATION STARTS HERE */
/* Every customer is pinned to one randomly generated IP, indexed 1..CUST_COUNT. */
$ips = array();
$i = 1;
while ($i <= CUST_COUNT) {
    $ips[$i] = gen_ip();
    ++$i;
}

/* Works like an offset in a MySQL LIMIT: every value is still generated (so the
   random sequence is unchanged), but rows are only written once the loop has
   reached this point in time. All zeroes means "output everything". */
define('SKIP_TO_YEAR', 0);
define('SKIP_TO_MONTH', 0);
define('SKIP_TO_DAY', 0);
define('SKIP_TO_HOUR', 0);

/* start out skipping; the generation loop clears this when the skip point is reached */
$skip = true;
$cur_year = START_YEAR;

/* Give the per-request flags a defined value before first use; they were
   previously read while undefined (e.g. when CACHE_STAMPEDES is not TRUE). */
$is_stampede = false;
$this_traffic_is_slow = false;

/* Actual data generation loop.
   Nesting: month -> day -> hour -> customer -> 15 minute bucket -> page -> repeat.
   Every generated row is echoed on its own line; rows before the SKIP_TO_* point
   are generated (to keep the random sequence stable) but not written. */
for ($cur_month = START_MONTH; $cur_month <= END_MONTH; ++$cur_month) {
    for ($cur_day = 1; $cur_day <= END_DAY; ++$cur_day) {
        /* skip days that do not exist in this month; checkdate() also knows about leap years */
        if (!checkdate($cur_month, $cur_day, $cur_year)) continue;

        for ($cur_hour = 0; $cur_hour < 24; ++$cur_hour) {
            /* sticky: once the skip point has been reached output stays on */
            if ($cur_year >= SKIP_TO_YEAR && $cur_month >= SKIP_TO_MONTH && $cur_day >= SKIP_TO_DAY && $cur_hour >= SKIP_TO_HOUR) $skip = false;

            /* the configured "backup" hour is slow for all of its buckets */
            $hour_is_slow = (HOUR_ALWAYS_SLOW === $cur_hour);

            for ($cust = 1; $cust <= CUST_COUNT; ++$cust) {
                /* how many times this customer repeats the sample traffic during this hour */
                $traffic_multiplier = mt_rand(MULT_MIN, MULT_MAX);

                for ($cur_min = 0; $cur_min <= 59; $cur_min += 15) {
                    /* always roll, so the random sequence does not depend on the hour flag;
                       the hour flag is no longer overwritten by the roll */
                    $bucket_rolled_slow = chance(SLOW_5MIN_PCT);
                    $traffic_is_slow = $hour_is_slow || $bucket_rolled_slow;

                    /* stampedes are short lived events: only the bucket at the top of the
                       hour qualifies. Recomputing the flag per bucket keeps it from leaking
                       into later buckets or into the next customer. */
                    $is_stampede = (CACHE_STAMPEDES === TRUE && $cur_min === 0);

                    foreach ($traffic['distribution'] as $page => $info) {
                        for ($z = 0; $z < $traffic_multiplier; ++$z) {
                            /* .5% chance that any random request will be slow (only rolled when the bucket is not already slow) */
                            $this_traffic_is_slow = $traffic_is_slow ? false : chance(5, 1, 1000);

                            /* the request lands at a random minute within the first five of the bucket, at a random second */
                            $row = gen_page_hit($page, $info, $traffic_is_slow || $this_traffic_is_slow, $is_stampede, $cur_year, $cur_month, $cur_day, $cur_hour, $cur_min + mt_rand(0,4), mt_rand(0,59), $ips[$cust], $skip);
                            if ($row !== false) echo $row . "\n";
                        }
                    }
                    $this_traffic_is_slow = false;
                }
            }
        }
    }
}
#Contents of 2G.settings (it is a PHP file):
<?php
/* Settings for the traffic generator. This is a plain PHP file of define() calls that the
   generator loads with require_once; the numeric values are also summed into the RNG seed,
   so changing any of them changes the generated data set. */
/* This defines how many total customers are viewing the site.*/
define('CUST_COUNT', 300);
/* Per-customer traffic multiplier range. The generator draws mt_rand(MULT_MIN, MULT_MAX) once
   per customer per hour and repeats every page of the sample traffic (loaded from seed.dat)
   that many times in each time bucket of that hour. */
define('MULT_MIN', 1);
define('MULT_MAX', 5);
define('SLOW_5MIN_PCT', 2); /* percent chance (out of 100) that a time bucket will be slow; the generator rolls it once per 15 minute step */
define('HOUR_ALWAYS_SLOW', 0); /* simulate a backup: hour of day, compared strictly (===) against the current hour 0-23, so it must be an integer */
define('CACHE_STAMPEDES', TRUE); /* will put a spike of response time and database queries at the start of each hour*/
define('SLOW_HOUR_MAX_ADD_US', 1543210); /* 1.543210 seconds max added to each request. NOTE(review): in the generator code visible here this value only feeds the RNG seed - confirm whether it is meant to be applied */
/* Date range to generate: months START_MONTH..END_MONTH of START_YEAR, days 1..END_DAY of each
   of those months. As shipped this covers a single day, 2013-01-01. */
define('START_YEAR', 2013);
define('START_MONTH', 1);
define('END_MONTH', 1);
define('END_DAY', 1);
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment