Skip to content

Instantly share code, notes, and snippets.

@polyfractal
Last active December 16, 2015 06:09
Show Gist options
  • Star 5 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save polyfractal/5389101 to your computer and use it in GitHub Desktop.
Save polyfractal/5389101 to your computer and use it in GitHub Desktop.
<?php
require 'vendor/autoload.php';
use Sherlock\Sherlock;
/**
 * Dump a value in print_r() format and terminate the output with CRLF.
 *
 * @param mixed $value value to display
 */
function pprint($value) {
    echo print_r($value, true), "\r\n";
}
//Bulk-load the raw sensor readings from data/Dodgers.data into the "dodgers" index
$sherlock = new Sherlock();
$sherlock->addNode("localhost");
$indexName = 'dodgers';
$typeName = 'data';

//Stop with a clear message instead of creating an index and sending an empty batch
$rawData = file_get_contents('data/Dodgers.data');
if ($rawData === false) {
    pprint("Unable to read data/Dodgers.data");
    exit(1);
}
$events = explode("\n", $rawData);

//Each document has a date ("timestamp") and a float ("value")
$mappings = array();
$mappings[] = Sherlock::mappingBuilder($typeName)->Date()->field('timestamp');
$mappings[] = Sherlock::mappingBuilder($typeName)->Number()->type('float')->field('value');
$index = $sherlock->index($indexName)->mappings($mappings)->create();

//All documents are queued on one batch request and sent with a single execute()
$docs = $sherlock->document()->index($indexName)->type($typeName);
foreach ($events as $event) {
    $data = explode(",", $event);
    //skip blank lines and lines that lack the "timestamp,value" pair
    if (count($data) < 2 || trim($data[0]) == '') {
        continue;
    }
    $timestamp = strtotime(trim($data[0]));
    //an unparseable timestamp would otherwise be indexed as boolean false
    if ($timestamp === false) {
        continue;
    }
    $docData = array();
    //NOTE(review): this is epoch seconds; confirm the date mapping interprets it as intended
    $docData['timestamp'] = $timestamp;
    $docData['value'] = trim($data[1]);
    $docs->document($docData);
}
$response = $docs->execute();
<?php
require 'vendor/autoload.php';
use Sherlock\Sherlock;
error_reporting(E_ERROR | E_WARNING | E_PARSE);
/**
 * Print a value the way print_r() renders it, then emit a CRLF.
 *
 * @param mixed $value value to display
 */
function pprint($value) {
    print(print_r($value, true)."\r\n");
}
//Snagged from here:
//http://cad.cx/blog/2008/06/30/single-pass-standard-deviation-in-php/
//modified slightly to return mean + variance
/**
 * Single-pass mean, variance and standard deviation of a list of numbers.
 *
 * The variance is the population variance (sum of squared deviations divided
 * by the number of values), not the sample variance.
 *
 * @param array $array numeric values; may be empty
 * @return array keys "mean", "stddev" and "variance"; all zero for an empty input
 */
function stddev($array){
    //Don Knuth is the $deity of algorithms
    //http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#III._On-line_algorithm
    $n = 0;
    $mean = 0;
    $M2 = 0;
    foreach ($array as $x) {
        $n++;
        $delta = $x - $mean;
        $mean = $mean + $delta / $n;
        //uses the already-updated mean; this is what keeps the running sum numerically stable
        $M2 = $M2 + $delta * ($x - $mean);
    }
    //no values: avoid dividing by zero and report neutral statistics
    if ($n === 0) {
        return array("mean" => 0.0, "stddev" => 0.0, "variance" => 0.0);
    }
    $variance = $M2 / $n;
    return array("mean" => $mean, "stddev" => sqrt($variance), "variance" => $variance);
}
/**
 * Build the Vowpal Wabbit feature text for one point in time from faceted
 * date histograms over the "dodgers" index.
 *
 * Two searches are executed: one over the 24 hours before $time
 * (5m/10m/30m/1h buckets) and one over the 7 days before $time
 * (1h/2h/6h/12h/24h buckets). Each facet becomes its own namespace
 * (" |<facet name>"), and every bucket contributes, per metric, a log2
 * feature, a linear feature and two one-hot features (rounded and exact).
 *
 * If the generated text contains "inf" or "nan" it is printed and the
 * script terminates.
 *
 * @param \Sherlock\Sherlock $sherlock client used to run the searches
 * @param int $time unix timestamp marking the end of both windows
 * @return string feature text, beginning with " |time_24hr_5m_bucket"
 */
function vwFacets($sherlock, $time) {
    $dateFormat = "Y-m-d\TH:i:sO";
    $timeTo = date($dateFormat, $time);
    $query = Sherlock::queryBuilder()->MatchAll();

    //every facet is a date histogram over timestamp/value; only name, interval and filter vary
    $histogram = function ($name, $interval, $filter) {
        return Sherlock::facetBuilder()->DateHistogram()->facetname($name)
            ->key_field('timestamp')
            ->value_field('value')
            ->interval($interval)
            ->facet_filter($filter);
    };

    //turns the listed facets of a search response into VW feature text
    $features = function ($response, $facetNames) {
        $text = "";
        foreach ($facetNames as $facetName) {
            //namespace
            $text .= " |".$facetName;
            //The data granularity is 5m, so we only need "total" on the 5min
            //bucket, since the other metrics will equal "total"
            if ($facetName === 'time_24hr_5m_bucket') {
                $metrics = array("total");
            } else {
                $metrics = array("total", "min", "max", "mean");
            }
            $c = 0;
            foreach ($response->facets[$facetName]['entries'] as $entry) {
                foreach ($metrics as $metric) {
                    $raw = $entry[$metric];
                    //Make sure we don't log(0); absolute value is taken so we don't get complex results
                    $logValue = ($raw == 0) ? 0 : log(abs($raw), 2);
                    $prefix = " ".$facetName."_".$metric."_";
                    //log feature
                    $text .= $prefix."log_".$c.":".$logValue;
                    //linear feature
                    $text .= $prefix.$c.":".$raw;
                    //categorical feature, rounded so more examples share the same category
                    $text .= $prefix.$c."_".round($raw).":1";
                    //categorical feature, no rounding so there are potentially very sparse features
                    $text .= $prefix.$c."_".$raw.":1";
                }
                $c += 1;
            }
        }
        return $text;
    };

    //--- last 24 hours ---
    $filter = Sherlock::filterBuilder()->Range()->field('timestamp')
        ->from(date($dateFormat, strtotime("-24 hour", $time)))
        ->to($timeTo);
    $request = $sherlock->search()->index('dodgers')->type('data')->query($query)
        ->facets(
            $histogram('time_24hr_5m_bucket', '5m', $filter),
            $histogram('time_24hr_10m_bucket', '10m', $filter),
            $histogram('time_24hr_30m_bucket', '30m', $filter),
            $histogram('time_24hr_1h_bucket', '1h', $filter)
        )->size(0);
    $response = $request->execute();
    $out = $features($response, array(
        "time_24hr_5m_bucket",
        "time_24hr_10m_bucket",
        "time_24hr_30m_bucket",
        "time_24hr_1h_bucket"
    ));

    //--- last 7 days; all of these facets are >5min, so every metric is emitted ---
    $filter = Sherlock::filterBuilder()->Range()->field('timestamp')
        ->from(date($dateFormat, strtotime("-7 days", $time)))
        ->to($timeTo);
    $request = $sherlock->search()->index('dodgers')->type('data')->query($query)
        ->facets(
            $histogram('time_7d_1hr_bucket', '1h', $filter),
            $histogram('time_7d_2hr_bucket', '2h', $filter),
            $histogram('time_7d_6hr_bucket', '6h', $filter),
            $histogram('time_7d_12hr_bucket', '12h', $filter),
            $histogram('time_7d_24hr_bucket', '24h', $filter)
        )->size(0);
    $response = $request->execute();
    $out .= $features($response, array(
        "time_7d_1hr_bucket",
        "time_7d_2hr_bucket",
        "time_7d_6hr_bucket",
        "time_7d_12hr_bucket",
        "time_7d_24hr_bucket"
    ));

    //compare against false explicitly: a match at offset 0 is a valid (falsy) position
    if (stripos($out, "inf") !== false || stripos($out, "nan") !== false) {
        pprint($out);
        exit;
    }
    return $out;
}
//Replays data/Dodgers.test through a Vowpal Wabbit daemon and stores each
//prediction, its residual and running residual statistics in Elasticsearch.
$minAttendance = 1000000000;
$maxAttendance = 0;
$label = -1;
//residuals seen so far; feeds the running mean/variance/stddev
$history = array();
//running sum of squared residuals; divided by the residual count to get the RMSE
$mse = 0;
$sherlock = new \Sherlock\Sherlock();
$sherlock->addNode("localhost");
$indexName = 'dodgerstestc';
$typeName = 'data';
//create the ES index to house our prediction data
$mappings = array();
$mappings[] = Sherlock::mappingBuilder($typeName)->Date()->field('timestamp');
$mappings[] = Sherlock::mappingBuilder($typeName)->Number()->field('value')->type('float');
$mappings[] = Sherlock::mappingBuilder($typeName)->Number()->field('baseballgame')->type('integer');
$mappings[] = Sherlock::mappingBuilder($typeName)->Number()->field('prediction')->type('float');
$mappings[] = Sherlock::mappingBuilder($typeName)->Number()->field('residual')->type('float');
$mappings[] = Sherlock::mappingBuilder($typeName)->Number()->field('stdresidual')->type('float');
$mappings[] = Sherlock::mappingBuilder($typeName)->Number()->field('rmse')->type('float');
$mappings[] = Sherlock::mappingBuilder($typeName)->Number()->field('variance')->type('float');
$mappings[] = Sherlock::mappingBuilder($typeName)->Number()->field('stddev')->type('float');
$mappings[] = Sherlock::mappingBuilder($typeName)->Number()->field('mean')->type('float');
$index = $sherlock->index($indexName)->mappings($mappings)->create();
/**
* Load the baseball game event data. Could also load this out of ES
*/
$eventData = array();
$events = file_get_contents('data/Dodgers.events');
$events = explode("\n", $events);
foreach ($events as $event) {
    $data = explode(",", $event);
    //skip blank or truncated lines (e.g. the trailing newline of the file)
    if (count($data) < 4) {
        continue;
    }
    //Game data is labeled during the game itself, but traffic lags about two hours after the game
    //because that is when the game lets out. We are going to adjust our start/end times to
    //shift the label onto the traffic spike, instead of the game.
    //The window opens 20 minutes before the game end (rounded to 5 minutes) and lasts two hours.
    $endTime = strtotime("-20 minutes", (int) (round(strtotime($data[0].' '.$data[2]) / 300) * 300));
    $eventData[$endTime]['end'] = strtotime("+2 hours", $endTime);
    $eventData[$endTime]['attendance'] = $data[3];
    if ($data[3] < $minAttendance) {
        $minAttendance = $data[3];
    }
    if ($data[3] > $maxAttendance) {
        $maxAttendance = $data[3];
    }
}
/**
* Load up the test data
*/
$events = file_get_contents('data/Dodgers.test');
$events = explode("\n", $events);
foreach ($events as $event) {
    $data = explode(",", $event);
    //skip blank lines and lines that lack the "timestamp,value" pair
    if (count($data) < 2 || trim($data[0]) == '') {
        continue;
    }
    $time = strtotime(trim($data[0]));
    if ($time === false) {
        continue;
    }
    //if this datapoint coincides with an event timestamp, set the label to one
    //and leave it that way until we move out of the event time range
    //This works because all timestamps are rounded to five minute intervals
    if (isset($eventData[$time]['end'])) {
        $label = 1;
        $eventEnd = $eventData[$time]['end'];
        $eventAttendance = $eventData[$time]['attendance'];
    }
    if ($label == 1 && $time > $eventEnd) {
        $label = -1;
    }
    //docData holds the document we are going to insert into ES
    $docData = array();
    $docData['value'] = (int)trim($data[1]) / 100;
    $docData['timestamp'] = date('Y-m-d\TH:i:sO', $time);
    $docData['baseballgame'] = $label;
    //facetData contains the large feature set derived from the faceted time buckets
    $facetData = vwFacets($sherlock, $time);
    //The namespace must be spelled exactly as in the training data ("dategeneral"):
    //VW hashes feature names together with their namespace, so a different
    //spelling would map the date features to weights the model never learned.
    $vwData = "|dategeneral ";
    $vwData .= "date_year:".date("Y", $time)." date_year_".date("Y", $time).":1 ";
    $vwData .= "date_month:".date("m", $time)." date_month_".date("m", $time).":1 ";
    $vwData .= "date_day:".date("d", $time)." date_day_".date("d", $time).":1 ";
    $vwData .= "date_dayweek:".date("N", $time)." date_dayweek_".date("N", $time).":1 ";
    $vwData .= "date_weekyear:".date("W", $time)." date_weekyear_".date("W", $time).":1 ";
    $vwData .= "date_dayyear:".date("z", $time)." date_dayyear_".date("z", $time).":1 ";
    $vwData .= "date_monthyear:".date("n", $time)." date_monthyear_".date("n", $time).":1 ";
    $vwData .= "date_hour:".date("H", $time)." date_hour_".date("H", $time).":1 ";
    $vwData .= "date_minute:".date("i", $time)." date_minute_".date("i", $time).":1 ";
    $vwData .= "date_ampm_".date("a", $time).":1 ";
    $vwData .= $facetData."\n";
    $fp = fsockopen("localhost", 26542, $errno, $errstr, 3);
    if (!$fp) {
        //there is no handle to close when the connection failed; report and move on
        pprint("Could not connect to VW on localhost:26542 (".$errno."): ".$errstr);
        continue;
    }
    //write to VW's socket, get data and close
    fwrite($fp, $vwData);
    $line = fgets($fp, 256);
    fclose($fp);
    $returnData = ($line === false) ? array('') : explode(" ", trim($line));
    //without a numeric prediction none of the residual statistics can be computed
    if (!is_numeric($returnData[0])) {
        pprint("Unexpected response from VW for ".$docData['timestamp']);
        continue;
    }
    $docData['prediction'] = (float) $returnData[0];
    //residual is Actual - Predicted
    $docData['residual'] = $docData['value'] - $docData['prediction'];
    //history is used to calculate residual mean/variance/stddev
    $history[] = $docData['residual'];
    $stats = stddev($history);
    $docData['mean'] = $stats['mean'];
    $docData['variance'] = $stats['variance'];
    $docData['stddev'] = $stats['stddev'];
    $docData['stdresidual'] = ($docData['stddev'] == 0) ? 0 : ($docData['residual'] - $docData['mean']) / $docData['stddev'];
    $mse += pow($docData['residual'], 2);
    //divide by the number of residuals accumulated so far (never zero at this point)
    $docData['rmse'] = sqrt($mse / count($history));
    pprint($docData);
    //insert into ES
    $docs = $sherlock->document()->index($indexName)->type($typeName)->document($docData)->execute();
}
<?php
error_reporting(E_ERROR | E_WARNING | E_PARSE);
require 'vendor/autoload.php';
use Sherlock\Sherlock;
/**
 * Write the print_r() rendering of a value, followed by a CRLF line ending.
 *
 * @param mixed $value value to display
 */
function pprint($value) {
    $rendered = print_r($value, true);
    echo $rendered."\r\n";
}
/**
 * Build the Vowpal Wabbit feature text for one point in time from faceted
 * date histograms over the "dodgers" index.
 *
 * Two searches are executed: one over the 24 hours before $time
 * (5m/10m/30m/1h buckets) and one over the 7 days before $time
 * (1h/2h/6h/12h/24h buckets). Each facet becomes its own namespace
 * (" |<facet name>"), and every bucket contributes, per metric, a log2
 * feature, a linear feature and two one-hot features (rounded and exact).
 *
 * If the generated text contains "inf" or "nan" it is printed and the
 * script terminates.
 *
 * @param \Sherlock\Sherlock $sherlock client used to run the searches
 * @param int $time unix timestamp marking the end of both windows
 * @return string feature text, beginning with " |time_24hr_5m_bucket"
 */
function vwFacets($sherlock, $time) {
    $dateFormat = "Y-m-d\TH:i:sO";
    $timeTo = date($dateFormat, $time);
    $query = Sherlock::queryBuilder()->MatchAll();

    //every facet is a date histogram over timestamp/value; only name, interval and filter vary
    $histogram = function ($name, $interval, $filter) {
        return Sherlock::facetBuilder()->DateHistogram()->facetname($name)
            ->key_field('timestamp')
            ->value_field('value')
            ->interval($interval)
            ->facet_filter($filter);
    };

    //turns the listed facets of a search response into VW feature text
    $features = function ($response, $facetNames) {
        $text = "";
        foreach ($facetNames as $facetName) {
            //namespace
            $text .= " |".$facetName;
            //The data granularity is 5m, so we only need "total" on the 5min
            //bucket, since the other metrics will equal "total"
            if ($facetName === 'time_24hr_5m_bucket') {
                $metrics = array("total");
            } else {
                $metrics = array("total", "min", "max", "mean");
            }
            $c = 0;
            foreach ($response->facets[$facetName]['entries'] as $entry) {
                foreach ($metrics as $metric) {
                    $raw = $entry[$metric];
                    //Make sure we don't log(0); absolute value is taken so we don't get complex results
                    $logValue = ($raw == 0) ? 0 : log(abs($raw), 2);
                    $prefix = " ".$facetName."_".$metric."_";
                    //log feature
                    $text .= $prefix."log_".$c.":".$logValue;
                    //linear feature
                    $text .= $prefix.$c.":".$raw;
                    //categorical feature, rounded so more examples share the same category
                    $text .= $prefix.$c."_".round($raw).":1";
                    //categorical feature, no rounding so there are potentially very sparse features
                    $text .= $prefix.$c."_".$raw.":1";
                }
                $c += 1;
            }
        }
        return $text;
    };

    //--- last 24 hours ---
    $filter = Sherlock::filterBuilder()->Range()->field('timestamp')
        ->from(date($dateFormat, strtotime("-24 hour", $time)))
        ->to($timeTo);
    $request = $sherlock->search()->index('dodgers')->type('data')->query($query)
        ->facets(
            $histogram('time_24hr_5m_bucket', '5m', $filter),
            $histogram('time_24hr_10m_bucket', '10m', $filter),
            $histogram('time_24hr_30m_bucket', '30m', $filter),
            $histogram('time_24hr_1h_bucket', '1h', $filter)
        )->size(0);
    $response = $request->execute();
    $out = $features($response, array(
        "time_24hr_5m_bucket",
        "time_24hr_10m_bucket",
        "time_24hr_30m_bucket",
        "time_24hr_1h_bucket"
    ));

    //--- last 7 days; all of these facets are >5min, so every metric is emitted ---
    $filter = Sherlock::filterBuilder()->Range()->field('timestamp')
        ->from(date($dateFormat, strtotime("-7 days", $time)))
        ->to($timeTo);
    $request = $sherlock->search()->index('dodgers')->type('data')->query($query)
        ->facets(
            $histogram('time_7d_1hr_bucket', '1h', $filter),
            $histogram('time_7d_2hr_bucket', '2h', $filter),
            $histogram('time_7d_6hr_bucket', '6h', $filter),
            $histogram('time_7d_12hr_bucket', '12h', $filter),
            $histogram('time_7d_24hr_bucket', '24h', $filter)
        )->size(0);
    $response = $request->execute();
    $out .= $features($response, array(
        "time_7d_1hr_bucket",
        "time_7d_2hr_bucket",
        "time_7d_6hr_bucket",
        "time_7d_12hr_bucket",
        "time_7d_24hr_bucket"
    ));

    //compare against false explicitly: a match at offset 0 is a valid (falsy) position
    if (stripos($out, "inf") !== false || stripos($out, "nan") !== false) {
        pprint($out);
        exit;
    }
    return $out;
}
//Generates the two Vowpal Wabbit training files from data/Dodgers.train:
//data/train.real (label = scaled value) and data/train.binary
//(label = game / no game, importance-weighted).
$sherlock = new \Sherlock\Sherlock();
$sherlock->addNode("localhost");

//label window start => array('end' => window end, 'attendance' => attendance)
$eventData = array();
$events = file_get_contents('data/Dodgers.events');
$events = explode("\n", $events);
$minAttendance = 1000000000;
$maxAttendance = 0;
foreach ($events as $event) {
    $data = explode(",", $event);
    //skip blank or truncated lines; they would otherwise register a bogus
    //event and overwrite $minAttendance with an undefined value
    if (count($data) < 4) {
        continue;
    }
    //the label window opens 20 minutes before the game end (rounded to 5 minutes)
    //and stays open for two hours
    $endTime = strtotime("-20 minutes", (int) (round(strtotime($data[0].' '.$data[2]) / 300) * 300));
    $eventData[$endTime]['end'] = strtotime("+2 hours", $endTime);
    $eventData[$endTime]['attendance'] = $data[3];
    if ($data[3] < $minAttendance) {
        $minAttendance = $data[3];
    }
    if ($data[3] > $maxAttendance) {
        $maxAttendance = $data[3];
    }
}

$events = file_get_contents('data/Dodgers.train');
$events = explode("\n", $events);
$eventAttendance = 0;
$eventEnd = 0;
$labelBinary = -1;
$labelReal = 0;
$counter = 0;
$outReal = "";
$outBinary = "";
foreach ($events as $event) {
    $data = explode(",", $event);
    //skip blank lines and lines that lack the "timestamp,value" pair so that
    //no training example is generated from an unparseable row
    if (count($data) < 2 || trim($data[0]) == '') {
        continue;
    }
    $time = strtotime(trim($data[0]));
    if ($time === false) {
        continue;
    }
    /**
    * if this time is in our master Dodgers event list,
    * a label window has just opened. Set label = 1
    * and don't change it until the timestamp is after the end
    * of the window
    */
    if (isset($eventData[$time]['end'])) {
        $labelBinary = 1;
        $eventEnd = $eventData[$time]['end'];
        $eventAttendance = $eventData[$time]['attendance'];
    }
    /**
    * Window has ended, set label back to -1
    */
    if ($labelBinary == 1 && $time > $eventEnd) {
        $labelBinary = -1;
    }
    //labelReal is used for the real-valued model, where we want the value of the cars
    //and not baseball game
    $labelReal = (int)trim($data[1]) / 100;
    //obtain the various time buckets
    $facetData = vwFacets($sherlock, $time);
    //the timestamp sits directly before the pipe (VW's tag position); the actual
    //label and weight are prepended below. Then start the features
    $out = " ".$time."|dategeneral ";
    $out .= "date_year:".date("Y", $time)." date_year_".date("Y", $time).":1 ";
    $out .= "date_month:".date("m", $time)." date_month_".date("m", $time).":1 ";
    $out .= "date_day:".date("d", $time)." date_day_".date("d", $time).":1 ";
    $out .= "date_dayweek:".date("N", $time)." date_dayweek_".date("N", $time).":1 ";
    $out .= "date_weekyear:".date("W", $time)." date_weekyear_".date("W", $time).":1 ";
    $out .= "date_dayyear:".date("z", $time)." date_dayyear_".date("z", $time).":1 ";
    $out .= "date_monthyear:".date("n", $time)." date_monthyear_".date("n", $time).":1 ";
    $out .= "date_hour:".date("H", $time)." date_hour_".date("H", $time).":1 ";
    $out .= "date_minute:".date("i", $time)." date_minute_".date("i", $time).":1 ";
    $out .= "date_ampm_".date("a", $time).":1 ";
    $out .= $facetData."\n";
    $outReal .= $labelReal.$out;
    //for the binary classification, we label things -1 or 1
    //and importance weight the positive examples by relative attendance
    if ($labelBinary == 1) {
        $weight = ($eventAttendance / $maxAttendance);
        $outBinary .= $labelBinary." ".$weight.$out;
    } else {
        $outBinary .= $labelBinary." 0.01".$out;
    }
    echo ".";
    //flush the buffers to disk every 1000 examples to keep memory use bounded
    if ($counter % 1000 == 0) {
        file_put_contents("data/train.real", $outReal, FILE_APPEND);
        $outReal = "";
        file_put_contents("data/train.binary", $outBinary, FILE_APPEND);
        $outBinary = "";
    }
    $counter += 1;
}
//write whatever is left over from the last partial batch
file_put_contents("data/train.real", $outReal, FILE_APPEND);
file_put_contents("data/train.binary", $outBinary, FILE_APPEND);
@wild-thomas
Copy link

Hi!
I really appreciate your work. Unfortunately your website http://euphonious-intuition.com is offline. :(
Could you fix it?

@joernroeder
Copy link

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment