Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
<?php
require 'vendor/autoload.php';
use Sherlock\Sherlock;
/**
 * Dump a value in print_r() format, terminated by a CRLF line ending.
 *
 * @param mixed $value value to display
 */
function pprint($value) {
    echo print_r($value, true), "\r\n";
}
// Loader: creates the 'dodgers' index and bulk-inserts every "timestamp,value"
// line of data/Dodgers.data into it.
$sherlock = new Sherlock();
$sherlock->addNode("localhost");
$indexName = 'dodgers';
$typeName = 'data';

// Read the input first so that nothing is created in ES when the file is missing.
$rawEvents = file_get_contents('data/Dodgers.data');
if ($rawEvents === false) {
    pprint("Unable to read data/Dodgers.data");
    exit(1);
}
$events = explode("\n", $rawEvents);

// Explicit mapping: 'timestamp' as a date, 'value' as a float.
$mappings = array();
$mappings[] = Sherlock::mappingBuilder($typeName)->Date()->field('timestamp');
$mappings[] = Sherlock::mappingBuilder($typeName)->Number()->type('float')->field('value');
$index = $sherlock->index($indexName)->mappings($mappings)->create();

// Documents are queued on one batch request and sent by the single execute() below.
$docs = $sherlock->document()->index($indexName)->type($typeName);
foreach ($events as $event) {
    $data = explode(",", $event);
    // Skip blank lines (e.g. the trailing newline) and lines without a value column.
    if (trim($data[0]) == '' || !isset($data[1])) {
        continue;
    }
    $timestamp = strtotime(trim($data[0]));
    if ($timestamp === false) {
        // Unparseable date: do not index a bogus document.
        continue;
    }
    $docData = array();
    // NOTE(review): strtotime() yields epoch SECONDS; confirm the 'timestamp' date
    // mapping interprets this number as intended (the prediction script indexes
    // ISO-8601 strings instead).
    $docData['timestamp'] = $timestamp;
    $docData['value'] = trim($data[1]);
    $docs->document($docData);
}
$response = $docs->execute();
<?php
require 'vendor/autoload.php';
use Sherlock\Sherlock;
error_reporting(E_ERROR | E_WARNING | E_PARSE);
/**
 * Dump a value in print_r() format, terminated by a CRLF line ending.
 *
 * @param mixed $value value to display
 */
function pprint($value) {
    echo print_r($value, true), "\r\n";
}
/**
 * Single-pass mean, variance and standard deviation of a list of numbers.
 *
 * Adapted from
 * http://cad.cx/blog/2008/06/30/single-pass-standard-deviation-in-php/
 * (modified to return mean and variance as well), which uses the on-line
 * algorithm described at
 * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#III._On-line_algorithm
 *
 * The variance is divided by n (not n - 1).
 *
 * @param array $array numeric samples
 * @return array keys "mean", "stddev" and "variance"; all zero for empty input
 */
function stddev($array){
    $n = 0;
    $mean = 0;
    $M2 = 0;
    foreach ($array as $x) {
        $n++;
        $delta = $x - $mean;
        // Update the running mean, then accumulate the squared deviation using
        // both the old ($delta) and the new ($x - $mean) distance.
        $mean = $mean + $delta / $n;
        $M2 = $M2 + $delta * ($x - $mean);
    }
    if ($n === 0) {
        // No samples: avoid dividing by zero below.
        return array("mean" => 0, "stddev" => 0, "variance" => 0);
    }
    $variance = $M2 / $n;
    return array("mean" => $mean, "stddev" => sqrt($variance), "variance" => $variance);
}
/**
 * Build the facet-derived part of a VW example for one point in time.
 *
 * Runs two searches against index 'dodgers' / type 'data':
 *  - the 24 hours before $time, bucketed at 5m, 10m, 30m and 1h;
 *  - the 7 days before $time, bucketed at 1h, 2h, 6h, 12h and 24h.
 * Every facet becomes its own namespace (" |<facet name>") followed by the
 * features produced by vwFacetFeatures() for each histogram entry.
 *
 * If the resulting text contains "inf" or "nan" (case-insensitive) it is
 * printed and the script exits instead of returning.
 *
 * @param \Sherlock\Sherlock $sherlock client used to execute the searches
 * @param int $time Unix timestamp that ends both look-back windows
 * @return string feature text, starting with " |time_24hr_5m_bucket"
 */
function vwFacets($sherlock, $time) {
    $allStats = array("total", "min", "max", "mean");
    $query = Sherlock::queryBuilder()->MatchAll();
    $timeTo = date("Y-m-d\TH:i:sO", $time);
    $out = "";

    // ---- last 24 hours ----
    $timeFrom = date("Y-m-d\TH:i:sO", strtotime("-24 hour", $time));
    $filter = Sherlock::filterBuilder()->Range()->field('timestamp')->from($timeFrom)->to($timeTo);
    $facet = vwHistogramFacet('time_24hr_5m_bucket', '5m', $filter);
    $facet2 = vwHistogramFacet('time_24hr_10m_bucket', '10m', $filter);
    $facet3 = vwHistogramFacet('time_24hr_30m_bucket', '30m', $filter);
    $facet4 = vwHistogramFacet('time_24hr_1h_bucket', '1h', $filter);
    $request = $sherlock->search()->index('dodgers')->type('data')->query($query)->facets($facet, $facet2, $facet3, $facet4)->size(0);
    $response = $request->execute();

    //The data granularity is 5m, so we only need "total" on the 5min bucket,
    //since the other metrics will equal "total"
    $dayFacets = array(
        "time_24hr_5m_bucket" => array("total"),
        "time_24hr_10m_bucket" => $allStats,
        "time_24hr_30m_bucket" => $allStats,
        "time_24hr_1h_bucket" => $allStats,
    );
    foreach ($dayFacets as $facetName => $statNames) {
        $out .= vwFacetFeatures($facetName, $response->facets[$facetName]['entries'], $statNames);
    }

    // ---- last 7 days ----
    $timeFrom = date("Y-m-d\TH:i:sO", strtotime("-7 days", $time));
    $filter = Sherlock::filterBuilder()->Range()->field('timestamp')->from($timeFrom)->to($timeTo);
    $facet = vwHistogramFacet('time_7d_1hr_bucket', '1h', $filter);
    $facet2 = vwHistogramFacet('time_7d_2hr_bucket', '2h', $filter);
    $facet3 = vwHistogramFacet('time_7d_6hr_bucket', '6h', $filter);
    $facet4 = vwHistogramFacet('time_7d_12hr_bucket', '12h', $filter);
    $facet5 = vwHistogramFacet('time_7d_24hr_bucket', '24h', $filter);
    $request = $sherlock->search()->index('dodgers')->type('data')->query($query)->facets($facet, $facet2, $facet3, $facet4, $facet5)->size(0);
    $response = $request->execute();

    //all of these facets are >5min, so we can use all combinations for each facet
    $weekFacets = array("time_7d_1hr_bucket",
        "time_7d_2hr_bucket",
        "time_7d_6hr_bucket",
        "time_7d_12hr_bucket",
        "time_7d_24hr_bucket");
    foreach ($weekFacets as $facetName) {
        $out .= vwFacetFeatures($facetName, $response->facets[$facetName]['entries'], $allStats);
    }

    // Compare against false explicitly: a bare stripos()/strpos() result of 0 is
    // falsy and would hide a match at the very start of the string.
    if (stripos($out, "inf") !== false || stripos($out, "nan") !== false) {
        pprint($out);
        exit;
    }
    return $out;
}

/**
 * Internal helper for vwFacets(): one date-histogram facet keyed on
 * 'timestamp', aggregating 'value', restricted by $filter.
 *
 * @param string $name facet name (also used as the namespace in the output)
 * @param string $interval bucket width, e.g. '5m' or '12h'
 * @param mixed $filter range filter built with Sherlock::filterBuilder()
 * @return mixed the configured facet builder
 */
function vwHistogramFacet($name, $interval, $filter) {
    return Sherlock::facetBuilder()->DateHistogram()->facetname($name)
        ->key_field('timestamp')
        ->value_field('value')
        ->interval($interval)
        ->facet_filter($filter);
}

/**
 * Internal helper for vwFacets(): render one facet as " |<facet>" followed by
 * four features per (entry, statistic) pair.
 *
 * @param string $facetName facet / namespace name
 * @param array $entries histogram entries, each indexable by the names in $statNames
 * @param array $statNames statistics to emit, e.g. array("total", "min", "max", "mean")
 * @return string namespace header plus features
 */
function vwFacetFeatures($facetName, $entries, $statNames) {
    $text = " |".$facetName;
    $position = 0;
    foreach ($entries as $entry) {
        foreach ($statNames as $statName) {
            $raw = $entry[$statName];
            //Make sure we don't log(0); absolute value is taken so negative
            //inputs do not produce NAN
            $logValue = ($raw == 0) ? 0 : log(abs($raw), 2);
            $prefix = " ".$facetName."_".$statName."_";
            //log feature
            $text .= $prefix."log_".$position.":".$logValue;
            //linear feature
            $text .= $prefix.$position.":".$raw;
            //categorical feature, rounded so more examples share the same category
            $text .= $prefix.$position."_".round($raw).":1";
            //categorical feature, no rounding so there are potentially very sparse features
            $text .= $prefix.$position."_".$raw.":1";
        }
        $position += 1;
    }
    return $text;
}
// Prediction script: replays data/Dodgers.test through the VW daemon on
// localhost:26542 and indexes each prediction together with running residual
// statistics into the 'dodgerstestc' index.

$counter = 0;                 // predictions received so far (RMSE denominator)
$minAttendance = 1000000000;
$maxAttendance = 0;
$label = -1;                  // 1 while inside a labelled game window, otherwise -1
$eventEnd = 0;                // end of the current game window (Unix time)
$eventData = array();         // window start time => array('end' => ..., 'attendance' => ...)
$history = array();           // every residual seen so far
$mse = 0;                     // running SUM of squared residuals
$sherlock = new \Sherlock\Sherlock();
$sherlock->addNode("localhost");
$indexName = 'dodgerstestc';
$typeName = 'data';

//create the ES index to house our prediction data
$mappings = array();
$mappings[] = Sherlock::mappingBuilder($typeName)->Date()->field('timestamp');
$mappings[] = Sherlock::mappingBuilder($typeName)->Number()->field('value')->type('float');
$mappings[] = Sherlock::mappingBuilder($typeName)->Number()->field('baseballgame')->type('integer');
$mappings[] = Sherlock::mappingBuilder($typeName)->Number()->field('prediction')->type('float');
$mappings[] = Sherlock::mappingBuilder($typeName)->Number()->field('residual')->type('float');
$mappings[] = Sherlock::mappingBuilder($typeName)->Number()->field('stdresidual')->type('float');
$mappings[] = Sherlock::mappingBuilder($typeName)->Number()->field('rmse')->type('float');
$mappings[] = Sherlock::mappingBuilder($typeName)->Number()->field('variance')->type('float');
$mappings[] = Sherlock::mappingBuilder($typeName)->Number()->field('stddev')->type('float');
$mappings[] = Sherlock::mappingBuilder($typeName)->Number()->field('mean')->type('float');
$index = $sherlock->index($indexName)->mappings($mappings)->create();

/**
 * Load the baseball game event data. Could also load this out of ES
 */
$rawEvents = file_get_contents('data/Dodgers.events');
if ($rawEvents === false) {
    pprint("Unable to read data/Dodgers.events");
    exit(1);
}
foreach (explode("\n", $rawEvents) as $event) {
    $data = explode(",", $event);
    // Columns used: 0 = date, 2 = end time, 3 = attendance. Skip blank/short lines.
    if (count($data) < 4) {
        continue;
    }
    $gameEnd = strtotime($data[0].' '.$data[2]);
    if ($gameEnd === false) {
        continue;
    }
    //Game data is labeled during the game itself, but traffic lags behind because
    //the spike happens when the game lets out. The labelled window therefore starts
    //20 minutes before the game's end (rounded to 5 minutes) and lasts two hours.
    $endTime = strtotime("-20 minutes", (int) (round($gameEnd / 300) * 300));
    $eventData[$endTime]['end'] = strtotime("+2 hours", $endTime);
    $eventData[$endTime]['attendance'] = $data[3];
    if ($data[3] < $minAttendance) {
        $minAttendance = $data[3];
    }
    if ($data[3] > $maxAttendance) {
        $maxAttendance = $data[3];
    }
}

/**
 * Load up the test data
 */
$rawEvents = file_get_contents('data/Dodgers.test');
if ($rawEvents === false) {
    pprint("Unable to read data/Dodgers.test");
    exit(1);
}
// Feature name => date() format; each yields "date_<name>:<v> date_<name>_<v>:1 ".
$dateFeatures = array(
    'year' => 'Y',
    'month' => 'm',
    'day' => 'd',
    'dayweek' => 'N',
    'weekyear' => 'W',
    'dayyear' => 'z',
    'monthyear' => 'n',
    'hour' => 'H',
    'minute' => 'i',
);
foreach (explode("\n", $rawEvents) as $event) {
    $data = explode(",", $event);
    // Skip blank lines (e.g. the trailing newline) and lines without a value column.
    if (count($data) < 2) {
        continue;
    }
    $time = strtotime(trim($data[0]));
    if ($time === false) {
        continue;
    }

    //if this datapoint coincides with an event timestamp, set the label to one
    //and leave it that way until we move out of the event time range
    //This works because all timestamps are rounded to five minute intervals
    if (isset($eventData[$time]['end'])) {
        $label = 1;
        $eventEnd = $eventData[$time]['end'];
        $eventAttendance = $eventData[$time]['attendance'];
    }
    if ($label == 1 && $time > $eventEnd) {
        $label = -1;
    }

    //docData holds the document we are going to insert into ES
    $docData = array();
    $docData['value'] = (int)trim($data[1]) / 100;
    $docData['timestamp'] = date('Y-m-d\TH:i:sO', $time);
    $docData['baseballgame'] = $label;

    //facetData contains the large feature set derived from the faceted time buckets
    $facetData = vwFacets($sherlock, $time);

    // The namespace must be spelled exactly as in the training data
    // ("dategeneral"); VW hashes features per namespace, so a different name
    // would not hit the weights learned for these features.
    $vwData = "|dategeneral ";
    foreach ($dateFeatures as $featureName => $format) {
        $formatted = date($format, $time);
        $vwData .= "date_".$featureName.":".$formatted." date_".$featureName."_".$formatted.":1 ";
    }
    $vwData .= "date_ampm_".date("a", $time).":1 ";
    $vwData .= $facetData."\n";

    $fp = fsockopen("localhost", 26542, $errno, $errstr, 3);
    if (!$fp) {
        // No connection, so there is nothing to close: report and move on.
        pprint("Could not connect to VW on localhost:26542 (".$errno."): ".$errstr);
        continue;
    }

    //write to VW's socket, get data and close
    fwrite($fp, $vwData);
    $reply = fgets($fp, 256);
    fclose($fp);
    $returnData = explode(" ", trim((string) $reply));
    if (!is_numeric($returnData[0])) {
        // Empty or malformed answer: skip rather than index a bogus prediction.
        pprint("Unexpected reply from VW for ".$docData['timestamp']);
        continue;
    }
    $docData['prediction'] = (float) $returnData[0];

    //residual is Actual - Predicted
    $docData['residual'] = $docData['value'] - $docData['prediction'];

    //history is used to calculate residual mean/variance/stddev
    $history[] = $docData['residual'];
    $stats = stddev($history);
    $docData['mean'] = $stats['mean'];
    $docData['variance'] = $stats['variance'];
    $docData['stddev'] = $stats['stddev'];
    $docData['stdresidual'] = ($docData['stddev'] == 0) ? 0 :($docData['residual'] - $docData['mean']) / $docData['stddev'];

    // Count this prediction BEFORE dividing, so the first row divides by 1
    // (not 0) and the denominator equals the number of squared residuals summed.
    $mse += pow($docData['residual'], 2);
    $counter += 1;
    $docData['rmse'] = sqrt($mse / $counter);
    pprint($docData);

    //insert into ES
    $docs = $sherlock->document()->index($indexName)->type($typeName)->document($docData)->execute();
}
<?php
error_reporting(E_ERROR | E_WARNING | E_PARSE);
require 'vendor/autoload.php';
use Sherlock\Sherlock;
/**
 * Dump a value in print_r() format, terminated by a CRLF line ending.
 *
 * @param mixed $value value to display
 */
function pprint($value) {
    echo print_r($value, true), "\r\n";
}
/**
 * Build the facet-derived feature text for one point in time (presumably
 * Vowpal Wabbit input, given the "vw" prefix and the "|namespace name:value"
 * layout).
 *
 * Runs two searches against index 'dodgers' / type 'data':
 *  - the 24 hours before $time, bucketed at 5m, 10m, 30m and 1h;
 *  - the 7 days before $time, bucketed at 1h, 2h, 6h, 12h and 24h.
 * Every facet becomes its own namespace (" |<facet name>") followed by the
 * features produced by vwFacetFeatures() for each histogram entry.
 *
 * If the resulting text contains "inf" or "nan" (case-insensitive) it is
 * printed and the script exits instead of returning.
 *
 * @param \Sherlock\Sherlock $sherlock client used to execute the searches
 * @param int $time Unix timestamp that ends both look-back windows
 * @return string feature text, starting with " |time_24hr_5m_bucket"
 */
function vwFacets($sherlock, $time) {
    $allStats = array("total", "min", "max", "mean");
    $query = Sherlock::queryBuilder()->MatchAll();
    $timeTo = date("Y-m-d\TH:i:sO", $time);
    $out = "";

    // ---- last 24 hours ----
    $timeFrom = date("Y-m-d\TH:i:sO", strtotime("-24 hour", $time));
    $filter = Sherlock::filterBuilder()->Range()->field('timestamp')->from($timeFrom)->to($timeTo);
    $facet = vwHistogramFacet('time_24hr_5m_bucket', '5m', $filter);
    $facet2 = vwHistogramFacet('time_24hr_10m_bucket', '10m', $filter);
    $facet3 = vwHistogramFacet('time_24hr_30m_bucket', '30m', $filter);
    $facet4 = vwHistogramFacet('time_24hr_1h_bucket', '1h', $filter);
    $request = $sherlock->search()->index('dodgers')->type('data')->query($query)->facets($facet, $facet2, $facet3, $facet4)->size(0);
    $response = $request->execute();

    //The data granularity is 5m, so we only need "total" on the 5min bucket,
    //since the other metrics will equal "total"
    $dayFacets = array(
        "time_24hr_5m_bucket" => array("total"),
        "time_24hr_10m_bucket" => $allStats,
        "time_24hr_30m_bucket" => $allStats,
        "time_24hr_1h_bucket" => $allStats,
    );
    foreach ($dayFacets as $facetName => $statNames) {
        $out .= vwFacetFeatures($facetName, $response->facets[$facetName]['entries'], $statNames);
    }

    // ---- last 7 days ----
    $timeFrom = date("Y-m-d\TH:i:sO", strtotime("-7 days", $time));
    $filter = Sherlock::filterBuilder()->Range()->field('timestamp')->from($timeFrom)->to($timeTo);
    $facet = vwHistogramFacet('time_7d_1hr_bucket', '1h', $filter);
    $facet2 = vwHistogramFacet('time_7d_2hr_bucket', '2h', $filter);
    $facet3 = vwHistogramFacet('time_7d_6hr_bucket', '6h', $filter);
    $facet4 = vwHistogramFacet('time_7d_12hr_bucket', '12h', $filter);
    $facet5 = vwHistogramFacet('time_7d_24hr_bucket', '24h', $filter);
    $request = $sherlock->search()->index('dodgers')->type('data')->query($query)->facets($facet, $facet2, $facet3, $facet4, $facet5)->size(0);
    $response = $request->execute();

    //all of these facets are >5min, so we can use all combinations for each facet
    $weekFacets = array("time_7d_1hr_bucket",
        "time_7d_2hr_bucket",
        "time_7d_6hr_bucket",
        "time_7d_12hr_bucket",
        "time_7d_24hr_bucket");
    foreach ($weekFacets as $facetName) {
        $out .= vwFacetFeatures($facetName, $response->facets[$facetName]['entries'], $allStats);
    }

    // Compare against false explicitly: a bare stripos()/strpos() result of 0 is
    // falsy and would hide a match at the very start of the string.
    if (stripos($out, "inf") !== false || stripos($out, "nan") !== false) {
        pprint($out);
        exit;
    }
    return $out;
}

/**
 * Internal helper for vwFacets(): one date-histogram facet keyed on
 * 'timestamp', aggregating 'value', restricted by $filter.
 *
 * @param string $name facet name (also used as the namespace in the output)
 * @param string $interval bucket width, e.g. '5m' or '12h'
 * @param mixed $filter range filter built with Sherlock::filterBuilder()
 * @return mixed the configured facet builder
 */
function vwHistogramFacet($name, $interval, $filter) {
    return Sherlock::facetBuilder()->DateHistogram()->facetname($name)
        ->key_field('timestamp')
        ->value_field('value')
        ->interval($interval)
        ->facet_filter($filter);
}

/**
 * Internal helper for vwFacets(): render one facet as " |<facet>" followed by
 * four features per (entry, statistic) pair.
 *
 * @param string $facetName facet / namespace name
 * @param array $entries histogram entries, each indexable by the names in $statNames
 * @param array $statNames statistics to emit, e.g. array("total", "min", "max", "mean")
 * @return string namespace header plus features
 */
function vwFacetFeatures($facetName, $entries, $statNames) {
    $text = " |".$facetName;
    $position = 0;
    foreach ($entries as $entry) {
        foreach ($statNames as $statName) {
            $raw = $entry[$statName];
            //Make sure we don't log(0); absolute value is taken so negative
            //inputs do not produce NAN
            $logValue = ($raw == 0) ? 0 : log(abs($raw), 2);
            $prefix = " ".$facetName."_".$statName."_";
            //log feature
            $text .= $prefix."log_".$position.":".$logValue;
            //linear feature
            $text .= $prefix.$position.":".$raw;
            //categorical feature, rounded so more examples share the same category
            $text .= $prefix.$position."_".round($raw).":1";
            //categorical feature, no rounding so there are potentially very sparse features
            $text .= $prefix.$position."_".$raw.":1";
        }
        $position += 1;
    }
    return $text;
}
// Training-data generator: turns data/Dodgers.train into two example files,
// data/train.real (label = scaled car count) and data/train.binary
// (label = -1/1 game indicator with an importance weight).

$sherlock = new \Sherlock\Sherlock();
$sherlock->addNode("localhost");

/**
 * Load the baseball game event data.
 */
$rawEvents = file_get_contents('data/Dodgers.events');
if ($rawEvents === false) {
    pprint("Unable to read data/Dodgers.events");
    exit(1);
}
$minAttendance = 1000000000;
$maxAttendance = 0;
$eventData = array();   // window start time => array('end' => ..., 'attendance' => ...)
foreach (explode("\n", $rawEvents) as $event) {
    $data = explode(",", $event);
    // Columns used: 0 = date, 2 = end time, 3 = attendance. Skip blank/short lines.
    if (count($data) < 4) {
        continue;
    }
    $gameEnd = strtotime($data[0].' '.$data[2]);
    if ($gameEnd === false) {
        continue;
    }
    // The labelled window starts 20 minutes before the game's end time
    // (rounded to 5 minutes) and lasts two hours.
    $endTime = strtotime("-20 minutes", (int) (round($gameEnd / 300) * 300));
    $eventData[$endTime]['end'] = strtotime("+2 hours", $endTime);
    $eventData[$endTime]['attendance'] = $data[3];
    if ($data[3] < $minAttendance) {
        $minAttendance = $data[3];
    }
    if ($data[3] > $maxAttendance) {
        $maxAttendance = $data[3];
    }
}

/**
 * Walk the training data and emit one example per line.
 */
$rawEvents = file_get_contents('data/Dodgers.train');
if ($rawEvents === false) {
    pprint("Unable to read data/Dodgers.train");
    exit(1);
}
$eventAttendance = 0;
$eventEnd = 0;          // end of the current game window (Unix time)
$labelBinary = -1;
$labelReal = 0;
$counter = 0;           // examples generated so far; drives the periodic flush
$outReal = "";
$outBinary = "";
// Feature name => date() format; each yields "date_<name>:<v> date_<name>_<v>:1 ".
$dateFeatures = array(
    'year' => 'Y',
    'month' => 'm',
    'day' => 'd',
    'dayweek' => 'N',
    'weekyear' => 'W',
    'dayyear' => 'z',
    'monthyear' => 'n',
    'hour' => 'H',
    'minute' => 'i',
);
foreach (explode("\n", $rawEvents) as $event) {
    $data = explode(",", $event);
    // Skip blank lines (e.g. the trailing newline) and lines without a value column.
    if (count($data) < 2) {
        continue;
    }
    $time = strtotime(trim($data[0]));
    if ($time === false) {
        continue;
    }

    /**
     * If this time is the start of a labelled window in the event list,
     * set label = 1 and don't change it until the timestamp is past the
     * end of that window.
     */
    if (isset($eventData[$time]['end'])) {
        $labelBinary = 1;
        $eventEnd = $eventData[$time]['end'];
        $eventAttendance = $eventData[$time]['attendance'];
    }
    /**
     * Window has ended, set label back to -1
     */
    if ($labelBinary == 1 && $time > $eventEnd) {
        $labelBinary = -1;
    }

    //labelReal is used for the real-valued model, where we want the value of the cars
    //and not the baseball game indicator
    $labelReal = (int)trim($data[1]) / 100;

    //obtain the various time buckets
    $facetData = vwFacets($sherlock, $time);

    //the timestamp is written directly before the "|" (the label is prepended
    //further down), then the features start
    $out = " ".$time."|dategeneral ";
    foreach ($dateFeatures as $featureName => $format) {
        $formatted = date($format, $time);
        $out .= "date_".$featureName.":".$formatted." date_".$featureName."_".$formatted.":1 ";
    }
    $out .= "date_ampm_".date("a", $time).":1 ";
    $out .= $facetData."\n";

    $outReal .= $labelReal.$out;

    //for the binary classification, we label things -1 or 1
    //and importance weight the positive examples
    if ($labelBinary == 1) {
        $weight = ($eventAttendance / $maxAttendance);
        $outBinary .= $labelBinary." ".$weight.$out;
    } else {
        $outBinary .= $labelBinary." 0.01".$out;
    }

    echo ".";

    // Flush to disk every 1000 examples so the buffers stay small.
    if ($counter % 1000 == 0) {
        file_put_contents("data/train.real", $outReal, FILE_APPEND);
        $outReal = "";
        file_put_contents("data/train.binary", $outBinary, FILE_APPEND);
        $outBinary = "";
    }
    $counter += 1;
}
// Write whatever is left after the last periodic flush.
file_put_contents("data/train.real", $outReal, FILE_APPEND);
file_put_contents("data/train.binary", $outBinary, FILE_APPEND);
@kolalait

This comment has been minimized.

Copy link

@kolalait kolalait commented Oct 17, 2014

Hi!
I really appreciate your work. Unfortunately your website http://euphonious-intuition.com is offline. :(
Could you fix it?

@joernroeder

This comment has been minimized.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.