Skip to content

Instantly share code, notes, and snippets.

@xaptronic
Forked from andreiz/classifier.php
Last active December 14, 2015 09:39
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save xaptronic/5066263 to your computer and use it in GitHub Desktop.
<?php
error_reporting(E_ALL);

define('NUM_FEATURES', 3);

// Dataset: cities around the world under consideration for living.
// Each sample (city) has 3 features:
//   * Feature 1: average low winter temperature in the city
//   * Feature 2: city population, in millions
//   * Feature 3: does the city have an airport reachable directly from USA?
// Labels (categories) are 1 (yes) and 0 (no); all data is floating-point.
$training = array(
    array(-11.,  2.6,  1.),
    array(  8.,  0.78, 1.),
    array( 15.,  4.2,  0.),
    array(-16.,  0.18, 0.),
    array(  3.,  1.1,  0.),
    array(  7.,  1.4,  1.),
    array( -3.,  1.44, 1.),
    array( -7.,  0.52, 0.),
    array( 30.,  0.82, 1.),
    array( 20.,  1.32, 0.),
);
$labels = array(0., 1., 0., 0., 1., 1., 1., 0., 0., 1);

$NUM_SAMPLES = count($training);

// Random starting weights. There are always 1+NUM_FEATURES of them:
// the extra leading weight is the bias term, since
//   weights * features = weight0 + weight1*feature1 + weight2*feature2 + ...
$weights = array();
for ($w = 0; $w < NUM_FEATURES + 1; $w++) {
    $weights[$w] = mt_rand() / mt_getrandmax() * 5.0;
}

$learning_rate = 0.05;
$steps = 20000;     // number of gradient descent iterations
$temp = array();    // staging area so all weights update simultaneously

for ($n = 0; $n < $steps; $n++) {
    // Compute the descent step for every weight before committing any of
    // them, so each update is based on the same (previous) weight vector.
    for ($j = 0; $j < NUM_FEATURES + 1; $j++) {
        $sum_m = 0.0;
        for ($i = 0; $i < $NUM_SAMPLES; $i++) {
            $h = hypothesis($training[$i], $weights);
            // The bias weight ($j == 0) pairs with a constant 1 "feature".
            $feature = ($j == 0) ? 1.0 : $training[$i][$j - 1];
            $sum_m += ($h - $labels[$i]) * $feature;
        }
        $temp[$j] = $weights[$j] - $learning_rate * $sum_m / $NUM_SAMPLES;
    }
    $weights = $temp;
}

echo "Executed $n steps\n";
echo "Weights: ", vector_to_str($weights), "\n";

// Check how well the model reproduces its own training labels.
print "\nValidating training\n";
$correct = 0;
for ($i = 0; $i < $NUM_SAMPLES; $i++) {
    $predict = predict($training[$i], $weights);
    printf("Input: %-16s actual: %d, predict: %d", vector_to_str($training[$i]), $labels[$i], $predict);
    if ($labels[$i] != $predict) {
        print " - miss";
    }
    print "\n";
    if ($predict == $labels[$i]) {
        $correct++;
    }
}
printf("Correctness = %.0f%%\n", $correct / $NUM_SAMPLES * 100.0);

// Try the model on samples it has never seen.
print "\nTesting the model\n";
$test = array(
    array( -1., 1.1, 1.),
    array( 23., 0.9, 0.),
    array(  4., 1.9, 0.),
    array(-14., 1.1, 1.),
);
foreach ($test as $sample) {
    printf("Input: %-16s predict: %d\n", vector_to_str($sample), predict($sample, $weights));
}
function hypothesis($x, $weights)
{
    // Dot product of the feature vector with the weights, where
    // $weights[0] is the bias term paired with an implicit 1 feature.
    $score = $weights[0];
    foreach ($x as $idx => $value) {
        $score += $weights[$idx + 1] * $value;
    }
    // Squash the score through the sigmoid (logistic) function into (0, 1).
    return 1.0 / (1.0 + exp(-$score));
}
function predict($input, $weights)
{
    // Classify by thresholding the sigmoid output at 0.5:
    // >= 0.5 maps to class 1, anything below to class 0.
    return hypothesis($input, $weights) >= 0.50 ? 1 : 0;
}
function vector_to_str($x)
{
    // Render a numeric vector as "[a, b, c]" for display.
    return sprintf('[%s]', implode(", ", $x));
}
?>
#!/usr/bin/perl
use strict;
use warnings;

use constant NUM_FEATURES  => 3;
use constant STEPS         => 20000;
#use constant STEPS => 4;
use constant LEARNING_RATE => 0.05;

use Data::Dumper;

# prototypes
sub hypothesis($$);
sub predict($$);

# Each sample (city): average low winter temperature, city population in
# millions, and whether the city has an airport I can fly to from the USA.
my $training = [
    [ -11., 2.6,  1. ],
    [   8., 0.78, 1. ],
    [  15., 4.2,  0. ],
    [ -16., 0.18, 0. ],
    [   3., 1.1,  0. ],
    [   7., 1.4,  1. ],
    [  -3., 1.44, 1. ],
    [  -7., 0.52, 0. ],
    [  30., 0.82, 1. ],
    [  20., 1.32, 0. ],
];

# The label (category: 1 = yes, 0 = no) for each training sample above.
my $labels = [
    0., 1., 0., 0., 1., 1., 1., 0., 0., 1.,
];

# ten samples
my $NUM_SAMPLES = @$training;

# Random starting weights. There are NUM_FEATURES+1 of them (not one per
# feature): the extra leading weight is the bias term, paired with an
# implicit constant-1 feature inside hypothesis().
my $weights = [];
until (@$weights >= NUM_FEATURES + 1) {
    my $rand = rand;
    push @$weights, $rand * 6;
}

my $new_weights = [];

# Gradient descent: on every step, compute the update for each weight from
# the whole training set, then commit all updates simultaneously so every
# update is based on the same (previous) weight vector.
my $n;
for ($n = 0; $n < STEPS; $n++) {
    for (my $j = 0; $j < NUM_FEATURES + 1; $j++) {
        my $sum_m = 0.0;    # accumulated gradient (slope) for weight $j
        for (my $i = 0; $i < $NUM_SAMPLES; $i++) {
            # Logistic-regression prediction for this sample.
            my $h = hypothesis($training->[$i], $weights);
            # Partial derivative of the cost: prediction error times the
            # feature value ($j == 0 is the bias, paired with constant 1).
            $sum_m += ($h - $labels->[$i]) * ($j == 0 ? 1.0 : $training->[$i]->[$j-1]);
        }
        $new_weights->[$j] = $weights->[$j] - LEARNING_RATE * $sum_m / $NUM_SAMPLES;
    }
    @$weights = @$new_weights;
}

print "Executed $n steps\n";
print "Weights: ", join(" / ", @$weights). "\n";

# Re-run the model on its own training data to gauge the fit.
print "\nValidating training\n";
my $correct = 0;
for (my $i = 0; $i < $NUM_SAMPLES; $i++) {
    my $prediction = predict($training->[$i], $weights);
    printf("Input %-16s actual: %d, predict %d", join(",", @{$training->[$i]}), $labels->[$i], $prediction);
    if ($labels->[$i] != $prediction) {
        print " - miss";
    } else {
        $correct++;
    }
    print "\n";
}
# Report the hit rate; previously $correct was accumulated but never
# printed. This matches the PHP version of this script.
printf("Correctness = %.0f%%\n", $correct / $NUM_SAMPLES * 100.0);
# Logistic-regression hypothesis: sigmoid of the weighted feature sum.
# Takes a feature arrayref and a weights arrayref; $w->[0] is the bias
# ("free") weight, paired with an implicit constant-1 feature.
sub hypothesis($$) {
    my ($x, $w) = @_;

    # Bias term plus the dot product of the remaining weights with the
    # feature vector.
    my $score = $w->[0];
    $score += $w->[$_ + 1] * $x->[$_] for 0 .. $#$x;

    # The sigmoid (logistic) function maps any real score into (0, 1).
    return 1.0 / (1.0 + exp(-$score));
}
# Classify a feature vector with the given weights: threshold the
# sigmoid output of hypothesis() at 0.5, yielding a 0/1 class label.
sub predict($$) {
    my ($x, $w) = @_;
    return hypothesis($x, $w) >= 0.50 ? 1 : 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment