Last active
August 29, 2015 13:56
-
-
Save tdebatty/8964778 to your computer and use it in GitHub Desktop.
NN-Descent - A PHP implementation of NN-Descent algorithm : "Efficient k-nearest neighbor graph construction for generic similarity measures"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/* A quick and dirty PHP implemenation of NNDescent algorithm, as proposed in the paper | |
* Efficient k-nearest neighbor graph construction for generic similarity measures | |
* http://dl.acm.org/citation.cfm?id=1963487 | |
* | |
*/ | |
/** | |
* The algorithm can be used with any kind of item, as long as some | |
* similarity metric is defined | |
* Rmrk : usually, a similarity (0 .. 1) can be computed from a distance using | |
* S = 1 / (1 + d) | |
**/ | |
class Item implements MetricSpaceElement | |
{ | |
public $value = 0; | |
public function __construct($value) { | |
$this->value = $value; | |
} | |
public function Similarity(MetricSpaceElement $other) { | |
return 1 / (1 + abs($this->value - $other->value)); | |
} | |
} | |
$n = 1000; | |
// Create dataset | |
$V = array(); | |
for ($i = 0; $i <$n; $i++) { | |
$V[] = new Item(rand()); | |
} | |
//var_dump($V); | |
$nnd = new NNDescent(); | |
$nnd->K = 3; | |
$nnd->V = $V; | |
$nnd->Run(); | |
//var_dump($nnd->B); | |
$nnd->PrintStats(); | |
class NNDescent | |
{ | |
public $K = 3; | |
public $V = array(); | |
public $B = array(); | |
public $computed_similarities = 0; | |
public $iterations = 0; | |
public $running_time = 0; | |
public function Run() { | |
$start = time(); | |
foreach ($this->V as $i => $v) { | |
$this->B[$i] = $this->sample($this->V, $this->K); | |
} | |
while (1) { | |
$this->iterations++; | |
$R = $this->reverse($this->B); | |
$B2 = array(); | |
foreach ($this->V as $i => $v) { | |
$B2[$i] = array_merge($this->B[$i], $R[$i]); | |
$B2[$i] = $this->kick_doublons($B2[$i]); | |
} | |
$c = 0; | |
foreach ($this->V as $i => $v) { | |
foreach ($B2[$i] as $j => $u1) { | |
foreach ($B2[$j] as $k => $u2) { | |
if ($v !== $u2->item) { | |
$l = $v->Similarity($u2->item); | |
$this->computed_similarities++; | |
$c += $this->updateNN($this->B[$i], $u2->item, $l); | |
} | |
} | |
} | |
} | |
if ($c == 0) { | |
break; | |
} | |
} | |
$this->running_time = time() - $start; | |
} | |
public function PrintStats() { | |
echo "Completion took {$this->iterations} iterations\n"; | |
echo "Computed similarities: {$this->computed_similarities} \n"; | |
echo "Brute force would be: " . (count($this->V) * (count($this->V) - 1) / 2). "\n"; | |
echo "Running time: {$this->running_time} sec\n"; | |
echo "Peak memory usage: " . round(memory_get_peak_usage() / 1024 / 1024) . "MB\n"; | |
} | |
/** | |
* Sample $n elements from array $S | |
**/ | |
protected function sample($S, $n) { | |
$return = array(); | |
while(count($return) < $n) { | |
$i = $S[rand(0, count($S)-1)]; | |
$already_in_return = false; | |
foreach ($return as $OtherGraphElement) { | |
if ($i === $OtherGraphElement->item) { | |
$already_in_return = true; | |
break; | |
} | |
} | |
if (!$already_in_return) { | |
$return[] = new KNNGraphElement($i); | |
} | |
} | |
return $return; | |
} | |
/** | |
* Reverse NN array | |
* $R[v] is the list of elements (u) for which v is a neighbor | |
* (v is in $B[u]) | |
* */ | |
function reverse($B) { | |
$R = array(); | |
// For each Item $v from dataset | |
foreach ($this->V as $i => $v) { | |
$R[$i] = array(); | |
// For each other Item $v2 | |
foreach ($this->V as $j => $v2) { | |
// If $v is a NN of $v2 | |
foreach ($B[$j] as $NeighborOfV2) { | |
if ($v === $NeighborOfV2->item) { | |
$R[$i][] = new KNNGraphElement($v2); | |
} | |
} | |
} | |
} | |
return $R; | |
} | |
// $l is a KNNGraphElement[] | |
function kick_doublons($L) { | |
$return = array(); | |
foreach ($L as $e) { | |
$already_in_return = false; | |
foreach ($return as $other) { | |
if ($other->item === $e->item) { | |
$already_in_return = true; | |
} | |
} | |
if (!$already_in_return) { | |
$return[] = $e; | |
} | |
} | |
return $return; | |
} | |
/** | |
* Update KNN list $K | |
* Try to insert item $u, which has a similarity $l | |
* Return 1 if $u was inserted | |
* 0 otherwise | |
* */ | |
function updateNN(&$K, Item $u, $l) { | |
// Find the element of current NN list that has the smallest similarity | |
// $K is an array of KNNGraphElement | |
$smallest_similarity_i = 0; | |
$smallest_similarity = 1E308; | |
foreach ($K as $i => $GraphElement) { | |
if ($GraphElement->item === $u) { | |
// This item is already in neighborhoods | |
return 0; | |
} | |
if ($GraphElement->similarity < $smallest_similarity) { | |
$smallest_similarity = $GraphElement->similarity; | |
$smallest_similarity_i = $i; | |
} | |
} | |
if ($smallest_similarity < $l) { | |
// Insert $u | |
$K[$smallest_similarity_i] = new KNNGraphElement($u, $l); | |
return 1; | |
} | |
return 0; | |
} | |
} | |
interface MetricSpaceElement | |
{ | |
public function Similarity(MetricSpaceElement $other); | |
} | |
class KNNGraphElement | |
{ | |
public $item; | |
public $similarity; | |
public function __construct(Item $item, $similarity = 0) { | |
$this->item = $item; | |
$this->similarity = $similarity; | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/* A quick and dirty PHP implemenation of NNDescentFull algorithm, as proposed in the paper | |
* Efficient k-nearest neighbor graph construction for generic similarity measures | |
* http://dl.acm.org/citation.cfm?id=1963487 | |
* | |
*/ | |
class Item implements MetricSpaceElement | |
{ | |
public $value = 0; | |
public function __construct($value) { | |
$this->value = $value; | |
} | |
public function Similarity(MetricSpaceElement $other) { | |
return 1 / (1 + abs($this->value - $other->value)); | |
} | |
} | |
$n = 800; | |
// Create dataset | |
$V = array(); | |
for ($i = 0; $i <$n; $i++) { | |
$V[] = new Item(rand()); | |
} | |
//var_dump($V); | |
$nnd = new NNDescentFull(); | |
$nnd->K = 3; | |
$nnd->rho = 0.5; | |
$nnd->delta = 0.001; | |
$nnd->V = $V; | |
$nnd->Run(); | |
//var_dump($nnd->B); | |
$nnd->PrintStats(); | |
class NNDescentFull | |
{ | |
public $V; | |
public $K = 3; | |
public $rho = 0.5; | |
public $delta = 0.001; | |
/** | |
* Contains KNNGraphElements | |
* */ | |
public $B; | |
public $iterations = 0; | |
public $computed_similarities = 0; | |
public $running_time = 0; | |
public function __construct() { | |
$this->B = new SplObjectStorage(); | |
} | |
public function Run() { | |
$start = time(); | |
foreach ($this->V as $v) { | |
$this->B[$v] = $this->sampleGraphElems($this->V, $this->K); | |
} | |
$old = new SplObjectStorage(); | |
$new = new SplObjectStorage(); | |
while (true) { | |
$this->iterations++; | |
foreach ($this->V as $v) { | |
$old[$v] = $this->pick_falses($this->B[$v]); | |
$new[$v] = $this->pick_trues_and_mark_false($this->B[$v], $this->rho); | |
} | |
$old2 = $this->reverse($old); | |
$new2 = $this->reverse($new); | |
$c = 0; | |
foreach ($this->V as $v) { | |
$old[$v] = $this->union($old[$v], $this->sample($old2[$v], $this->rho * $this->K)); | |
$new[$v] = $this->union($new[$v], $this->sample($new2[$v], $this->rho * $this->K)); | |
foreach ($new[$v] as $u1) { | |
// Process only u2 if u1 < u2 | |
$processing = false; | |
foreach ($new[$v] as $u2) { | |
if ($u2 === $u1) { | |
$processing = true; | |
continue; | |
} | |
if ($processing) { | |
$l = $u1->Similarity($u2); | |
$this->computed_similarities++; | |
$this->B[$u1] = $this->updateNN($this->B[$u1], $u2, $l, $c); | |
$this->B[$u2] = $this->updateNN($this->B[$u2], $u1, $l, $c); | |
} | |
} | |
foreach ($old[$v] as $u2) { | |
if ($u2 === $u1) { | |
continue; | |
} | |
$l = $u1->Similarity($u2); | |
$this->computed_similarities++; | |
$this->B[$u1] = $this->updateNN($this->B[$u1], $u2, $l, $c); | |
$this->B[$u2] = $this->updateNN($this->B[$u2], $u1, $l, $c); | |
} | |
} | |
} | |
if ($c < $this->delta * count($this->V) * $this->K) { | |
break; | |
} | |
} | |
$this->running_time = time() - $start; | |
} | |
public function PrintStats() { | |
echo "Completion took {$this->iterations} iterations\n"; | |
echo "Computed similarities: {$this->computed_similarities} \n"; | |
echo "Brute force would be: " . (count($this->V) * (count($this->V) - 1) / 2). "\n"; | |
echo "Running time: {$this->running_time} sec\n"; | |
} | |
/** | |
* $a is an array of KNNGraphElement | |
* return array of items | |
* */ | |
protected function pick_falses($A) { | |
$return = array(); | |
foreach ($A as $element) { | |
if (!$element->new) { | |
$return[] = $element->item; | |
} | |
} | |
return $return; | |
} | |
/** | |
* pick new items (marked as true) with a probability of rho | |
* mark them as false | |
* $A is an array of KNNGraphElement | |
* return array of items | |
* */ | |
protected function pick_trues_and_mark_false($A, $rho) { | |
$return = array(); | |
foreach ($A as $element) { | |
if ($element->new | |
AND (rand() / getrandmax() <= $rho) ) { | |
$element->new = false; | |
$return[] = $element->item; | |
} | |
} | |
return $return; | |
} | |
/** | |
* $S is an array | |
* | |
* */ | |
protected function sample($S, $n) { | |
if (count($S) <= $n) { | |
return $S; | |
} | |
$return = array(); | |
// should use array_rand() !!! | |
while(count($return) < $n) { | |
$i = $S[rand(0, count($S)-1)]; | |
$already_in_return = false; | |
foreach ($return as $OtherGraphElement) { | |
if ($i === $OtherGraphElement) { | |
$already_in_return = true; | |
break; | |
} | |
} | |
if (!$already_in_return) { | |
$return[] = $i; | |
} | |
} | |
return $return; | |
} | |
protected function sampleGraphElems($S, $n) { | |
$return = array(); | |
// should use array_rand() !!! | |
while(count($return) < $n) { | |
$i = $S[rand(0, count($S)-1)]; | |
$already_in_return = false; | |
foreach ($return as $OtherGraphElement) { | |
if ($i === $OtherGraphElement->item) { | |
$already_in_return = true; | |
break; | |
} | |
} | |
if (!$already_in_return) { | |
$return[] = new KNNGraphElement($i); | |
} | |
} | |
return $return; | |
} | |
/** | |
* Reverse NN array | |
* $R[v] is the list of elements (u) for which v is a neighbor | |
* (v is in $B[u]) | |
* */ | |
protected function reverse($B) { | |
$R = new SplObjectStorage(); | |
// For each Item $v from dataset | |
foreach ($this->V as $v) { | |
$r = array(); | |
// For each other Item $v2 | |
foreach ($this->V as $v2) { | |
// If $v is a NN of $v2 | |
foreach ($B[$v2] as $NeighborOfV2) { | |
if ($v === $NeighborOfV2) { | |
$r[] = $v2; | |
} | |
} | |
} | |
$R[$v] = $r; | |
} | |
return $R; | |
} | |
protected function union($a, $b) { | |
foreach ($b as $new) { | |
foreach($a as $other) { | |
if ($new === $other) { | |
continue; | |
} | |
$a[] = $new; | |
} | |
} | |
return $a; | |
} | |
/** | |
* Update KNN list $K | |
* $K is an array of NNGraphElement | |
* Try to insert item $u, which has a similarity $l | |
* Returns the new array of NNGraphElement | |
* If a modification has occured, $c is incremented by one | |
* */ | |
protected function updateNN($K, Item $u, $l, &$c) { | |
// Find the element of current NN list that has the smallest similarity | |
// $K is an array of KNNGraphElement | |
$smallest_similarity_i = 0; | |
$smallest_similarity = 1E308; | |
foreach ($K as $i => $GraphElement) { | |
if ($GraphElement->item === $u) { | |
// This item is already in neighborhoods | |
return $K; | |
} | |
if ($GraphElement->similarity < $smallest_similarity) { | |
$smallest_similarity = $GraphElement->similarity; | |
$smallest_similarity_i = $i; | |
} | |
} | |
if ($smallest_similarity < $l) { | |
// Insert $u | |
$K[$smallest_similarity_i] = new KNNGraphElement($u, $l); | |
$c++; | |
return $K; | |
} | |
return $K; | |
} | |
} | |
interface MetricSpaceElement | |
{ | |
public function Similarity(MetricSpaceElement $other); | |
} | |
class KNNGraphElement | |
{ | |
public $item; | |
public $similarity; | |
public $new = true; | |
public function __construct(Item $item, $similarity = 0) { | |
$this->item = $item; | |
$this->similarity = $similarity; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment