Skip to content

Instantly share code, notes, and snippets.

@tinychaos42
Created January 17, 2012 21:24
Show Gist options
  • Save tinychaos42/1628965 to your computer and use it in GitHub Desktop.
Save tinychaos42/1628965 to your computer and use it in GitHub Desktop.
The clustering algorithm
<?php
// Pick the input file: the first CLI argument when given, otherwise the demo
// file 'data.json' (resolved against the current working directory).
$inputPath = isset($argv[1]) ? $argv[1] : 'data.json';

// file_get_contents() returns false on failure; stop here rather than feeding
// false into json_decode() and failing later with unrelated warnings.
$file = file_get_contents($inputPath);
if ($file === false) {
    file_put_contents('php://stderr', "Unable to read input file: " . $inputPath . "\n");
    exit(1);
}

// json_decode() yields null for malformed JSON. The rest of the script iterates
// $data->articles, so require that property to exist and to be iterable.
$data = json_decode($file);
if (!is_object($data) || !isset($data->articles) || !(is_array($data->articles) || is_object($data->articles))) {
    file_put_contents('php://stderr', "Input is not valid JSON with an \"articles\" collection: " . $inputPath . "\n");
    exit(1);
}
// Stop-word list removed from every document's word bag before scoring.
$functionWords = array(
    'always', 'user', 'service', 'clear', 'very', 'body', 'common', 'really', 'havent', 'return',
    'but', 'on', 'with', 'as', 'for', 'in', 'up', 'just', 'few', 'a',
    'all', 'an', 'another', 'any', 'both', 'each', 'either', 'every', 'she', 'he',
    'him', 'was', 'of', 'who', 'and', 'to', 'it', 'or', 'out', 'not',
    'is', 'one', 'be', 'has', 'if', 'you', 'her', 'his', 'its', 'my',
    'neither', 'no', 'other', 'our', 'per', 'some', 'that', 'the', 'their', 'these',
    'this', 'those', 'whatever', 'whichever', 'your', '-', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9',
);
// Per-document working data; the position in this list is the document id.
$documentStore = array();
// Cache of document frequencies, filled by reference through idfCount().
$wordFrequencyInDs = array();
// Document id => title, used when printing the clusters.
$documentTitles = array();
// create word bags
echo "Creating word bags...\n";
foreach ($data->articles as $document) {
    // filter stop-words
    $words = array_diff(createDocumentWordBag($document), $functionWords);
    $documentStore[] = array('words' => $words);
    // Append instead of reusing the source key: $documentStore is appended to
    // as well, so both arrays stay on the same 0-based document id even when
    // the articles collection is not a plain 0-based list.
    $documentTitles[] = stripslashes($document->title);
}
// calculate the relevant index numbers (tf-idf score per distinct term)
echo "Calculating index numbers...\n";
foreach ($documentStore as $k => $document) {
    $termData = array();
    foreach ($document['words'] as $term) {
        // don't re-do for the same word again
        if (array_key_exists($term, $termData)) {
            continue;
        }
        // tf-idf
        $termData[$term] = termCount($term, $document['words']) * idfCount($term, $documentStore, $wordFrequencyInDs);
    }
    // Store once per document, outside the term loop, so that a document whose
    // word bag is empty still gets an (empty) 'indexes' entry for the next step.
    $documentStore[$k]['indexes'] = $termData;
}
// Keep only the highest-scoring terms of every document.
echo "Checking top keywords in each document...\n";
$topTermCount = 15;
foreach (array_keys($documentStore) as $docId) {
    // Ascending sort by score: getTopXTerms() takes the tail of the list.
    asort($documentStore[$docId]['indexes']);
    $documentStore[$docId]['top'] = getTopXTerms($documentStore[$docId]['indexes'], $topTermCount);
    // The raw word bag and the score table are not used after this point.
    unset($documentStore[$docId]['words'], $documentStore[$docId]['indexes']);
}
// Relate documents to each other: document A is related to document B when one
// of A's top terms contains one of B's top terms as a substring.
echo "Checking correlations...\n";
foreach ($documentStore as $docId => $entry) {
    $documentStore[$docId]['related'] = array();
    foreach ($entry['top'] as $term) {
        foreach ($documentStore as $otherId => $otherEntry) {
            foreach ($otherEntry['top'] as $otherTerm) {
                if (!strstr($term, $otherTerm)) {
                    continue;
                }
                // Record each related document once, and never the document itself.
                if (in_array($otherId, $documentStore[$docId]['related']) || $docId == $otherId) {
                    continue;
                }
                $documentStore[$docId]['related'][] = $otherId;
            }
        }
    }
}
// Build the clusters in a single pass over the documents: a document that is
// not in any cluster yet opens a new one, and each of its related documents
// that is still unassigned joins the cluster the document belongs to.
echo "Creating clusters based on the correlations...\n";
$clusters = array();
foreach ($documentStore as $docId => $entry) {
    $homeCluster = documentInCluster($docId, $clusters);
    if ($homeCluster === false) {
        $clusters[] = array($docId);
        // The cluster just appended is the last one in the list.
        $homeCluster = sizeof($clusters) - 1;
    }
    foreach ($entry['related'] as $relatedId) {
        if (documentInCluster($relatedId, $clusters) !== false) {
            // Already assigned to some cluster; leave it there.
            continue;
        }
        $clusters[$homeCluster][] = $relatedId;
    }
}
// Replace the document ids inside every cluster with the matching titles.
echo "Swapping document id-s with titles for readability...\n";
foreach ($clusters as $clusterIndex => $memberIds) {
    $memberTitles = array();
    foreach ($memberIds as $position => $memberId) {
        $memberTitles[$position] = $documentTitles[$memberId];
    }
    $clusters[$clusterIndex] = $memberTitles;
}
// Print every cluster: a 1-based header line followed by one tab-indented
// title per line.
foreach ($clusters as $clusterIndex => $titles) {
    $report = "Cluster ".($clusterIndex+1)." contents:\n";
    foreach ($titles as $title) {
        $report .= "\t".$title."\n";
    }
    echo $report;
}
/**
 * Find the cluster that already contains a document.
 *
 * Membership is tested with in_array()'s default (loose) comparison.
 *
 * @param mixed $id       document id to look for
 * @param array $clusters list of clusters, each an array of document ids
 * @return bool|int key of the first cluster holding $id, or false when no
 *                  cluster holds it; the key can be 0, so compare with ===
 */
function documentInCluster($id, $clusters)
{
    $foundAt = false;
    foreach ($clusters as $clusterKey => $members) {
        if (!in_array($id, $members)) {
            continue;
        }
        $foundAt = $clusterKey;
        break;
    }
    return $foundAt;
}
/**
 * Collect the words of a document's content and title into one flat list.
 *
 * The content is un-escaped with stripcslashes() (C-style escape sequences are
 * interpreted) and the title with stripslashes(); each is then tokenised by
 * createAttributeWordBag(). Content words come first, title words after them.
 *
 * @param object $document object exposing ->content and ->title strings
 * @return array merged list of words (array_merge() renumbers the keys)
 */
function createDocumentWordBag($document)
{
    $contentWords = createAttributeWordBag(stripcslashes($document->content));
    $titleWords = createAttributeWordBag(stripslashes($document->title));

    return array_merge($contentWords, $titleWords);
}
/**
 * Lower-cases a text, replaces punctuation and line/tab whitespace with
 * spaces, splits on spaces and keeps only tokens of at least 4 bytes.
 *
 * Tabs are treated as separators as well, so tab-delimited words are not glued
 * together into a single token. The length check uses strlen(), i.e. it counts
 * bytes, not characters.
 *
 * @param string $attribute raw text of one document attribute
 * @return array the remaining words; keys are the token positions after the
 *               split and are NOT renumbered (dropped tokens leave gaps)
 */
function createAttributeWordBag($attribute)
{
    // Every entry is replaced by a single space before splitting.
    $separators = array(
        '+', ',', '.', '-', '"', '&', '!', '?', ':', ';', '#', '~', '=', '/',
        '$', '£', '^', '(', ')', '_', '<', '>', "\r", "\n", "\t", '*', "'",
    );
    $text = str_replace($separators, ' ', strtolower($attribute));
    $result = explode(' ', $text);
    foreach ($result as $k => $res) {
        // Also drops the empty tokens produced by consecutive separators.
        if (strlen($res) < 4) {
            unset($result[$k]);
        }
    }
    return $result;
}
/**
 * Relative frequency of a term inside a list of words.
 *
 * The number of occurrences is divided by the total number of words, so that
 * long documents do not score higher merely because they contain more words.
 *
 * @param string $term      word to count
 * @param array  $textArray list of words (strings or integers)
 *
 * @return float|int occurrences / total words, or 0 when the term is absent
 */
function termCount($term, $textArray)
{
    $frequencies = array_count_values($textArray);
    if (!isset($frequencies[$term])) {
        return 0;
    }

    return $frequencies[$term] / sizeof($textArray);
}
/**
 * Calculate the idf score of a term: ln(number of documents / number of
 * documents containing the term).
 *
 * Document frequencies come from termInDocumentStore() and are memoised in
 * $wordFrequencyInDs, so each term is counted only once per run.
 *
 * @param string $term              term to score
 * @param array  $documentStore     all documents; its size is the numerator
 * @param array  &$wordFrequencyInDs cache of term => document frequency,
 *                                   updated in place
 * @return float the idf score; 0.0 when no document contains the term (this
 *               avoids a division by zero, and such a term carries no weight)
 */
function idfCount($term, $documentStore, &$wordFrequencyInDs)
{
    if (!array_key_exists($term, $wordFrequencyInDs)) {
        $wordFrequencyInDs[$term] = termInDocumentStore($term, $documentStore);
    }
    $count = $wordFrequencyInDs[$term];

    // A term that occurs in no document would make the ratio divide by zero.
    if ($count <= 0) {
        return 0.0;
    }

    return log(sizeof($documentStore) / $count);
}
/**
 * Count how many documents contain the term - the document frequency used by
 * the idf calculation.
 *
 * The comparison is strict on the term's string form. A loose comparison would
 * treat numeric-looking words such as "1000" and "1e3" as the same term, which
 * disagrees with termCount(), where every distinct string is counted on its own.
 *
 * @param string|int $term term to look for
 * @param array      $ds   document store; every entry needs a 'words' array of
 *                         strings
 * @return int number of documents whose 'words' contain the term
 */
function termInDocumentStore($term, $ds)
{
    // Words are strings; normalise the term so an integer term still matches.
    $needle = (string) $term;
    $count = 0;
    foreach ($ds as $d) {
        if (in_array($needle, $d['words'], true)) {
            $count++;
        }
    }
    return $count;
}
/**
 * Get the top x terms after the tf-idf has been calculated.
 *
 * The last $x entries of $wordList are returned, so the list is expected to be
 * sorted ascending by score (the caller does this with asort()). The en-dash
 * term "–" is skipped.
 *
 * @param array $wordList term => score, sorted ascending by score
 * @param int   $x        maximum number of terms to return
 * @return array list of terms (strings) in ascending score order; empty when
 *               $wordList is empty
 */
function getTopXTerms($wordList, $x)
{
    // Always return an array, even when there is nothing to pick from.
    $ret = array();
    $start = max(0, sizeof($wordList) - $x);

    // preserve_keys = true: numeric-looking terms (e.g. "2012") are stored as
    // integer keys, and array_slice() would otherwise renumber them to 0, 1, ...
    $sliced = array_slice($wordList, $start, null, true);
    foreach ($sliced as $term => $value) {
        $term = (string) $term;
        // Skip the stray en-dash token; the check belongs on the term itself.
        if ($term !== "–") {
            $ret[] = $term;
        }
    }
    return $ret;
}
?>
@gaffling
Copy link

gaffling commented Jul 6, 2018

Dear tinychaos42, very nice script - could you please provide an example "data.json" File? Thank you very much! Best Regards

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment