tinychaos42/cluster.php

## cluster.php
<?php
// Resolve the input path: the first CLI argument when given, otherwise the demo JSON.
$inputPath = isset($argv[1]) ? $argv[1] : 'data.json';

// file_get_contents() returns false when the file cannot be read; stop with a
// clear message instead of handing false to json_decode().
$file = file_get_contents($inputPath);
if ($file === false)
{
    file_put_contents('php://stderr', "Unable to read input file: ".$inputPath."\n");
    exit(1);
}

// json_decode() returns null on malformed JSON. The stages below iterate
// $data->articles, so bail out early when that collection is not available.
$data = json_decode($file);
if (!is_object($data) || !isset($data->articles))
{
    file_put_contents('php://stderr', "Input is not valid JSON with an 'articles' collection: ".$inputPath."\n");
    exit(1);
}

// Stop-word list: these tokens are removed from every word bag before scoring.
$functionWords = array(
    'always', 'user', 'service', 'clear', 'very', 'body', 'common', 'really', 'havent', 'return',
    'but', 'on', 'with', 'as', 'for', 'in', 'up', 'just', 'few',
    'a', 'all', 'an', 'another', 'any', 'both', 'each', 'either', 'every',
    'she', 'he', 'him', 'was', 'of', 'who', 'and', 'to', 'it', 'or', 'out', 'not', 'is', 'one', 'be', 'has', 'if', 'you',
    'her', 'his', 'its', 'my', 'neither', 'no', 'other', 'our', 'per', 'some', 'that', 'the', 'their', 'these', 'this', 'those', 'whatever', 'whichever', 'your',
    '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
);

// One entry per document; each stage below adds to or removes from these entries.
$documentStore = array();
// Per-term document-frequency cache, handed by reference to idfCount().
$wordFrequencyInDs = array();
// Document titles, used at the end to print titles instead of ids.
$documentTitles = array();

// create word bags
echo "Creating word bags...\n";
foreach ($data->articles as $document)
{
    // Build a fresh entry per document: content + title tokens with the stop-words removed.
    $documentWordbagEntry = array(
        'words' => array_diff(createDocumentWordBag($document), $functionWords),
    );
    // Append the entry and its title in lock-step so both arrays share the same
    // sequential index. Keying the titles by the article key (as before) breaks the
    // id -> title lookup whenever the articles are not keyed 0..n-1.
    $documentStore[] = $documentWordbagEntry;
    $documentTitles[] = stripslashes($document->title);
}

// calculate the relevant index numbers
echo "Calculating index numbers...\n";
foreach ($documentStore as $k=>$document)
{
    // term => tf-idf score for this document
    $termData = array();
    foreach ($document['words'] as $term)
    {
        // don't re-do for the same word again
        if (!array_key_exists($term, $termData))
        {
            // tf-idf
            $termData[$term] = termCount($term, $document['words']) * idfCount($term, $documentStore, $wordFrequencyInDs);
        }
    }
    // Store the scores once, after the loop. Doing it inside the loop left
    // 'indexes' undefined for documents with an empty word bag, which then
    // broke the asort() call in the next stage.
    $documentStore[$k]['indexes'] = $termData;
}

// get the top x in each document
echo "Checking top keywords in each document...\n";
foreach (array_keys($documentStore) as $docId)
{
    // Sort ascending by score so the strongest terms sit at the end of the list.
    $scores = $documentStore[$docId]['indexes'];
    asort($scores);
    $documentStore[$docId]['top'] = getTopXTerms($scores, 15);
    // Only the top terms are needed from here on; drop the bulky intermediate data.
    unset($documentStore[$docId]['words'], $documentStore[$docId]['indexes']);
}

// check if there are correlations
echo "Checking correlations...\n";
foreach ($documentStore as $k=>$document)
{
    $documentStore[$k]['related'] = array();
    foreach ($document['top'] as $word)
    {
        // Numeric-looking terms come back from array keys as integers; compare as strings.
        $word = (string)$word;
        foreach ($documentStore as $j=>$document2)
        {
            // Skip the document itself and documents already recorded as related.
            if ($k == $j || in_array($j, $documentStore[$k]['related']))
            {
                continue;
            }
            foreach ($document2['top'] as $word2)
            {
                // Two documents are related when a top term of the other document
                // occurs inside one of this document's top terms. strpos() !== false
                // is an explicit containment test, unlike the truthiness of strstr().
                if (strpos($word, (string)$word2) !== false)
                {
                    $documentStore[$k]['related'][] = $j;
                    // One shared term is enough; stop scanning this document.
                    break;
                }
            }
        }
    }
}
// create the clusters based on the correlations
echo "Creating clusters based on the correlations...\n";
$clusters = array();
foreach ($documentStore as $k=>$document)
{
    // Find the cluster this document already belongs to, or open a new one for it.
    $clusterIndex = documentInCluster($k, $clusters);
    if ($clusterIndex === false)
    {
        $clusters[] = array($k);
        $clusterIndex = sizeof($clusters) - 1;
    }

    // Pull every related document that is not clustered yet into the same cluster.
    foreach ($document['related'] as $relatedId)
    {
        if (documentInCluster($relatedId, $clusters) === false)
        {
            $clusters[$clusterIndex][] = $relatedId;
        }
    }
}
// Swapping document id-s with titles for readability
echo "Swapping document id-s with titles for readability...\n";
foreach (array_keys($clusters) as $clusterIndex)
{
    // Rebuild the member list with titles, keeping positions unchanged.
    $titled = array();
    foreach ($clusters[$clusterIndex] as $position=>$documentId)
    {
        $titled[$position] = $documentTitles[$documentId];
    }
    $clusters[$clusterIndex] = $titled;
}

// Output the results
foreach ($clusters as $clusterIndex=>$titles)
{
    // Cluster numbers are printed 1-based.
    echo "Cluster ".($clusterIndex+1)." contents:\n";
    foreach ($titles as $title)
    {
        echo "\t".$title."\n";
    }
}

/**
 * Find the cluster that already contains a document id.
 *
 * Clusters are scanned in order and the search stops at the first hit.
 *
 * @param mixed $id       document id to look for
 * @param array $clusters list of clusters, each an array of document ids
 * @return bool|int key of the first cluster containing $id, or false when none does
 */
function documentInCluster($id, $clusters)
{
    foreach ($clusters as $clusterKey=>$members)
    {
        if (!in_array($id, $members))
        {
            continue;
        }
        return $clusterKey;
    }
    return false;
}

/**
 * Builds one flat word list from a document's content and title.
 *
 * The content is unescaped with stripcslashes() and the title with stripslashes()
 * before each is tokenised by createAttributeWordBag(); content words come first.
 * NOTE(review): the two fields use different unescape functions - confirm this is intended.
 *
 * @param object $document object exposing `content` and `title` properties
 * @return array
 */
function createDocumentWordBag($document)
{
    $contentWords = createAttributeWordBag(stripcslashes($document->content));
    $titleWords = createAttributeWordBag(stripslashes($document->title));
    return array_merge($contentWords, $titleWords);
}

/**
 * Lower-cases a text, replaces punctuation and whitespace with spaces and
 * returns the remaining words of at least four bytes.
 *
 * The keys of the returned array are the positions of the surviving fragments
 * after splitting, so they are not necessarily consecutive.
 *
 * @param string|null $attribute raw text; null is treated as an empty string
 * @return array
 */
function createAttributeWordBag($attribute)
{
    // Every character here becomes a word boundary. Tab, vertical tab and form feed
    // are included so that words separated only by such whitespace are not fused
    // into a single token.
    $punctuationPattern = array("+",",",".","-","\"","&","!","?",":",";","#","~","=","/","$","£","^","(",")","_","<",">","\r", "\r\n", "\n", "*", "'", "\t", "\v", "\f");
    $text = str_replace($punctuationPattern, ' ', strtolower((string)$attribute));
    $result = explode(' ', $text);
    foreach ($result as $k=>$res)
    {
        // Drops empty fragments (length 0) and short words alike.
        if (strlen($res) < 4)
        {
            unset($result[$k]);
        }
    }
    return $result;
}

/**
 * Relative frequency of a term inside a word list.
 *
 * The number of occurrences is divided by the total number of words, so long
 * documents do not win simply by containing more words.
 *
 * @param string $term      term to count
 * @param array  $textArray word list of one document
 *
 * @return int|float occurrences / list size, or 0 when the term is absent
 */
function termCount($term, $textArray)
{
    $frequencies = array_count_values($textArray);
    if (!isset($frequencies[$term]))
    {
        return 0;
    }
    return $frequencies[$term] / sizeof($textArray);
}

/**
 * Calculate the idf score for a term: log(number of documents / documents containing the term).
 *
 * The document frequency is looked up with termInDocumentStore() the first time a
 * term is seen and cached in $wordFrequencyInDs for later calls.
 *
 * @param string $term
 * @param array  $documentStore      list of document entries
 * @param array  &$wordFrequencyInDs term => document-frequency cache, updated in place
 * @return float idf score; 0.0 when the term occurs in no document
 */
function idfCount($term, $documentStore, &$wordFrequencyInDs)
{
    if (!array_key_exists($term, $wordFrequencyInDs))
    {
        $wordFrequencyInDs[$term] = termInDocumentStore($term, $documentStore);
    }
    $count = abs($wordFrequencyInDs[$term]);

    // A term found in no document would divide by zero below; it carries no
    // weight, so score it as 0.
    if ($count == 0)
    {
        return 0.0;
    }
    return log(sizeof($documentStore) / $count);
}

/**
 * Count the documents whose word list contains the term - for the idf calculation.
 *
 * @param string $term
 * @param array  $ds document entries, each with a 'words' array of string tokens
 * @return int number of documents containing the term
 */
function termInDocumentStore($term, $ds)
{
    // Compare as strings, strictly: a loose comparison treats numeric-looking
    // tokens such as '1000' and '1e3' as equal and inflates the count.
    $needle = (string)$term;
    $count = 0;
    foreach ($ds as $d)
    {
        if (in_array($needle, $d['words'], true))
        {
            $count++;
        }
    }
    return $count;
}

/**
 * Get the top x terms after the tf-idf has been calculated.
 *
 * Takes the last $x entries of the list, so the list is expected to be sorted
 * ascending by score already. When the list holds fewer than $x entries, all
 * terms are returned.
 *
 * @param array $wordList term => score
 * @param int   $x        maximum number of terms to return
 * @return array list of terms as strings; empty when $wordList is empty
 */
function getTopXTerms($wordList, $x)
{
    // Always return an array, even when there is nothing to slice.
    $ret = array();
    // preserve_keys = true: the keys ARE the terms. Without it array_slice()
    // renumbers numeric-looking terms (e.g. '2015') to 0, 1, 2, ...
    $sliced = array_slice($wordList, sizeof($wordList) - $x, null, true);
    foreach (array_keys($sliced) as $term)
    {
        // PHP stores numeric-looking keys as integers; hand back strings consistently.
        $ret[] = (string)$term;
    }
    return $ret;
}


?>
	<?php
	// Resolve the input path: the first CLI argument when given, otherwise the demo JSON.
	$inputPath = isset($argv[1]) ? $argv[1] : 'data.json';

	// file_get_contents() returns false when the file cannot be read; stop with a
	// clear message instead of handing false to json_decode().
	$file = file_get_contents($inputPath);
	if ($file === false)
	{
	    file_put_contents('php://stderr', "Unable to read input file: ".$inputPath."\n");
	    exit(1);
	}

	// json_decode() returns null on malformed JSON. The stages below iterate
	// $data->articles, so bail out early when that collection is not available.
	$data = json_decode($file);
	if (!is_object($data) || !isset($data->articles))
	{
	    file_put_contents('php://stderr', "Input is not valid JSON with an 'articles' collection: ".$inputPath."\n");
	    exit(1);
	}

	// Stop-word list: these tokens are removed from every word bag before scoring.
	$functionWords = array(
	    'always', 'user', 'service', 'clear', 'very', 'body', 'common', 'really', 'havent', 'return',
	    'but', 'on', 'with', 'as', 'for', 'in', 'up', 'just', 'few',
	    'a', 'all', 'an', 'another', 'any', 'both', 'each', 'either', 'every',
	    'she', 'he', 'him', 'was', 'of', 'who', 'and', 'to', 'it', 'or', 'out', 'not', 'is', 'one', 'be', 'has', 'if', 'you',
	    'her', 'his', 'its', 'my', 'neither', 'no', 'other', 'our', 'per', 'some', 'that', 'the', 'their', 'these', 'this', 'those', 'whatever', 'whichever', 'your',
	    '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
	);

	// One entry per document; each stage below adds to or removes from these entries.
	$documentStore = array();
	// Per-term document-frequency cache, handed by reference to idfCount().
	$wordFrequencyInDs = array();
	// Document titles, used at the end to print titles instead of ids.
	$documentTitles = array();

	// create word bags
	echo "Creating word bags...\n";
	foreach ($data->articles as $document)
	{
	    // Build a fresh entry per document: content + title tokens with the stop-words removed.
	    $documentWordbagEntry = array(
	        'words' => array_diff(createDocumentWordBag($document), $functionWords),
	    );
	    // Append the entry and its title in lock-step so both arrays share the same
	    // sequential index. Keying the titles by the article key (as before) breaks the
	    // id -> title lookup whenever the articles are not keyed 0..n-1.
	    $documentStore[] = $documentWordbagEntry;
	    $documentTitles[] = stripslashes($document->title);
	}

	// calculate the relevant index numbers
	echo "Calculating index numbers...\n";
	foreach ($documentStore as $k=>$document)
	{
	    // term => tf-idf score for this document
	    $termData = array();
	    foreach ($document['words'] as $term)
	    {
	        // don't re-do for the same word again
	        if (!array_key_exists($term, $termData))
	        {
	            // tf-idf
	            $termData[$term] = termCount($term, $document['words']) * idfCount($term, $documentStore, $wordFrequencyInDs);
	        }
	    }
	    // Store the scores once, after the loop. Doing it inside the loop left
	    // 'indexes' undefined for documents with an empty word bag, which then
	    // broke the asort() call in the next stage.
	    $documentStore[$k]['indexes'] = $termData;
	}

	// get the top x in each document
	echo "Checking top keywords in each document...\n";
	foreach (array_keys($documentStore) as $docId)
	{
	    // Sort ascending by score so the strongest terms sit at the end of the list.
	    $scores = $documentStore[$docId]['indexes'];
	    asort($scores);
	    $documentStore[$docId]['top'] = getTopXTerms($scores, 15);
	    // Only the top terms are needed from here on; drop the bulky intermediate data.
	    unset($documentStore[$docId]['words'], $documentStore[$docId]['indexes']);
	}

	// check if there are correlations
	echo "Checking correlations...\n";
	foreach ($documentStore as $k=>$document)
	{
	    $documentStore[$k]['related'] = array();
	    foreach ($document['top'] as $word)
	    {
	        // Numeric-looking terms come back from array keys as integers; compare as strings.
	        $word = (string)$word;
	        foreach ($documentStore as $j=>$document2)
	        {
	            // Skip the document itself and documents already recorded as related.
	            if ($k == $j || in_array($j, $documentStore[$k]['related']))
	            {
	                continue;
	            }
	            foreach ($document2['top'] as $word2)
	            {
	                // Two documents are related when a top term of the other document
	                // occurs inside one of this document's top terms. strpos() !== false
	                // is an explicit containment test, unlike the truthiness of strstr().
	                if (strpos($word, (string)$word2) !== false)
	                {
	                    $documentStore[$k]['related'][] = $j;
	                    // One shared term is enough; stop scanning this document.
	                    break;
	                }
	            }
	        }
	    }
	}
	// create the clusters based on the correlations
	echo "Creating clusters based on the correlations...\n";
	$clusters = array();
	foreach ($documentStore as $k=>$document)
	{
	    // Find the cluster this document already belongs to, or open a new one for it.
	    $clusterIndex = documentInCluster($k, $clusters);
	    if ($clusterIndex === false)
	    {
	        $clusters[] = array($k);
	        $clusterIndex = sizeof($clusters) - 1;
	    }

	    // Pull every related document that is not clustered yet into the same cluster.
	    foreach ($document['related'] as $relatedId)
	    {
	        if (documentInCluster($relatedId, $clusters) === false)
	        {
	            $clusters[$clusterIndex][] = $relatedId;
	        }
	    }
	}
	// Swapping document id-s with titles for readability
	echo "Swapping document id-s with titles for readability...\n";
	foreach (array_keys($clusters) as $clusterIndex)
	{
	    // Rebuild the member list with titles, keeping positions unchanged.
	    $titled = array();
	    foreach ($clusters[$clusterIndex] as $position=>$documentId)
	    {
	        $titled[$position] = $documentTitles[$documentId];
	    }
	    $clusters[$clusterIndex] = $titled;
	}

	// Output the results
	foreach ($clusters as $clusterIndex=>$titles)
	{
	    // Cluster numbers are printed 1-based.
	    echo "Cluster ".($clusterIndex+1)." contents:\n";
	    foreach ($titles as $title)
	    {
	        echo "\t".$title."\n";
	    }
	}

	/**
	 * Find the cluster that already contains a document id.
	 *
	 * Clusters are scanned in order and the search stops at the first hit.
	 *
	 * @param mixed $id       document id to look for
	 * @param array $clusters list of clusters, each an array of document ids
	 * @return bool|int key of the first cluster containing $id, or false when none does
	 */
	function documentInCluster($id, $clusters)
	{
	    foreach ($clusters as $clusterKey=>$members)
	    {
	        if (!in_array($id, $members))
	        {
	            continue;
	        }
	        return $clusterKey;
	    }
	    return false;
	}

	/**
	 * Builds one flat word list from a document's content and title.
	 *
	 * The content is unescaped with stripcslashes() and the title with stripslashes()
	 * before each is tokenised by createAttributeWordBag(); content words come first.
	 * NOTE(review): the two fields use different unescape functions - confirm this is intended.
	 *
	 * @param object $document object exposing `content` and `title` properties
	 * @return array
	 */
	function createDocumentWordBag($document)
	{
	    $contentWords = createAttributeWordBag(stripcslashes($document->content));
	    $titleWords = createAttributeWordBag(stripslashes($document->title));
	    return array_merge($contentWords, $titleWords);
	}

	/**
	 * Lower-cases a text, replaces punctuation and whitespace with spaces and
	 * returns the remaining words of at least four bytes.
	 *
	 * The keys of the returned array are the positions of the surviving fragments
	 * after splitting, so they are not necessarily consecutive.
	 *
	 * @param string|null $attribute raw text; null is treated as an empty string
	 * @return array
	 */
	function createAttributeWordBag($attribute)
	{
	    // Every character here becomes a word boundary. Tab, vertical tab and form feed
	    // are included so that words separated only by such whitespace are not fused
	    // into a single token.
	    $punctuationPattern = array("+",",",".","-","\"","&","!","?",":",";","#","~","=","/","$","£","^","(",")","_","<",">","\r", "\r\n", "\n", "*", "'", "\t", "\v", "\f");
	    $text = str_replace($punctuationPattern, ' ', strtolower((string)$attribute));
	    $result = explode(' ', $text);
	    foreach ($result as $k=>$res)
	    {
	        // Drops empty fragments (length 0) and short words alike.
	        if (strlen($res) < 4)
	        {
	            unset($result[$k]);
	        }
	    }
	    return $result;
	}

	/**
	 * Relative frequency of a term inside a word list.
	 *
	 * The number of occurrences is divided by the total number of words, so long
	 * documents do not win simply by containing more words.
	 *
	 * @param string $term      term to count
	 * @param array  $textArray word list of one document
	 *
	 * @return int|float occurrences / list size, or 0 when the term is absent
	 */
	function termCount($term, $textArray)
	{
	    $frequencies = array_count_values($textArray);
	    if (!isset($frequencies[$term]))
	    {
	        return 0;
	    }
	    return $frequencies[$term] / sizeof($textArray);
	}

	/**
	 * Calculate the idf score for a term: log(number of documents / documents containing the term).
	 *
	 * The document frequency is looked up with termInDocumentStore() the first time a
	 * term is seen and cached in $wordFrequencyInDs for later calls.
	 *
	 * @param string $term
	 * @param array  $documentStore      list of document entries
	 * @param array  &$wordFrequencyInDs term => document-frequency cache, updated in place
	 * @return float idf score; 0.0 when the term occurs in no document
	 */
	function idfCount($term, $documentStore, &$wordFrequencyInDs)
	{
	    if (!array_key_exists($term, $wordFrequencyInDs))
	    {
	        $wordFrequencyInDs[$term] = termInDocumentStore($term, $documentStore);
	    }
	    $count = abs($wordFrequencyInDs[$term]);

	    // A term found in no document would divide by zero below; it carries no
	    // weight, so score it as 0.
	    if ($count == 0)
	    {
	        return 0.0;
	    }
	    return log(sizeof($documentStore) / $count);
	}

	/**
	 * Count the documents whose word list contains the term - for the idf calculation.
	 *
	 * @param string $term
	 * @param array  $ds document entries, each with a 'words' array of string tokens
	 * @return int number of documents containing the term
	 */
	function termInDocumentStore($term, $ds)
	{
	    // Compare as strings, strictly: a loose comparison treats numeric-looking
	    // tokens such as '1000' and '1e3' as equal and inflates the count.
	    $needle = (string)$term;
	    $count = 0;
	    foreach ($ds as $d)
	    {
	        if (in_array($needle, $d['words'], true))
	        {
	            $count++;
	        }
	    }
	    return $count;
	}

	/**
	 * Get the top x terms after the tf-idf has been calculated.
	 *
	 * Takes the last $x entries of the list, so the list is expected to be sorted
	 * ascending by score already. When the list holds fewer than $x entries, all
	 * terms are returned.
	 *
	 * @param array $wordList term => score
	 * @param int   $x        maximum number of terms to return
	 * @return array list of terms as strings; empty when $wordList is empty
	 */
	function getTopXTerms($wordList, $x)
	{
	    // Always return an array, even when there is nothing to slice.
	    $ret = array();
	    // preserve_keys = true: the keys ARE the terms. Without it array_slice()
	    // renumbers numeric-looking terms (e.g. '2015') to 0, 1, 2, ...
	    $sliced = array_slice($wordList, sizeof($wordList) - $x, null, true);
	    foreach (array_keys($sliced) as $term)
	    {
	        // PHP stores numeric-looking keys as integers; hand back strings consistently.
	        $ret[] = (string)$term;
	    }
	    return $ret;
	}


	?>