Skip to content

Instantly share code, notes, and snippets.

@xexu
Created September 1, 2015 14:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save xexu/a2f625166d98800e02f5 to your computer and use it in GitHub Desktop.
Save xexu/a2f625166d98800e02f5 to your computer and use it in GitHub Desktop.
A PHP classifier in under 50 lines!
<?php
/**
 * Split a piece of text into lower-cased word tokens.
 *
 * The text is lower-cased and split on runs of whitespace. Leading, trailing
 * or repeated whitespace never produces an empty token, so an empty or
 * blank-only string yields an empty array. Punctuation is left attached to
 * the neighbouring word.
 *
 * @param string $string Text to tokenize.
 * @return string[] Tokens in order of appearance.
 */
function tokenize($string)
{
    // PREG_SPLIT_NO_EMPTY drops the empty pieces a plain explode(" ") would
    // emit for leading/trailing separators and for the empty string.
    return preg_split('/\s+/', strtolower($string), -1, PREG_SPLIT_NO_EMPTY);
}
/**
 * Add one labelled example to the training data and return the updated data.
 *
 * The dataset is an array with two entries:
 *  - 'classes': class label => number of examples trained for that class
 *  - 'tokens':  token => (class label => number of times the token occurred
 *               in examples of that class)
 *
 * The dataset is received by value; the caller must keep the returned copy.
 *
 * @param array  $dataset Training data accumulated so far.
 * @param string $text    Example text; it is split with tokenize().
 * @param mixed  $class   Label of the example (used as an array key).
 * @return array The dataset including this example.
 */
function train($dataset, $text, $class)
{
    // One more example seen for this class.
    if (empty($dataset['classes'][$class])) {
        $dataset['classes'][$class] = 1;
    } else {
        $dataset['classes'][$class] += 1;
    }

    foreach (tokenize($text) as $word) {
        // First sighting of this token: start an empty per-class counter map.
        if (empty($dataset['tokens'][$word])) {
            $dataset['tokens'][$word] = [];
        }

        // Every occurrence counts, so a token repeated in $text is counted repeatedly.
        if (empty($dataset['tokens'][$word][$class])) {
            $dataset['tokens'][$word][$class] = 1;
        } else {
            $dataset['tokens'][$word][$class] += 1;
        }
    }

    return $dataset;
}
/**
 * Score a text against every trained class and rank the classes.
 *
 * For each class the score is the product of the per-token ratios
 * (occurrences of the token in that class / examples of that class)
 * multiplied by the class prior (examples of that class / all examples).
 *
 *  - Tokens that never appeared in training are ignored.
 *  - A token known to other classes but not to this one contributes a tiny
 *    constant instead of zero, so one missing word does not erase the score.
 *  - A token repeated in $text is only counted once.
 *  - If none of the tokens is known, every class scores 0.
 *
 * @param array  $dataset Training data as returned by train().
 * @param string $text    Text to classify; it is split with tokenize().
 * @return array Class label => score, highest score first. Class labels are
 *               preserved as keys, including numeric ones.
 */
function classify($dataset, $text)
{
    $tokens = tokenize($text);

    // Total number of training examples; it is the same for every class,
    // so compute it once instead of on every iteration.
    $totalExamples = array_sum($dataset['classes']);

    $probClasses = [];
    foreach ($dataset['classes'] as $class => $classExamples) {
        $probTokens = [];
        foreach ($tokens as $token) {
            if (empty($dataset['tokens'][$token])) {
                // Token never seen during training: it carries no information.
                continue;
            }
            if (empty($dataset['tokens'][$token][$class])) {
                // Seen, but never in this class: near-zero instead of zero.
                $probTokens[$token] = 0.000000001;
            } else {
                $probTokens[$token] = $dataset['tokens'][$token][$class] / $classExamples;
            }
        }

        $likelihood = empty($probTokens) ? 0 : array_product($probTokens);
        $probClasses[$class] = $likelihood * ($classExamples / $totalExamples);
    }

    // Rank by score, best first. arsort() keeps the class labels as keys;
    // the previous array_multisort() call ordered by class name instead of
    // by score and re-indexed numeric labels.
    arsort($probClasses);

    return $probClasses;
}
// Demo: train a tiny two-class model, classify one sentence and dump the scores.
$dataset = ['tokens' => [], 'classes' => []];

// The training calls and the classify() call must run before the dump below,
// otherwise $classification is undefined.
$dataset = train($dataset, 'not to eat too much is not enough to lose weight', 'health');
$dataset = train($dataset, 'Russia try to invade Ukraine', 'politics');
$dataset = train($dataset, 'do not neglect exercise', 'health');
$dataset = train($dataset, 'Syria is the main issue, Obama says', 'politics');
$dataset = train($dataset, 'eat to lose weight', 'health');
$dataset = train($dataset, 'you should not eat much', 'health');
$classification = classify($dataset, "Obama is");

// Print the class => score map, then stop the script.
var_dump($classification);
die();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment