Created
September 1, 2015 14:30
-
-
Save xexu/a2f625166d98800e02f5 to your computer and use it in GitHub Desktop.
A PHP classifier in under 50 lines!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Split a text into lowercase tokens.
 *
 * The text is lowercased and split on any run of whitespace (spaces, tabs,
 * newlines). Empty pieces are dropped, so leading/trailing whitespace and an
 * empty input never produce "" tokens. Punctuation is left attached to the
 * neighbouring word.
 *
 * Note: lowercasing uses strtolower(), which operates on bytes, so non-ASCII
 * letters are not case-folded reliably.
 *
 * @param string $string Raw text to tokenize.
 *
 * @return string[] List of tokens in order of appearance (possibly empty).
 */
function tokenize($string)
{
    $tokens = preg_split('/\s+/', strtolower($string), -1, PREG_SPLIT_NO_EMPTY);

    // preg_split() returns false on a PCRE failure; treat that as "no tokens".
    return $tokens === false ? [] : $tokens;
}
/**
 * Fold one labelled example into the dataset and return the updated dataset.
 *
 * The dataset is received by value, so the caller's array is not modified;
 * the caller must use the returned array.
 *
 * Dataset layout:
 *   'classes' => [class => number of examples trained for that class]
 *   'tokens'  => [token => [class => number of occurrences seen in that class]]
 *
 * @param array  $dataset Dataset accumulated so far.
 * @param string $text    Example text; it is split with tokenize().
 * @param string $class   Label to associate with the text.
 *
 * @return array The dataset including the new example.
 */
function train($dataset, $text, $class)
{
    $words = tokenize($text);

    // Count one more example for this class.
    if (empty($dataset['classes'][$class])) {
        $dataset['classes'][$class] = 1;
    } else {
        $dataset['classes'][$class] += 1;
    }

    // Count every occurrence of every token under this class.
    foreach ($words as $word) {
        if (empty($dataset['tokens'][$word])) {
            $dataset['tokens'][$word] = [];
        }

        if (empty($dataset['tokens'][$word][$class])) {
            $dataset['tokens'][$word][$class] = 1;
        } else {
            $dataset['tokens'][$word][$class] += 1;
        }
    }

    return $dataset;
}
/**
 * Score a text against every trained class and rank the classes.
 *
 * For each class the score is the product of per-token ratios multiplied by
 * the class prior (examples of that class / all examples):
 *   - tokens that never appeared in training are ignored;
 *   - a known token never seen with this class contributes a tiny constant
 *     (0.000000001) instead of zero, so one miss does not wipe out the score;
 *   - otherwise the ratio is (occurrences of the token in the class) /
 *     (examples of the class). Occurrences are counted per token while the
 *     denominator is counted per example, so a ratio can exceed 1; the scores
 *     are relative weights, not normalised probabilities.
 * A token repeated in the text is counted once. If the text contains no known
 * token at all, the score of every class is 0.
 *
 * @param array  $dataset Dataset with the 'classes' and 'tokens' maps built by train().
 * @param string $text    Text to classify; it is split with tokenize().
 *
 * @return array Map of class => score, ordered from highest to lowest score.
 */
function classify($dataset, $text)
{
    $tokens = tokenize($text);

    // Total number of training examples; identical for every class, so it is
    // computed once instead of on each loop iteration.
    $totalExamples = array_sum($dataset['classes']);

    $probClasses = [];
    foreach ($dataset['classes'] as $class => $classCount) {
        // Keyed by token so that a repeated token is only counted once.
        $probTokens = [];
        foreach ($tokens as $token) {
            if (empty($dataset['tokens'][$token])) {
                continue; // Token unknown to the whole dataset: carries no signal.
            }

            $probTokens[$token] = empty($dataset['tokens'][$token][$class])
                ? 0.000000001
                : $dataset['tokens'][$token][$class] / $classCount;
        }

        // With no known token there is no evidence for the class: score 0.
        $likelihood = empty($probTokens) ? 0 : array_product($probTokens);

        $probClasses[$class] = $likelihood * ($classCount / $totalExamples);
    }

    // Best match first. The class names stay as keys.
    arsort($probClasses);

    return $probClasses;
}
// Demo: train the classifier on a few labelled sentences, classify a sample
// phrase and dump the resulting class => score map.
$dataset = ['tokens' => [], 'classes' => []];

$dataset = train($dataset, 'not to eat too much is not enough to lose weight', 'health');
$dataset = train($dataset, 'Russia try to invade Ukraine', 'politics');
$dataset = train($dataset, 'do not neglect exercise', 'health');
$dataset = train($dataset, 'Syria is the main issue, Obama says', 'politics');
$dataset = train($dataset, 'eat to lose weight', 'health');
$dataset = train($dataset, 'you should not eat much', 'health');

$classification = classify($dataset, "Obama is");

// var_dump() prints the value itself and returns nothing, so it is called on
// its own and the script is then ended explicitly.
var_dump($classification);
exit;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment