Skip to content

Instantly share code, notes, and snippets.

@emsifa
Last active August 29, 2015 14:10
Show Gist options
  • Save emsifa/6f82052c5004a8d34712 to your computer and use it in GitHub Desktop.
Save emsifa/6f82052c5004a8d34712 to your computer and use it in GitHub Desktop.
Naive Bayes Text Classifier
<?php
require("NaiveBayesText.php");

$opinionClassifier = new NaiveBayesText();

// This demo only filters out ignored (stop) words; proper tokenizing,
// stemming and so on would give better results.
$opinionClassifier->wordResolver(function($word) {
    $ignored_words = array("dia", "orang", "itu", "sangat", "kalian", "tidak", "dll", "dsb");
    if (in_array($word, $ignored_words)) {
        return null;
    }
    return $word;
});

// The more training sentences there are, the more accurate the result.
$training_data = array(
    "negative" => array(
        "orang itu sangat jelek",
        "mereka semua malas",
        "asu luh",
        "kalian bodoh",
        "asu, merusak pemandangan aja",
    ),
    "positive" => array(
        "orang itu sangat baik",
        "dia hebat",
        "mereka orang yang baik",
        "belajarlah yang benar",
        "mereka sangat baik",
    ),
);
foreach ($training_data as $label => $sentences) {
    foreach ($sentences as $sentence) {
        $opinionClassifier->addTraining($label, $sentence);
    }
}

$test_kalimat = "ah asu lah, nggak guna banget";

// result: array('negative' => 1, 'positive' => 0)
print_r($opinionClassifier->classify($test_kalimat));
// result: true (print_r shows it as 1)
print_r($opinionClassifier->is("negative", $test_kalimat));
// result: true (print_r shows it as 1); isNegative() is resolved through __call
print_r($opinionClassifier->isNegative($test_kalimat));
<?php
/**
 * Small naive-Bayes-style text classifier.
 *
 * Training documents are stored per label. To classify a document, every word
 * contributes, for each label, the fraction of that label's training documents
 * containing the word. Non-zero fractions are multiplied together, the product
 * is weighted by the label's share of all training documents, and the scores
 * are finally normalized so that they sum to 1.
 *
 * Note: words that never occur for a label are skipped for that label instead
 * of zeroing its score (there is no smoothing), and word matching is
 * case-sensitive.
 */
class NaiveBayesText {

    /** @var array label => list of training documents */
    protected $training_sets = array();

    /** @var callable|null optional user callback applied to every raw word */
    protected $word_resolver = null;

    /** @var array resolved word => per-label match statistics */
    protected $probability_cache = array();

    /**
     * Registers a callback that maps each raw word to its normalized form.
     *
     * The callback receives the raw token (before punctuation stripping) and
     * may return null or an empty string to make the classifier ignore it.
     *
     * @param callable $resolver_callable
     * @throws InvalidArgumentException when the argument is not callable
     */
    public function wordResolver($resolver_callable)
    {
        if (!is_callable($resolver_callable)) {
            throw new InvalidArgumentException("Word Resolver must be callable");
        }
        $this->word_resolver = $resolver_callable;
        // Cached statistics were computed with the previous resolver.
        $this->probability_cache = array();
    }

    /**
     * Adds one training document under the given label.
     *
     * @param string $classify label name; stored lowercased
     * @param string $document training text
     */
    public function addTraining($classify, $document)
    {
        $classify = strtolower($classify);
        if (!array_key_exists($classify, $this->training_sets)) {
            $this->training_sets[$classify] = array();
        }
        $this->training_sets[$classify][] = $document;
        // New data changes every word's statistics (and possibly the set of
        // labels), so previously cached entries are no longer valid.
        $this->probability_cache = array();
    }

    /**
     * Returns, for every label, how many training documents contain the word.
     *
     * @param string $word raw word; it is passed through resolveWord() first
     * @return array|null label => array('count_match' => int, 'count_datasets' => int),
     *                    or null when the word resolves to an empty string
     */
    public function getWordProbability($word)
    {
        $word = $this->resolveWord($word);
        // Compare against '' explicitly: empty() would also discard the token "0".
        if ($word === '') {
            return null;
        }
        if (array_key_exists($word, $this->probability_cache)) {
            return $this->probability_cache[$word];
        }

        $result = array();
        foreach ($this->training_sets as $classify => $datasets) {
            $matches = 0;
            foreach ($datasets as $training) {
                // Each document counts at most once, however often the word occurs.
                if ($this->documentContainsWord($training, $word)) {
                    $matches++;
                }
            }
            $result[$classify] = array(
                'count_match' => $matches,
                'count_datasets' => count($datasets),
            );
        }

        $this->probability_cache[$word] = $result;
        return $result;
    }

    /**
     * Returns the list of known labels.
     *
     * @return array
     */
    public function getClassifiers()
    {
        return array_keys($this->training_sets);
    }

    /**
     * Scores a document against every known label.
     *
     * @param string $document text to classify
     * @return array label => normalized score; the non-zero scores sum to 1,
     *               and every score is 0 when no word matched any training data
     */
    public function classify($document)
    {
        $classifiers = $this->getClassifiers();
        $scores = array();
        $count_datasets = array();
        $total_datasets = 0;
        foreach ($classifiers as $classify) {
            $count_datasets[$classify] = count($this->training_sets[$classify]);
            $total_datasets += $count_datasets[$classify];
            $scores[$classify] = doubleval(0);
        }

        foreach ($this->parseWords($document) as $word) {
            $word_probability = $this->getWordProbability($word);
            if (empty($word_probability)) {
                continue;
            }
            foreach ($classifiers as $classify) {
                $stats = $word_probability[$classify];
                $probability = $stats['count_match'] / $stats['count_datasets'];
                if ($probability > 0) {
                    // A score of 0 means "no evidence yet": start the product at 1.
                    if ($scores[$classify] == 0) {
                        $scores[$classify] = 1;
                    }
                    $scores[$classify] *= $probability;
                }
            }
        }

        // Weight each label by its share of the training documents.
        foreach ($classifiers as $classify) {
            $scores[$classify] *= $count_datasets[$classify] / $total_datasets;
        }

        return $this->normalizeResult($scores);
    }

    /**
     * Rescales scores so that they sum to 1; zero scores stay 0.
     *
     * @param array $scores label => raw score
     * @return array label => normalized score
     */
    protected function normalizeResult(array $scores)
    {
        $sum = array_sum($scores);
        foreach ($scores as $i => $score) {
            if ($sum == 0 || $score == 0) {
                $scores[$i] = 0;
            } else {
                $scores[$i] = $score / $sum;
            }
        }
        return $scores;
    }

    /**
     * Tells whether no other label scores strictly higher than the given one.
     *
     * Ties count as a match, so this also returns true when every score is 0.
     *
     * @param string $classify label name (case-insensitive, like addTraining())
     * @param string $document text to classify
     * @return bool
     * @throws InvalidArgumentException when the label has no training data
     */
    public function is($classify, $document)
    {
        // addTraining() stores labels lowercased, so look them up the same way.
        $label = strtolower($classify);
        if (!array_key_exists($label, $this->training_sets)) {
            throw new InvalidArgumentException("Undefined classify {$classify}");
        }
        $classify_result = $this->classify($document);
        $classify_score = $classify_result[$label];
        unset($classify_result[$label]);
        foreach ($classify_result as $score) {
            if ($score > $classify_score) {
                return false;
            }
        }
        return true;
    }

    /**
     * Splits text into raw words on any run of whitespace.
     *
     * @param string $text
     * @return array list of non-empty tokens
     */
    protected function parseWords($text)
    {
        $words = preg_split('/\s+/', (string) $text, -1, PREG_SPLIT_NO_EMPTY);
        return $words === false ? array() : $words;
    }

    /**
     * Tells whether a training document contains the already-resolved word.
     *
     * @param string $document training text
     * @param string $word resolved word to look for
     * @return bool
     */
    protected function documentContainsWord($document, $word)
    {
        foreach ($this->parseWords($document) as $candidate) {
            // Strict comparison: with == numeric-looking tokens such as
            // "1e3" and "1000" would be treated as the same word.
            if ($this->resolveWord($candidate) === $word) {
                return true;
            }
        }
        return false;
    }

    /**
     * Strips leading and trailing non-alphanumeric (ASCII) characters.
     *
     * @param mixed $word
     * @return string cleaned word, or "" when the input is not a string
     */
    protected function basicTokenize($word)
    {
        if (!is_string($word)) {
            return "";
        }
        return trim((string) preg_replace("/^[^a-zA-Z0-9]+|[^a-zA-Z0-9]+$/", "", $word));
    }

    /**
     * Normalizes a raw word: user resolver first (if any), then basicTokenize().
     *
     * @param string $word
     * @return string resolved word; "" means the word must be ignored
     */
    protected function resolveWord($word)
    {
        if (!$this->word_resolver) {
            return $this->basicTokenize($word);
        }
        return $this->basicTokenize(call_user_func($this->word_resolver, $word));
    }

    /**
     * Supports dynamic isXxx($document) calls as a shortcut for is('xxx', $document).
     *
     * @param string $method called method name
     * @param array $args call arguments; the first one is the document
     * @return bool
     * @throws BadMethodCallException when the name does not look like isXxx
     * @throws InvalidArgumentException when no document is passed
     */
    public function __call($method, $args)
    {
        if (!preg_match("/^is[A-Z]/", $method)) {
            throw new BadMethodCallException("Call to undefined method {$method}");
        }
        if (!array_key_exists(0, $args)) {
            throw new InvalidArgumentException("{$method}() expects a document as its first argument");
        }
        $classify = strtolower(substr($method, 2));
        return $this->is($classify, $args[0]);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment