# jamak/Naive_Bayesian.py

## Naive_Bayesian.py
import re
from operator import mul

from stemming import porter


class NaiveBayes(object):
    """A small naive Bayes text classifier trained on word counts.

    Attributes:
        words: maps category -> {word: number of times the word was seen
            in documents trained for that category}.
        total_words: number of words seen across all categories.
        categories_documents: maps category -> number of documents trained.
        total_documents: number of documents trained across all categories.
        categories_words: maps category -> number of words trained.
        threshold: minimum normalized score `classify` requires before it
            commits to a label.
    """

    # At most this many per-word probabilities are multiplied together by
    # `doc_probability`.
    MAX_SIGNIFICANT_WORDS = 15

    # Matches every character that is neither a word character nor
    # whitespace, i.e. the punctuation stripped before splitting.
    _PUNCTUATION = re.compile(r'[^\w\s]')

    def __init__(self, categories, threshold=0):
        """Create an untrained classifier.

        Args:
            categories: iterable of category labels. `classify` requires
                the labels "noise" and "signal" to be among them.
            threshold: normalized score a label must exceed in `classify`;
                a falsy value (including None) is stored as 0.
        """
        self.words = {}
        self.total_words = 0
        self.categories_documents = {}
        self.total_documents = 0
        self.threshold = threshold or 0
        # Single-argument form so the line works as both the Python 2
        # statement and the Python 3 function; the text (including the
        # double space) matches what the original statement printed.
        print("Threshold is now  %s" % (threshold,))
        self.categories_words = {}

        for cat in categories:
            self.words[cat] = {}
            self.categories_documents[cat] = 0
            self.categories_words[cat] = 0

    def train(self, category, document):
        """Add the word counts of `document` to `category`.

        Raises:
            KeyError: if `category` was not passed to the constructor.
        """
        category_words = self.words[category]
        for word, count in self.word_count(document).items():
            category_words[word] = category_words.get(word, 0) + count
            self.total_words += count
            self.categories_words[category] += count
        # The document totals feed `category_probability`; without them
        # every prior would stay at zero.
        self.categories_documents[category] += 1
        self.total_documents += 1

    def probabilities(self, document):
        """Return {category: un-normalized score} for every category."""
        return {
            category: self.probability(category, document)
            for category in self.words
        }

    def classify(self, document, default='unknown'):
        """Label `document` as "noise" or "signal", or return `default`.

        The two scores are normalized so they sum to 1. "noise" is checked
        first, so with a threshold below 0.5 it wins whenever both labels
        exceed the threshold.

        Returns:
            "noise", "signal", or `default` when neither normalized score
            exceeds the threshold or both scores are zero (for example
            before any training).

        Raises:
            KeyError: if the classifier was built without the "noise" and
                "signal" categories.
        """
        scores = self.probabilities(document)
        noise_score = scores["noise"]
        signal_score = scores["signal"]
        total = noise_score + signal_score
        if not total:
            # Nothing to normalize by; avoid dividing by zero.
            return default
        if noise_score / total > self.threshold:
            return "noise"
        if signal_score / total > self.threshold:
            return "signal"
        return default

    def prettify_probabilities(self, document):
        """Return the scores as text, one "category: score" line each.

        Lines are ordered by category label so the output is stable.
        """
        scores = self.probabilities(document)
        return "\n".join(
            "%s: %s" % (category, scores[category])
            for category in sorted(scores)
        )

    def word_probability(self, category, word):
        """Return the smoothed weight of `word` within `category`.

        One is added to the word's count so that a word never seen in the
        category still gets a non-zero weight. The result is
        (1 + count) / words-in-category, so it is a relative weight and is
        not guaranteed to be <= 1.

        Returns:
            The weight as a float, or 0.0 when the category has no trained
            words (which would otherwise divide by zero).
        """
        words_in_category = self.categories_words[category]
        if not words_in_category:
            return 0.0
        seen = self.words[category].get(word, 0)
        return (1 + float(seen)) / words_in_category

    def doc_probability(self, category, document):
        """Return the product of the word weights of `document`.

        Only the last `MAX_SIGNIFICANT_WORDS` distinct words (in the
        iteration order of `word_count`) take part in the product. An
        empty document yields 1.0.
        """
        word_probs = [
            self.word_probability(category, word)
            for word in self.word_count(document)
        ]
        # NOTE(review): the original comment says "only the 15 most
        # significant" should be used, but the selection is positional,
        # not ranked by significance - confirm the intended ordering.
        product = 1.0
        for prob in word_probs[:-(self.MAX_SIGNIFICANT_WORDS + 1):-1]:
            product *= prob
        return product

    def category_probability(self, category):
        """Return the fraction of trained documents that are in `category`.

        Returns 0.0 when nothing has been trained yet.
        """
        if not self.total_documents:
            return 0.0
        return self.categories_documents[category] / float(self.total_documents)

    def probability(self, category, document):
        """Return the un-normalized score of `document` for `category`."""
        return (self.doc_probability(category, document)
                * self.category_probability(category))

    def word_count(self, document):
        """Return {word: occurrences} for the words of `document`.

        Punctuation is removed first, then the text is split on runs of
        whitespace. Words are counted exactly as written (case-sensitive).
        """
        counts = {}
        for word in self._PUNCTUATION.sub('', document).split():
            counts[word] = counts.get(word, 0) + 1
        return counts