# uolter/tf_idf.py

## tf_idf.py
# -*- coding: utf-8 -*-
# <codecell>
# term frequency

from math import log

# XXX: Enter in a query term from the corpus variable
# QUERY_TERMS = ['mr.', 'green']


def tf(term, doc, normalized=True):
    """Return the term frequency of ``term`` within ``doc``.

    Matching is case-insensitive and works on whitespace-separated tokens,
    so punctuation attached to a word is part of the token.

    Args:
        term: The word to count.
        doc: The document text, as a single string.
        normalized: When true (the default), divide the raw count by the
            number of tokens in ``doc`` so the result lies between 0.0
            and 1.0. When false, return the raw count as a float.

    Returns:
        The (optionally normalized) frequency as a float. A document with
        no tokens yields 0.0.
    """
    tokens = doc.lower().split()
    occurrences = tokens.count(term.lower())

    if not normalized:
        return float(occurrences)

    # An empty or whitespace-only document has no tokens; report a zero
    # frequency instead of dividing by zero.
    if not tokens:
        return 0.0

    return occurrences / float(len(tokens))


def idf(term, corpus):
    """Return the inverse document frequency of ``term`` in ``corpus``.

    Matching is case-insensitive and works on whitespace-separated tokens.

    Args:
        term: The word to look for.
        corpus: An iterable of document strings. It is traversed exactly
            once, so a generator works as well as a list.

    Returns:
        ``1.0 + log(N / n)`` where ``N`` is the number of documents and
        ``n`` the number of documents containing ``term``. When no
        document contains the term (including an empty corpus) the
        result is 1.0.
    """
    needle = term.lower()  # lower-case once instead of once per document
    total_texts = 0
    texts_with_term = 0

    for text in corpus:
        total_texts += 1
        if needle in text.lower().split():
            texts_with_term += 1

    # Without any matching document the ratio is undefined; fall back to
    # the neutral score of 1.0.
    if texts_with_term == 0:
        return 1.0

    return 1.0 + log(float(total_texts) / texts_with_term)

def tf_idf(term, doc, corpus):
    """Return the tf-idf score of ``term`` for ``doc`` within ``corpus``.

    The score is the normalized term frequency from ``tf`` multiplied by
    the inverse document frequency from ``idf``. ``tf`` is a fraction of
    at most 1 and ``idf`` is computed as ``1.0 + log(...)``; the offset
    keeps the idf factor at 1.0 or above, so the product is never smaller
    than the term frequency itself.
    """
    term_frequency = tf(term, doc)
    inverse_document_frequency = idf(term, corpus)
    return term_frequency * inverse_document_frequency
	# -- coding: utf-8 --
	# <codecell>
	# term frequency

	from math import log

	# XXX: Enter in a query term from the corpus variable
	# QUERY_TERMS = ['mr.', 'green']


	def tf(term, doc, normalized=True):
	    """Return the term frequency of ``term`` within ``doc``.

	    Matching is case-insensitive and works on whitespace-separated
	    tokens, so punctuation attached to a word is part of the token.

	    Args:
	        term: The word to count.
	        doc: The document text, as a single string.
	        normalized: When true (the default), divide the raw count by
	            the number of tokens in ``doc`` so the result lies between
	            0.0 and 1.0. When false, return the raw count as a float.

	    Returns:
	        The (optionally normalized) frequency as a float. A document
	        with no tokens yields 0.0.
	    """
	    tokens = doc.lower().split()
	    occurrences = tokens.count(term.lower())

	    if not normalized:
	        return float(occurrences)

	    # An empty or whitespace-only document has no tokens; report a zero
	    # frequency instead of dividing by zero.
	    if not tokens:
	        return 0.0

	    return occurrences / float(len(tokens))


	def idf(term, corpus):
	    """Return the inverse document frequency of ``term`` in ``corpus``.

	    Matching is case-insensitive and works on whitespace-separated
	    tokens.

	    Args:
	        term: The word to look for.
	        corpus: An iterable of document strings. It is traversed
	            exactly once, so a generator works as well as a list.

	    Returns:
	        ``1.0 + log(N / n)`` where ``N`` is the number of documents and
	        ``n`` the number of documents containing ``term``. When no
	        document contains the term (including an empty corpus) the
	        result is 1.0.
	    """
	    needle = term.lower()  # lower-case once instead of once per document
	    total_texts = 0
	    texts_with_term = 0

	    for text in corpus:
	        total_texts += 1
	        if needle in text.lower().split():
	            texts_with_term += 1

	    # Without any matching document the ratio is undefined; fall back
	    # to the neutral score of 1.0.
	    if texts_with_term == 0:
	        return 1.0

	    return 1.0 + log(float(total_texts) / texts_with_term)

	def tf_idf(term, doc, corpus):
	    """Return the tf-idf score of ``term`` for ``doc`` within ``corpus``.

	    The score is the normalized term frequency from ``tf`` multiplied
	    by the inverse document frequency from ``idf``. ``tf`` is a
	    fraction of at most 1 and ``idf`` is computed as
	    ``1.0 + log(...)``; the offset keeps the idf factor at 1.0 or
	    above, so the product is never smaller than the term frequency
	    itself.
	    """
	    return tf(term, doc) * idf(term, corpus)