# bwang482/kld.py

## kld.py
#!/usr/bin/python

import re, math, collections
from collections import Counter

def tokenize(_str):
	"""Return term frequencies for the words found in ``_str``.

	The text is split into runs of word characters, each run is
	lower-cased, and runs that are shorter than two characters or that
	appear in the stopword list are dropped.

	Args:
		_str: The text to tokenize.

	Returns:
		A ``collections.defaultdict`` mapping each kept token to its
		number of occurrences as a float; missing tokens default to 0.0.
	"""
	# A frozenset gives constant-time membership tests instead of a linear
	# scan of a list for every token.
	stopwords = frozenset((
		'and', 'for', 'if', 'the', 'then', 'be', 'is', 'are', 'will',
		'in', 'it', 'to', 'that',
	))
	# ``float`` is the zero-argument factory for 0.0; unlike a lambda it
	# keeps the returned mapping picklable.
	tokens = collections.defaultdict(float)
	for match in re.finditer(r"\w+", _str, re.UNICODE):
		word = match.group().lower()
		if len(word) < 2 or word in stopwords:
			continue
		tokens[word] += 1

	return tokens
#end of tokenize

def kldiv(_s, _t):
	"""Return a smoothed KL-style divergence between two term-count maps.

	Each mapping is normalised into a probability distribution over its own
	keys. A term missing from a mapping is given a small back-off
	probability ``epsilon``; the observed probabilities of ``_t`` are scaled
	by ``gamma`` so that ``_t`` still sums to 1 once the back-off mass for
	the terms it lacks is added. The result is the sum, over the joint
	vocabulary, of ``(P - Q) * log(P / Q)``.

	Args:
		_s: Mapping of term -> count for the first document. Counts are
			expected to be positive, since the smallest one feeds a log.
		_t: Mapping of term -> count for the second document (same
			expectation).

	Returns:
		The divergence as a float, or the sentinel ``1e33`` when either
		mapping is empty.

	Raises:
		SystemExit: With status 2, after printing a diagnostic, if either
			normalised distribution does not sum to 1.
	"""
	if not _s or not _t:
		return 1e33

	ssum = float(sum(_s.values()))
	tsum = float(sum(_t.values()))

	# Back-off probability for unseen terms: a thousandth of the rarest
	# observed term probability in either document.
	epsilon = min(min(_s.values()) / ssum, min(_t.values()) / tsum) * 0.001
	# Every term of _s that _t lacks adds epsilon to _t's total, so shrink
	# _t's observed mass by that amount to keep it summing to 1.
	gamma = 1 - len(set(_s).difference(_t)) * epsilon

	# Sanity check: each normalised distribution must sum to 1 (within a
	# small tolerance for floating-point rounding).
	sc = sum(v / ssum for v in _s.values())
	st = sum(v / tsum for v in _t.values())
	for label, total in (("sc", sc), ("st", st)):
		if abs(total - 1.0) > 9e-6:
			print("Sum P: %e, Sum Q: %e" % (sc, st))
			print("*** ERROR: %s does not sum up to 1. Bailing out .." % label)
			# Same effect as sys.exit(2) without requiring the sys module.
			raise SystemExit(2)

	div = 0.
	# Adding the two Counters yields the union vocabulary of both documents.
	for term in Counter(_s) + Counter(_t):
		pts = _s[term] / ssum if term in _s else epsilon
		ptt = gamma * (_t[term] / tsum) if term in _t else epsilon
		div += (pts - ptt) * math.log(pts / ptt)

	return div


if __name__ == '__main__':
	# Demo: compare two short LaTeX-related paragraphs in both directions.
	d1 = """Many research publications want you to use BibTeX, which better
	organizes the whole process. Suppose for concreteness your source
	file is x.tex. Basically, you create a file x.bib containing the
	bibliography, and run bibtex on that file."""
	# Raw string: in a normal literal the "\r" of "\right" is read as a
	# carriage return (so the word is tokenized as "ight") and "\l" is an
	# invalid escape sequence.
	d2 = r"""In this case you must supply both a \left and a \right because the
	delimiter height are made to match whatever is contained between the
	two commands. But, the \left doesn't have to be an actual 'left
	delimiter', that is you can use '\left)' if there were some reason
	to do it."""

	# Single-argument print(...) behaves the same on Python 2 and Python 3.
	print("KL-divergence between d1 and d2: %s" % kldiv(tokenize(d1), tokenize(d2)))
	print("KL-divergence between d2 and d1: %s" % kldiv(tokenize(d2), tokenize(d1)))
	#!/usr/bin/python

	import re, math, collections
	from collections import Counter

	def tokenize(_str):
		"""Return term frequencies for the words found in ``_str``.

		The text is split into runs of word characters, each run is
		lower-cased, and runs that are shorter than two characters or that
		appear in the stopword list are dropped.

		Args:
			_str: The text to tokenize.

		Returns:
			A ``collections.defaultdict`` mapping each kept token to its
			number of occurrences as a float; missing tokens default to 0.0.
		"""
		# A frozenset gives constant-time membership tests instead of a
		# linear scan of a list for every token.
		stopwords = frozenset((
			'and', 'for', 'if', 'the', 'then', 'be', 'is', 'are', 'will',
			'in', 'it', 'to', 'that',
		))
		# ``float`` is the zero-argument factory for 0.0; unlike a lambda it
		# keeps the returned mapping picklable.
		tokens = collections.defaultdict(float)
		for match in re.finditer(r"\w+", _str, re.UNICODE):
			word = match.group().lower()
			if len(word) < 2 or word in stopwords:
				continue
			tokens[word] += 1

		return tokens
	#end of tokenize

	def kldiv(_s, _t):
		"""Return a smoothed KL-style divergence between two term-count maps.

		Each mapping is normalised into a probability distribution over its
		own keys. A term missing from a mapping is given a small back-off
		probability ``epsilon``; the observed probabilities of ``_t`` are
		scaled by ``gamma`` so that ``_t`` still sums to 1 once the back-off
		mass for the terms it lacks is added. The result is the sum, over
		the joint vocabulary, of ``(P - Q) * log(P / Q)``.

		Args:
			_s: Mapping of term -> count for the first document. Counts are
				expected to be positive, since the smallest one feeds a log.
			_t: Mapping of term -> count for the second document (same
				expectation).

		Returns:
			The divergence as a float, or the sentinel ``1e33`` when either
			mapping is empty.

		Raises:
			SystemExit: With status 2, after printing a diagnostic, if
				either normalised distribution does not sum to 1.
		"""
		if not _s or not _t:
			return 1e33

		ssum = float(sum(_s.values()))
		tsum = float(sum(_t.values()))

		# Back-off probability for unseen terms: a thousandth of the rarest
		# observed term probability in either document.
		epsilon = min(min(_s.values()) / ssum, min(_t.values()) / tsum) * 0.001
		# Every term of _s that _t lacks adds epsilon to _t's total, so
		# shrink _t's observed mass by that amount to keep it summing to 1.
		gamma = 1 - len(set(_s).difference(_t)) * epsilon

		# Sanity check: each normalised distribution must sum to 1 (within a
		# small tolerance for floating-point rounding).
		sc = sum(v / ssum for v in _s.values())
		st = sum(v / tsum for v in _t.values())
		for label, total in (("sc", sc), ("st", st)):
			if abs(total - 1.0) > 9e-6:
				print("Sum P: %e, Sum Q: %e" % (sc, st))
				print("*** ERROR: %s does not sum up to 1. Bailing out .." % label)
				# Same effect as sys.exit(2) without requiring the sys module.
				raise SystemExit(2)

		div = 0.
		# Adding the two Counters yields the union vocabulary of both
		# documents.
		for term in Counter(_s) + Counter(_t):
			pts = _s[term] / ssum if term in _s else epsilon
			ptt = gamma * (_t[term] / tsum) if term in _t else epsilon
			div += (pts - ptt) * math.log(pts / ptt)

		return div


	if __name__ == '__main__':
		# Demo: compare two short LaTeX-related paragraphs in both directions.
		d1 = """Many research publications want you to use BibTeX, which better
	organizes the whole process. Suppose for concreteness your source
	file is x.tex. Basically, you create a file x.bib containing the
	bibliography, and run bibtex on that file."""
		# Raw string: in a normal literal the "\r" of "\right" is read as a
		# carriage return (so the word is tokenized as "ight") and "\l" is
		# an invalid escape sequence.
		d2 = r"""In this case you must supply both a \left and a \right because the
	delimiter height are made to match whatever is contained between the
	two commands. But, the \left doesn't have to be an actual 'left
	delimiter', that is you can use '\left)' if there were some reason
	to do it."""

		# Single-argument print(...) behaves the same on Python 2 and 3.
		print("KL-divergence between d1 and d2: %s" % kldiv(tokenize(d1), tokenize(d2)))
		print("KL-divergence between d2 and d1: %s" % kldiv(tokenize(d2), tokenize(d1)))