Skip to content

Instantly share code, notes, and snippets.

@bwang482
Last active February 20, 2019 17:26
Show Gist options
  • Save bwang482/f3e2fd35a774a61dbedd to your computer and use it in GitHub Desktop.
Save bwang482/f3e2fd35a774a61dbedd to your computer and use it in GitHub Desktop.
Small modification from https://gist.github.com/mrorii/961963
#!/usr/bin/python
import re, math, collections
from collections import Counter
# Words ignored by tokenize() unless the caller supplies its own collection.
# A frozenset gives O(1) membership tests and is built only once.
_DEFAULT_STOPWORDS = frozenset((
    'and', 'for', 'if', 'the', 'then', 'be', 'is', 'are', 'will', 'in',
    'it', 'to', 'that',
))


def tokenize(_str, stopwords=None):
    """Count the lower-cased word tokens of a string.

    A token is a maximal run of ``\\w`` characters. Tokens shorter than two
    characters and tokens found in ``stopwords`` are skipped.

    Args:
        _str: Text to tokenize.
        stopwords: Optional container of lower-case words to ignore.
            Defaults to the built-in stop-word list.

    Returns:
        A ``collections.defaultdict`` mapping each kept token to its
        occurrence count as a float (missing keys default to 0.0).
    """
    if stopwords is None:
        stopwords = _DEFAULT_STOPWORDS

    tokens = collections.defaultdict(float)
    for match in re.finditer(r"\w+", _str, re.UNICODE):
        word = match.group(0).lower()
        if len(word) < 2 or word in stopwords:
            continue
        tokens[word] += 1
    return tokens
def kldiv(_s, _t):
    """Return a smoothed KL-style divergence between two term-count mappings.

    Both arguments map terms to positive counts. Counts are normalised to
    probabilities; a term missing from one side is given a small back-off
    probability ``epsilon`` there. The result is the sum over the union of
    both vocabularies of ``(p - q) * log(p / q)``, where ``p`` and ``q``
    are the smoothed probabilities of the term in ``_s`` and ``_t``.

    Args:
        _s: Mapping of term -> count for the first document.
        _t: Mapping of term -> count for the second document.

    Returns:
        The divergence as a float, or the sentinel ``1e33`` when either
        mapping is empty.

    Raises:
        ValueError: If either mapping contains a non-positive count; such
            input would otherwise end in a division by zero or a math
            domain error further down.
    """
    if len(_s) == 0 or len(_t) == 0:
        return 1e33

    if min(_s.values()) <= 0 or min(_t.values()) <= 0:
        raise ValueError("kldiv requires strictly positive term counts")

    # "0. +" forces float totals so the divisions below are true divisions
    # even with integer counts on Python 2.
    ssum = 0. + sum(_s.values())
    tsum = 0. + sum(_t.values())

    # Back-off probability for unseen terms: a thousandth of the smallest
    # observed probability on either side.
    epsilon = min(min(_s.values()) / ssum, min(_t.values()) / tsum) * 0.001

    # gamma scales the probabilities of _t down so that, after epsilon is
    # added for each term of _s that _t lacks, they still sum to 1.
    gamma = 1 - len(set(_s).difference(_t)) * epsilon

    # NOTE(review): probabilities of _s are not rescaled in the same way, so
    # the smoothed _s distribution sums to slightly more than 1 whenever _t
    # has terms that _s lacks. Left as in the original; confirm it is intended.

    # Union of both vocabularies: terms of _s first, then those only in _t.
    vocab = list(_s) + [term for term in _t if term not in _s]

    div = 0.
    for term in vocab:
        pts = _s[term] / ssum if term in _s else epsilon
        ptt = gamma * (_t[term] / tsum) if term in _t else epsilon
        div += (pts - ptt) * math.log(pts / ptt)
    return div
if __name__ == '__main__':
    # Demo: compare two short sample documents in both directions.
    d1 = """Many research publications want you to use BibTeX, which better
organizes the whole process. Suppose for concreteness your source
file is x.tex. Basically, you create a file x.bib containing the
bibliography, and run bibtex on that file."""
    # Raw string: the text contains "\left" and "\right". Without the r
    # prefix, "\r" is read as a carriage return (giving the token "ight")
    # and "\l" is an invalid escape sequence.
    d2 = r"""In this case you must supply both a \left and a \right because the
delimiter height are made to match whatever is contained between the
two commands. But, the \left doesn't have to be an actual 'left
delimiter', that is you can use '\left)' if there were some reason
to do it."""
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print("KL-divergence between d1 and d2: %s"
          % kldiv(tokenize(d1), tokenize(d2)))
    print("KL-divergence between d2 and d1: %s"
          % kldiv(tokenize(d2), tokenize(d1)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment