Last active
February 20, 2019 17:26
-
-
Save bwang482/f3e2fd35a774a61dbedd to your computer and use it in GitHub Desktop.
Small modification from https://gist.github.com/mrorii/961963
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
import collections
import math
import re
import sys
from collections import Counter
# Words ignored by tokenize() when the caller does not supply its own list.
_DEFAULT_STOPWORDS = frozenset((
    'and', 'for', 'if', 'the', 'then', 'be', 'is', 'are', 'will', 'in',
    'it', 'to', 'that',
))

# Compiled once at import time instead of on every tokenize() call.
_WORD_RE = re.compile(r"\w+", re.UNICODE)


def tokenize(_str, stopwords=None, min_length=2):
    """Count the lower-cased word occurrences in a string.

    Words are maximal runs of ``\\w`` characters. Each word is lower-cased,
    then dropped if it is shorter than ``min_length`` or is a stop word.

    Args:
        _str: Text to tokenize.
        stopwords: Optional iterable of lower-case words to ignore. When
            ``None`` (the default) the built-in English stop-word set is used.
        min_length: Minimum number of characters a word must have to be
            counted. Defaults to 2, so single-character words are skipped.

    Returns:
        A ``collections.defaultdict`` mapping each kept word to its
        occurrence count as a float; missing words read as ``0.0``.
    """
    excluded = _DEFAULT_STOPWORDS if stopwords is None else frozenset(stopwords)
    tokens = collections.defaultdict(float)
    for match in _WORD_RE.finditer(_str):
        word = match.group(0).lower()
        if len(word) < min_length or word in excluded:
            continue
        tokens[word] += 1
    return tokens
def kldiv(_s, _t):
    """Return a smoothed, symmetrised KL-style divergence of two count maps.

    Both arguments map terms to (positive) occurrence counts, e.g. the
    output of ``tokenize``. Counts are normalised to probabilities; a term
    missing from one side is given the small back-off probability
    ``epsilon`` on that side. The result is the sum over the combined
    vocabulary of ``(p - q) * log(p / q)``, which equals
    ``p * log(p / q) + q * log(q / p)`` for each term.

    Args:
        _s: Mapping of term -> count for the first document.
        _t: Mapping of term -> count for the second document.

    Returns:
        The divergence as a float, or the sentinel ``1e33`` when either
        mapping is empty.

    Raises:
        SystemExit: With exit code 2 (after printing a diagnostic) if the
            normalised probabilities of either input do not sum to 1.
    """
    if not _s or not _t:
        return 1e33

    ssum = float(sum(_s.values()))
    tsum = float(sum(_t.values()))

    # Back-off probability: a thousandth of the smallest observed probability.
    epsilon = min(min(_s.values()) / ssum, min(_t.values()) / tsum) * 0.001

    # Terms of _s that _t lacks each take epsilon of _t's probability mass,
    # so _t's observed probabilities are scaled down by gamma to compensate.
    # NOTE(review): _s is not rescaled for the terms it lacks, so its smoothed
    # probabilities sum to slightly more than 1 -- confirm this is intended.
    missing_from_t = set(_s).difference(_t)
    gamma = 1 - len(missing_from_t) * epsilon

    # Sanity check that each normalised distribution sums to 1 (within a
    # small tolerance for floating-point rounding).
    tolerance = 9e-6
    sc = sum([v / ssum for v in _s.values()])
    st = sum([v / tsum for v in _t.values()])
    if abs(sc - 1.0) > tolerance:
        print("Sum P: %e, Sum Q: %e" % (sc, st))
        print("*** ERROR: sc does not sum up to 1. Bailing out ..")
        sys.exit(2)
    if abs(st - 1.0) > tolerance:
        print("Sum P: %e, Sum Q: %e" % (sc, st))
        print("*** ERROR: st does not sum up to 1. Bailing out ..")
        sys.exit(2)

    div = 0.
    # Counter addition yields the combined vocabulary of both documents.
    for term in Counter(_s) + Counter(_t):
        pts = _s[term] / ssum if term in _s else epsilon
        ptt = gamma * (_t[term] / tsum) if term in _t else epsilon
        div += (pts - ptt) * math.log(pts / ptt)
    return div
if __name__ == '__main__':
    # Demo: print the divergence between two short sample texts, in both
    # argument orders.
    d1 = """Many research publications want you to use BibTeX, which better
organizes the whole process. Suppose for concreteness your source
file is x.tex. Basically, you create a file x.bib containing the
bibliography, and run bibtex on that file."""
    # Raw string: the text contains literal LaTeX commands, and in a normal
    # string "\right" would be read as a carriage return followed by "ight".
    d2 = r"""In this case you must supply both a \left and a \right because the
delimiter height are made to match whatever is contained between the
two commands. But, the \left doesn't have to be an actual 'left
delimiter', that is you can use '\left)' if there were some reason
to do it."""
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("KL-divergence between d1 and d2: %s" % kldiv(tokenize(d1), tokenize(d2)))
    print("KL-divergence between d2 and d1: %s" % kldiv(tokenize(d2), tokenize(d1)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment