Skip to content

Instantly share code, notes, and snippets.

@tomachalek
Created May 4, 2015 09:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tomachalek/2e6fdce35e565ee2dccf to your computer and use it in GitHub Desktop.
Save tomachalek/2e6fdce35e565ee2dccf to your computer and use it in GitHub Desktop.
KonText vs. Manatee - multi-dimensional frequency distribution
"""
KonText vs. Manatee - multi-dimensional frequency distribution

The sample data used below corresponds to the output of the following
Manatee call:

    words = manatee.StrVector()
    freqs = manatee.NumVector()
    norms = manatee.NumVector()
    crit = 'opus.rokvyd 0 opus.genre 0'
    corpus.freq_dist(range_stream, crit, limit, words, freqs, norms)
"""
import re
# Sample of what Manatee returns: each item of `words` joins the values of one
# combination with a tab; `freqs` holds the frequency found at the same index.
words = [
    u'2006\tTRV', u'2007\tSPO', u'2000\tX', u'2003\tJUN', u'1995\tMEM',
    u'2009\tREG', u'2002\tTRV', u'2008\tX', u'1992\tX', u'1992\tY',
    u'2009\tHOU', u'1996\tPSY', u'1996\tART', u'2000\tCHR', u'2000\tART',
]
freqs = [
    12, 1, 35, 6, 1,
    2, 4, 4, 8, 2,
    1, 1, 1, 1, 1,
]
def normalize(words, freqs):
    """
    Pairs each multi-dimensional key with its frequency and sorts the pairs
    by key.

    Every item of `words` is split on whitespace into a tuple of values
    (the sample Manatee data separates them with a tab) and paired with
    the item of `freqs` found at the same index. Pairing is done with zip(),
    so surplus items of the longer vector are ignored.

    arguments:
    words -- ['x1<TAB>y1', 'x2<TAB>y2',..., 'xN<TAB>yN']
    freqs -- [v1,...,vN]

    returns:
    a list of ((x, y, ...), v) tuples ordered by the key tuple, comparing
    the first value, then the second one etc.; items with equal keys keep
    their input order
    """
    keys = [tuple(re.split(r'\s+', v)) for v in words]
    # Tuples compare item by item over their whole length, so sorting by the
    # key tuple handles any number of dimensions. A key function is used
    # because cmp() and sorted(cmp=...) are not available in Python 3; the
    # frequency is deliberately left out of the ordering.
    return sorted(zip(keys, freqs), key=lambda pair: pair[0])
if __name__ == '__main__':
    # Demo run: sort the sample Manatee output by key and print the
    # resulting list of (key tuple, frequency) pairs.
    data = normalize(words, freqs)
    print(data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment