Created
May 4, 2015 09:13
-
-
Save tomachalek/2e6fdce35e565ee2dccf to your computer and use it in GitHub Desktop.
KonText vs. Manatee - multi-dimensional frequency distribution
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
KonText vs. Manatee - multi-dimensional frequency distribution

words = manatee.StrVector()
freqs = manatee.NumVector()
norms = manatee.NumVector()
crit = 'opus.rokvyd 0 opus.genre 0'
corpus.freq_dist(range_stream, crit, limit, words, freqs, norms)
"""
import re
# Sample of what Manatee returns for a two-attribute criterion: each item of
# `words` holds the attribute values of one combination joined by a tab, and
# `freqs` holds the frequency found at the same position.
words = [u'2006\tTRV', u'2007\tSPO', u'2000\tX', u'2003\tJUN', u'1995\tMEM', u'2009\tREG', u'2002\tTRV',
         u'2008\tX', u'1992\tX', u'1992\tY', u'2009\tHOU', u'1996\tPSY', u'1996\tART', u'2000\tCHR', u'2000\tART']
freqs = [12, 1, 35, 6, 1, 2, 4, 4, 8, 2, 1, 1, 1, 1, 1]
def normalize(words, freqs):
    """
    Pairs each multi-dimensional key with its frequency and sorts the pairs
    by key.

    arguments:
    words -- ['x1\\ty1', 'x2\\ty2',..., 'xN\\tyN']; each item is split on runs
             of whitespace into a tuple of column values
    freqs -- [v1,...,vN]; freqs[i] belongs to words[i]

    returns:
    a list [((x1, y1), v1), ...] ordered lexicographically by the key tuple
    (first column, then second, and so on for any number of columns). Items
    with equal keys keep their input order. If the two inputs differ in
    length, the surplus items of the longer one are ignored (zip semantics).
    """
    pairs = [(tuple(re.split(r'\s+', word)), freq) for word, freq in zip(words, freqs)]
    # Sort on the key tuple only: tuple comparison is already lexicographic
    # across every column, and leaving the frequency out of the key keeps
    # the sort stable for duplicate keys. Using key= instead of cmp= also
    # makes this work on both Python 2 and Python 3.
    return sorted(pairs, key=lambda pair: pair[0])
# Demo entry point: sort the sample Manatee output defined in this file and
# print the resulting list of (key tuple, frequency) pairs.
if __name__ == '__main__':
    data = normalize(words, freqs)
    print(data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment