# yy/logodds.py

## logodds.py
def logodds(corpora_dic, bg_counter):
    """Return z-scored log-odds ratios of word frequencies, per corpus.

    Each corpus in ``corpora_dic`` is compared against the union of all the
    *other* corpora in ``corpora_dic``.  The count of a word in
    ``bg_counter`` is added to its count on both sides of the comparison,
    so the background corpus acts as prior pseudo-counts.

    For a word with count ``fi`` in the target corpus (size ``ni``), summed
    count ``fj`` in the other corpora (summed size ``nj``) and count ``fbg``
    in the background corpus (size ``nbg``)::

        delta = log(fi + fbg) - log(ni + nbg - (fi + fbg))
              - log(fj + fbg) + log(nj + nbg - (fj + fbg))
        z     = delta / sqrt(1 / (fi + fbg) + 1 / (fj + fbg))

    Args:
        corpora_dic: mapping of corpus name to a word -> count mapping
            (typically a ``collections.Counter``).
        bg_counter: word -> count mapping of the background corpus.

    Returns:
        A dict with one entry per corpus name.  Each value is a dict that
        maps the words of that corpus to their z-score.  Words for which
        the statistic is undefined (a smoothed count or its complement is
        not positive, e.g. a word that occurs in a single corpus and is
        absent from the background) are omitted rather than raising a
        math domain / division-by-zero error.
    """
    # Imported locally so the function does not rely on module-level names.
    from math import log, sqrt

    corp_size = {}
    for name, counts in corpora_dic.items():
        corp_size[name] = sum(counts.values())
    all_size = sum(corp_size.values())
    nbg = sum(bg_counter.values())

    # Pooled word counts over every corpus, built once.  The count of a word
    # in "all other corpora" is then pooled[word] - fi instead of a fresh
    # scan over every corpus for every word.
    pooled = {}
    for counts in corpora_dic.values():
        for word, freq in counts.items():
            pooled[word] = pooled.get(word, 0) + freq

    result = {name: {} for name in corpora_dic}
    for name, counts in corpora_dic.items():
        ni = corp_size[name]
        nj = all_size - ni
        scores = result[name]
        for word, fi in counts.items():
            # .get() keeps plain dicts working, not only Counter objects.
            fbg = bg_counter.get(word, 0)
            target = fi + fbg
            others = pooled[word] - fi + fbg
            target_rest = ni + nbg - target
            others_rest = nj + nbg - others

            # log() and the variance term need strictly positive inputs.
            if min(target, others, target_rest, others_rest) <= 0:
                continue

            delta = (log(target) - log(target_rest)
                     - log(others) + log(others_rest))
            variance = 1.0 / target + 1.0 / others
            scores[word] = delta / sqrt(variance)

    return result
	def logodds(corpora_dic, bg_counter):
	    """Return z-scored log-odds ratios of word frequencies, per corpus.

	    Each corpus in ``corpora_dic`` is compared against the union of all the
	    *other* corpora in ``corpora_dic``.  The count of a word in
	    ``bg_counter`` is added to its count on both sides of the comparison,
	    so the background corpus acts as prior pseudo-counts.

	    For a word with count ``fi`` in the target corpus (size ``ni``), summed
	    count ``fj`` in the other corpora (summed size ``nj``) and count ``fbg``
	    in the background corpus (size ``nbg``)::

	        delta = log(fi + fbg) - log(ni + nbg - (fi + fbg))
	              - log(fj + fbg) + log(nj + nbg - (fj + fbg))
	        z     = delta / sqrt(1 / (fi + fbg) + 1 / (fj + fbg))

	    Args:
	        corpora_dic: mapping of corpus name to a word -> count mapping
	            (typically a ``collections.Counter``).
	        bg_counter: word -> count mapping of the background corpus.

	    Returns:
	        A dict with one entry per corpus name.  Each value is a dict that
	        maps the words of that corpus to their z-score.  Words for which
	        the statistic is undefined (a smoothed count or its complement is
	        not positive, e.g. a word that occurs in a single corpus and is
	        absent from the background) are omitted rather than raising a
	        math domain / division-by-zero error.
	    """
	    # Imported locally so the function does not rely on module-level names.
	    from math import log, sqrt

	    corp_size = {}
	    for name, counts in corpora_dic.items():
	        corp_size[name] = sum(counts.values())
	    all_size = sum(corp_size.values())
	    nbg = sum(bg_counter.values())

	    # Pooled word counts over every corpus, built once.  The count of a word
	    # in "all other corpora" is then pooled[word] - fi instead of a fresh
	    # scan over every corpus for every word.
	    pooled = {}
	    for counts in corpora_dic.values():
	        for word, freq in counts.items():
	            pooled[word] = pooled.get(word, 0) + freq

	    result = {name: {} for name in corpora_dic}
	    for name, counts in corpora_dic.items():
	        ni = corp_size[name]
	        nj = all_size - ni
	        scores = result[name]
	        for word, fi in counts.items():
	            # .get() keeps plain dicts working, not only Counter objects.
	            fbg = bg_counter.get(word, 0)
	            target = fi + fbg
	            others = pooled[word] - fi + fbg
	            target_rest = ni + nbg - target
	            others_rest = nj + nbg - others

	            # log() and the variance term need strictly positive inputs.
	            if min(target, others, target_rest, others_rest) <= 0:
	                continue

	            delta = (log(target) - log(target_rest)
	                     - log(others) + log(others_rest))
	            variance = 1.0 / target + 1.0 / others
	            scores[word] = delta / sqrt(variance)

	    return result