# interrogator/keywording.py

## keywording.py
# keyword calculation for a particular word

# our aim is to use log likelihood to calculate the 'keyness' of a word. common practice.
# a normal application would be to loop through wordlists and give every word a keyness score.
# here, we'll just get 'apple', as an example

# reference_corpus = a dictionary of words and their frequencies in a large dataset
# target_corpus = a dictionary of words and their frequencies in a smaller dataset

# our example word

import math
k = 'apple'

# get count for the word in each corpus; .get() gives 0 if the word is not
# found (plain indexing would raise KeyError for a missing word)
a = reference_corpus.get(k, 0)
b = target_corpus.get(k, 0)
# get the total wordcount from the corpora
c = sum(reference_corpus.values())
d = sum(target_corpus.values())

# the log likelihood itself does not distinguish between very key and very
# unkey words, so work out the direction separately: compare the share of all
# words that is k in each corpus, and if the share is higher in the reference
# corpus, set negative mode to on.
# b/d < a/c is tested in cross-multiplied form (b*c < a*d) so that an empty
# corpus (c or d equal to 0) cannot cause a ZeroDivisionError.
neg = (b * c) < (a * d)

# expected counts for k in each corpus, if it were spread evenly over both
# (guarded so that two empty corpora give 0 instead of a ZeroDivisionError)
total = float(c) + float(d)
if total:
    pooled = (float(a) + float(b)) / total
else:
    pooled = 0.0
E1 = float(c) * pooled
E2 = float(d) * pooled

# a zero count contributes nothing to the score, and math.log(0) would raise
# ValueError, so skip the log whenever the observed count is 0
if a == 0:
    logaE1 = 0
else:
    logaE1 = math.log(a / E1)
if b == 0:
    logbE2 = 0
else:
    logbE2 = math.log(b / E2)

# log likelihood: 2 * (a*ln(a/E1) + b*ln(b/E2))
score = float(2 * ((a * logaE1) + (b * logbE2)))

# turn the score negative if need be
if neg:
    score = -score

# one parenthesised argument prints the same under python 2 and python 3
print('%s: %s' % (k, str(score)))

# result:
# apple: 503.4

# on the bipolar forum, it can be used to find out some differences between
# a corpus of first posts and a corpus of all posts:

# i             1872.22
# bipolar        788.83
# diagnose       696.49
# year           639.67
# my             630.30
# have           474.32
# anyone         334.17
# ago            308.06
# month          296.86
# and            279.72
# recently       250.73
# he             226.27
# depression     215.42
# medication     178.58
# any            166.63
#                ...
# welcome       -231.32
# good          -235.98
# hug           -284.53
# yourself      -286.81
# seaturtle     -295.54
# erin          -296.78
# hope          -304.99
# it            -380.56
# will          -383.42
# kat           -402.22
# goody         -443.06
# we            -484.41
# that          -689.90
# pdoc          -968.06
# you          -7277.12
# Name: 01: keyness, dtype: float64

# jargon and veteran member names are the most unkey, because they become more frequent in later posts.
# everyone enters by saying 'i was diagnosed last week' etc., hence the positive keywords.
# the words in this example have already been stemmed; i also do this for just 'participants', 'events', etc.
	# keyword calculation for a particular word

	# our aim is to use log likelihood to calculate the 'keyness' of a word. common practice.
	# a normal application would be to loop through wordlists and give every word a keyness score.
	# here, we'll just get 'apple', as an example

	# reference_corpus = a dictionary of words and their frequencies in a large dataset
	# target_corpus = a dictionary of words and their frequencies in a smaller dataset

	# our example word

	import math
	k = 'apple'

	# get count for the word in each corpus; .get() gives 0 if the word is not
	# found (plain indexing would raise KeyError for a missing word)
	a = reference_corpus.get(k, 0)
	b = target_corpus.get(k, 0)
	# get the total wordcount from the corpora
	c = sum(reference_corpus.values())
	d = sum(target_corpus.values())

	# the log likelihood itself does not distinguish between very key and very
	# unkey words, so work out the direction separately: compare the share of all
	# words that is k in each corpus, and if the share is higher in the reference
	# corpus, set negative mode to on.
	# b/d < a/c is tested in cross-multiplied form (b*c < a*d) so that an empty
	# corpus (c or d equal to 0) cannot cause a ZeroDivisionError.
	neg = (b * c) < (a * d)

	# expected counts for k in each corpus, if it were spread evenly over both
	# (guarded so that two empty corpora give 0 instead of a ZeroDivisionError)
	total = float(c) + float(d)
	if total:
		pooled = (float(a) + float(b)) / total
	else:
		pooled = 0.0
	E1 = float(c) * pooled
	E2 = float(d) * pooled

	# a zero count contributes nothing to the score, and math.log(0) would raise
	# ValueError, so skip the log whenever the observed count is 0
	if a == 0:
		logaE1 = 0
	else:
		logaE1 = math.log(a / E1)
	if b == 0:
		logbE2 = 0
	else:
		logbE2 = math.log(b / E2)

	# log likelihood: 2 * (a*ln(a/E1) + b*ln(b/E2))
	# (the multiplication signs were missing here: 'alogaE1', 'bmath.log')
	score = float(2 * ((a * logaE1) + (b * logbE2)))

	# turn the score negative if need be
	if neg:
		score = -score

	# one parenthesised argument prints the same under python 2 and python 3
	print('%s: %s' % (k, str(score)))

	# result:
	# apple: 503.4

	# on the bipolar forum, it can be used to find out some differences between
	# a corpus of first posts and a corpus of all posts:

	# i 1872.22
	# bipolar 788.83
	# diagnose 696.49
	# year 639.67
	# my 630.30
	# have 474.32
	# anyone 334.17
	# ago 308.06
	# month 296.86
	# and 279.72
	# recently 250.73
	# he 226.27
	# depression 215.42
	# medication 178.58
	# any 166.63
	# ...
	# welcome -231.32
	# good -235.98
	# hug -284.53
	# yourself -286.81
	# seaturtle -295.54
	# erin -296.78
	# hope -304.99
	# it -380.56
	# will -383.42
	# kat -402.22
	# goody -443.06
	# we -484.41
	# that -689.90
	# pdoc -968.06
	# you -7277.12
	# Name: 01: keyness, dtype: float64

	# jargon and veteran member names are the most unkey, because they become more frequent in later posts.
	# everyone enters by saying 'i was diagnosed last week' etc., hence the positive keywords.
	# the words in this example have already been stemmed; i also do this for just 'participants', 'events', etc.