amundo/brown_vs_reuters_pronouns.py

## brown_vs_reuters_pronouns.py
import nltk
from nltk.corpus import brown, reuters

# Pronouns to tally. Every entry is lowercase on purpose: the frequency
# distributions below are keyed by lowercased tokens, so an uppercase 'I'
# could never match and was always reported with a count of 0.
pronouns = 'i me my us we he she it her his its they them their'.split()

# Other Brown categories (e.g. categories='religion') can be swapped in here.
brown_text = brown.words(categories='news')
reuters_text = reuters.words()

# Fold case before counting so that e.g. 'He' and 'he' share a single tally.
brown_dist = nltk.FreqDist(w.lower() for w in brown_text)
reuters_dist = nltk.FreqDist(w.lower() for w in reuters_text)

def starbars(fdist, terms, scaling_factor=10):
    """Render a crude text bar chart of the counts of ``terms`` in ``fdist``.

    Args:
        fdist: Object indexable by term (``fdist[term]``) that yields a
            numeric count; this script passes ``nltk.FreqDist`` instances.
        terms: Iterable of the terms to chart, in display order.
        scaling_factor: Number of occurrences represented by one ``*``.
            The bar length is ``count / scaling_factor`` truncated to an int.

    Returns:
        A string with one line per term, formatted as
        ``<term>\\t<stars> <count>\\n``. Empty when ``terms`` is empty.
    """
    rows = []
    # Iterate the ``terms`` argument (not a module-level list) so the
    # function charts exactly what the caller asked for.
    for term in terms:
        count = fdist[term]
        bar = '*' * int(count / scaling_factor)
        rows.append(term + '\t' + bar + ' ' + str(count) + '\n')
    return ''.join(rows)

# Single-argument print(...) calls emit the same text on Python 2 and, unlike
# the bare print statement, also parse on Python 3.
print('BROWN')
print(starbars(brown_dist, pronouns))
# The Reuters chart is drawn with one star per 200 occurrences.
print('\nREUTERS')
print(starbars(reuters_dist, pronouns, scaling_factor=200))

## brown_vs_reuters_pronouns.txt
BROWN
I        0
me      *** 31
my      **** 43
us      * 13
we      ********** 107
he      **************************************************************** 642
she     ******* 77
it      *********************************************** 478
her     ************ 121
his     ****************************************** 428
its     ***************** 178
they    ************************** 267
them    ********* 96
their   *********************** 231


REUTERS
I        0
me       35
my       56
us      * 203
we      ******* 1599
he      ************************** 5215
she      163
it      ******************************************************* 11104
her      10
his     *** 673
its     ************************************* 7402
they    ************ 2595
them    ** 402
their   ****** 1370
	import nltk
	from nltk.corpus import brown, reuters

	# Pronouns to tally. Every entry is lowercase on purpose: the frequency
	# distributions below are keyed by lowercased tokens, so an uppercase 'I'
	# could never match and was always reported with a count of 0.
	pronouns = 'i me my us we he she it her his its they them their'.split()

	# Other Brown categories (e.g. categories='religion') can be swapped in here.
	brown_text = brown.words(categories='news')
	reuters_text = reuters.words()

	# Fold case before counting so that e.g. 'He' and 'he' share a single tally.
	brown_dist = nltk.FreqDist(w.lower() for w in brown_text)
	reuters_dist = nltk.FreqDist(w.lower() for w in reuters_text)

	def starbars(fdist, terms, scaling_factor=10):
	    """Render a crude text bar chart of the counts of ``terms`` in ``fdist``.

	    Args:
	        fdist: Object indexable by term (``fdist[term]``) that yields a
	            numeric count; this script passes ``nltk.FreqDist`` instances.
	        terms: Iterable of the terms to chart, in display order.
	        scaling_factor: Number of occurrences represented by one ``*``.
	            The bar length is ``count / scaling_factor`` truncated to an int.

	    Returns:
	        A string with one line per term, formatted as
	        ``<term>\\t<stars> <count>\\n``. Empty when ``terms`` is empty.
	    """
	    rows = []
	    # Iterate the ``terms`` argument (not a module-level list) so the
	    # function charts exactly what the caller asked for.
	    for term in terms:
	        count = fdist[term]
	        # The '*' repetition operand is required: without it the
	        # expression is not valid Python.
	        bar = '*' * int(count / scaling_factor)
	        rows.append(term + '\t' + bar + ' ' + str(count) + '\n')
	    return ''.join(rows)

	# Single-argument print(...) calls emit the same text on Python 2 and, unlike
	# the bare print statement, also parse on Python 3.
	print('BROWN')
	print(starbars(brown_dist, pronouns))
	# The Reuters chart is drawn with one star per 200 occurrences.
	print('\nREUTERS')
	print(starbars(reuters_dist, pronouns, scaling_factor=200))
	BROWN
	I 0
	me *** 31
	my **** 43
	us * 13
	we ********** 107
	he **************************************************************** 642
	she ******* 77
	it *********************************************** 478
	her ************ 121
	his ****************************************** 428
	its ***************** 178
	they ************************** 267
	them ********* 96
	their *********************** 231


	REUTERS
	I 0
	me 35
	my 56
	us * 203
	we ******* 1599
	he ************************** 5215
	she 163
	it ******************************************************* 11104
	her 10
	his *** 673
	its ************************************* 7402
	they ************ 2595
	them ** 402
	their ****** 1370