Skip to content

Instantly share code, notes, and snippets.

@obikag
Last active August 29, 2015 14:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save obikag/7275e02292987dee107b to your computer and use it in GitHub Desktop.
Save obikag/7275e02292987dee107b to your computer and use it in GitHub Desktop.
Text Analysis object class implemented in Python. The class methods return the frequencies of word types and n-grams in the sample text. Additionally, the number of words in the sample text is calculated.
'''
Created on Feb 24, 2015
Sample text is an excerpt from Chapter 2 of the Adventures of Huckleberry Finn
'''
import operator, string
class TextParser:
    """Compute word-type and n-gram frequencies for a piece of text.

    Text is split on whitespace; every token is lower-cased and stripped of
    the ASCII punctuation characters listed in ``string.punctuation``.
    Tokens that consist only of punctuation are discarded.

    The ``get_*_count`` methods report on the most recent call of the
    corresponding analysis method on this instance.
    """

    def __init__(self):
        self._word_types = {}
        self._n_grams = {}
        self._word_count = 0
        # Initialised here so the attributes exist before any analysis runs.
        self._word_type_count = 0
        self._n_gram_count = 0

    # Method returns Word Types sorted by highest frequency
    def get_word_types(self, content):
        """Return ``(word, frequency)`` pairs, most frequent first.

        Also refreshes the stored word count and word-type count.
        """
        words = self._create_word_list(content)
        self._word_types = self._tally(words)
        self._word_type_count = len(self._word_types)
        return self._sorted_by_frequency(self._word_types)

    # Method returns N-grams sorted by highest frequency
    def get_n_grams(self, content, n):
        """Return ``(n_gram, frequency)`` pairs, most frequent first.

        Each n-gram is ``n`` consecutive normalised words joined by a single
        space. A text with fewer than ``n`` words yields an empty list.
        Also refreshes the stored word count and n-gram count.

        Raises:
            ValueError: if ``n`` is smaller than 1 (a window of zero or
                negative width would only produce empty "n-grams").
        """
        if n < 1:
            raise ValueError('n must be a positive integer, got %r' % (n,))
        words = self._create_word_list(content)
        grams = (' '.join(words[start:start + n])
                 for start in range(len(words) - n + 1))
        self._n_grams = self._tally(grams)
        self._n_gram_count = len(self._n_grams)
        return self._sorted_by_frequency(self._n_grams)

    # Method returns list of words from the input text and performs the word count
    def _create_word_list(self, text):
        """Return the normalised words of ``text`` and store how many there are."""
        punctuation = set(string.punctuation)
        words = []
        for token in text.split():
            # Filtering character by character behaves the same on Python 2
            # and Python 3, unlike str.translate/string.maketrans.
            cleaned = ''.join(ch for ch in token.lower() if ch not in punctuation)
            # A token made only of punctuation (e.g. "--") is not a word.
            if cleaned:
                words.append(cleaned)
        self._word_count = len(words)
        return words

    @staticmethod
    def _tally(items):
        """Return a dict mapping each item to its number of occurrences."""
        counts = {}
        for item in items:
            counts[item] = counts.get(item, 0) + 1
        return counts

    @staticmethod
    def _sorted_by_frequency(counts):
        """Return the ``(key, count)`` pairs of ``counts``, highest count first."""
        # dict.items() exists on both Python 2 and 3 (iteritems() does not).
        return sorted(counts.items(), key=operator.itemgetter(1), reverse=True)

    # Method returns Word Count
    def get_word_count(self):
        """Return the number of words seen by the most recent analysis call."""
        return self._word_count

    # Method returns Word Type Count
    def get_word_type_count(self):
        """Return the number of distinct words from the last get_word_types call."""
        return len(self._word_types)

    # Method returns N-gram Count
    def get_n_gram_count(self):
        """Return the number of distinct n-grams from the last get_n_grams call."""
        return len(self._n_grams)
#**********Test Section****************
# Demo driver: analyses a short sample text and prints word-type, bi-gram and
# tri-gram frequencies followed by the summary counts.
txt = '''Huck and Tom meet the rest of the town boys, and they all go to a hidden cave two miles down the river. In the cave,
Tom declares that the band of robbers will be called "Tom Sawyer's Gang" and "Everybody that wants to join has got to take an oath,
and write his name in blood." The boys all swear that, if a gang member tells the gang's secrets, they will cut his throat and then kill that boy's family.
One of the boys says the oath is not fair because Huck Finn does not have a family unless you count a father who can never be found.
A solution is found when Huck offers Miss Watson as his family and says, "they could kill her."'''
w = TextParser()
test = w.get_word_types(txt)
test2 = w.get_n_grams(txt,2)
test3 = w.get_n_grams(txt,3)
# print(...) with a single argument produces the same output on Python 2 and
# Python 3, whereas the bare print statement is a syntax error on Python 3.
print('*****Word Types*****')
for word, freq in test:
    print(word+' '+str(freq))
print('******Bi-grams******')
for phrase, freq in test2:
    print(phrase+' '+str(freq))
print('******Tri-grams*****')
for phrase, freq in test3:
    print(phrase+' '+str(freq))
print('********Count*******')
print('Word Count = '+str(w.get_word_count()))
print('Word Type Count = '+str(w.get_word_type_count()))
# The parser keeps only the latest n-gram table, so this is the tri-gram count.
print('Word N-gram Count = '+str(w.get_n_gram_count()))
'''
**********************Output********************************
*****Word Types*****
the 9
and 6
a 5
that 4
boys 4
family 3
huck 3
to 3
tom 3
his 3
they 3
of 3
all 2
says 2
be 2
is 2
kill 2
in 2
cave 2
found 2
not 2
oath 2
will 2
gang 2
because 1
gangs 1
when 1
finn 1
one 1
rest 1
sawyers 1
down 1
throat 1
as 1
declares 1
have 1
go 1
miss 1
tells 1
if 1
everybody 1
member 1
fair 1
her 1
offers 1
father 1
two 1
write 1
swear 1
take 1
then 1
got 1
hidden 1
has 1
can 1
wants 1
unless 1
who 1
solution 1
watson 1
cut 1
blood 1
an 1
you 1
never 1
town 1
join 1
name 1
band 1
could 1
count 1
secrets 1
miles 1
does 1
meet 1
river 1
robbers 1
called 1
******Bi-grams******
of the 2
the boys 2
if a 1
take an 1
huck offers 1
the river 1
family one 1
oath is 1
will cut 1
to join 1
join has 1
found when 1
huck finn 1
all go 1
tom sawyers 1
tells the 1
his name 1
they will 1
town boys 1
miss watson 1
never be 1
in the 1
watson as 1
could kill 1
and everybody 1
family unless 1
and write 1
be found 1
not fair 1
gang member 1
cave tom 1
one of 1
cut his 1
can never 1
will be 1
tom declares 1
throat and 1
hidden cave 1
family and 1
is found 1
cave two 1
name in 1
blood the 1
called tom 1
to a 1
fair because 1
does not 1
the cave 1
rest of 1
everybody that 1
father who 1
two miles 1
swear that 1
when huck 1
gangs secrets 1
that boys 1
a hidden 1
and they 1
says the 1
that if 1
and then 1
in blood 1
is not 1
oath and 1
member tells 1
declares that 1
they all 1
boys all 1
boys family 1
they could 1
solution is 1
be called 1
the gangs 1
to take 1
the rest 1
found a 1
then kill 1
meet the 1
gang and 1
and tom 1
of robbers 1
tom meet 1
the town 1
his throat 1
down the 1
because huck 1
a family 1
not have 1
has got 1
wants to 1
and says 1
offers miss 1
river in 1
unless you 1
as his 1
have a 1
kill that 1
a father 1
boys and 1
robbers will 1
a gang 1
the band 1
finn does 1
band of 1
that wants 1
you count 1
count a 1
secrets they 1
all swear 1
the oath 1
miles down 1
go to 1
that the 1
an oath 1
who can 1
his family 1
a solution 1
boys says 1
huck and 1
got to 1
sawyers gang 1
write his 1
says they 1
kill her 1
******Tri-grams*****
town boys and 1
robbers will be 1
secrets they will 1
a gang member 1
to join has 1
kill that boys 1
a family unless 1
of robbers will 1
that wants to 1
all swear that 1
down the river 1
cave tom declares 1
could kill her 1
rest of the 1
watson as his 1
huck offers miss 1
got to take 1
band of robbers 1
go to a 1
boys says the 1
his throat and 1
boys family one 1
they will cut 1
fair because huck 1
all go to 1
oath is not 1
then kill that 1
his name in 1
swear that if 1
count a father 1
family unless you 1
the river in 1
you count a 1
father who can 1
cave two miles 1
will be called 1
a father who 1
family one of 1
hidden cave two 1
because huck finn 1
boys all swear 1
meet the rest 1
one of the 1
called tom sawyers 1
they all go 1
not fair because 1
the oath is 1
offers miss watson 1
the boys says 1
that the band 1
unless you count 1
and says they 1
declares that the 1
never be found 1
does not have 1
write his name 1
his family and 1
member tells the 1
found when huck 1
oath and write 1
is found when 1
and they all 1
a solution is 1
not have a 1
name in blood 1
huck finn does 1
tom sawyers gang 1
cut his throat 1
wants to join 1
as his family 1
they could kill 1
boys and they 1
river in the 1
the rest of 1
who can never 1
in the cave 1
if a gang 1
miss watson as 1
be found a 1
miles down the 1
throat and then 1
gang and everybody 1
that if a 1
and tom meet 1
and everybody that 1
everybody that wants 1
tom meet the 1
the boys all 1
and write his 1
sawyers gang and 1
has got to 1
tom declares that 1
says the oath 1
the gangs secrets 1
to a hidden 1
a hidden cave 1
blood the boys 1
join has got 1
can never be 1
is not fair 1
two miles down 1
take an oath 1
gang member tells 1
finn does not 1
family and says 1
and then kill 1
of the boys 1
the town boys 1
of the town 1
in blood the 1
the cave tom 1
says they could 1
will cut his 1
to take an 1
huck and tom 1
tells the gangs 1
be called tom 1
the band of 1
gangs secrets they 1
found a solution 1
when huck offers 1
an oath and 1
solution is found 1
have a family 1
that boys family 1
********Count*******
Word Count = 127
Word Type Count = 78
Word N-gram Count = 125
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment