Skip to content

Instantly share code, notes, and snippets.

@amnrzv
Last active November 1, 2017 13:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amnrzv/596ba910524e0b1b4e8fa2167fd773bf to your computer and use it in GitHub Desktop.
Save amnrzv/596ba910524e0b1b4e8fa2167fd773bf to your computer and use it in GitHub Desktop.
Python NLTK vocabulary analysis example.
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# File locations, resolved relative to the current working directory.
input_file = "./input.txt"    # text to be analysed
words_file = "./words.txt"    # curriculum words, read one per line
output_file = "./output.txt"  # report produced by analyze_input()

# Single lemmatizer instance shared by lemmatize_words().
wordnet_lemmatizer = WordNetLemmatizer()

# Module-level state filled in by the processing functions below.
curriculum_words = []   # words read from words_file
pos_tagged_array = []   # (word, tag) pairs kept by process_input()
base_words = []         # lemmatized, lower-cased input words
def process_words_file():
    """
    Read the words file and store its words in ``curriculum_words``.

    The module-level ``curriculum_words`` list is replaced on every call.
    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped so they do not become empty-string "words"
    (which would otherwise show up as a blank row in the report).

    An error raised while reading lines is printed rather than propagated;
    the words collected up to that point are kept. A failure to open the
    file is not caught and propagates to the caller.
    """
    global curriculum_words
    curriculum_words = []
    with open(words_file) as curriculum_file:
        try:
            for line in curriculum_file:
                word = line.strip()
                if word:
                    curriculum_words.append(word)
        except Exception as e:
            # Best-effort: report the problem and keep what was read so far.
            print(e)
def process_input():
    """
    Read the input file, tokenize each non-blank line and POS-tag its words.

    Every (word, tag) pair whose tag contains at least one word character
    is appended to the module-level ``pos_tagged_array``; pairs whose tag
    has no word character are dropped (presumably punctuation-only tags --
    confirm against the tagger's tag set). The list is not cleared first,
    so repeated calls accumulate.

    An error raised while reading, tokenizing or tagging is printed rather
    than propagated; the pairs collected up to that point are kept. A
    failure to open the file is not caught and propagates to the caller.
    """
    # Raw string: '\w' in a plain literal is an invalid escape sequence.
    # Compiled once here instead of being looked up for every tag.
    word_char_tag = re.compile(r'\w.*')
    with open(input_file) as input_text:
        try:
            for line in input_text:
                if not line.strip():
                    continue
                for tag in nltk.pos_tag(word_tokenize(line)):
                    # eliminating unnecessary POS tags
                    if word_char_tag.search(tag[1]):
                        pos_tagged_array.append(tag)
        except Exception as e:
            # Best-effort: report the problem and keep what was tagged so far.
            print(e)
def lemmatize_words():
    """
    Reduce every tagged input word to its base form.

    For each entry of ``pos_tagged_array`` the word (first element) is
    lower-cased and lemmatized with the part-of-speech argument 'v'; the
    results are appended, in order, to the module-level ``base_words``.
    """
    base_words.extend(
        wordnet_lemmatizer.lemmatize(entry[0].lower(), 'v')
        for entry in pos_tagged_array
    )
def analyze_input():
    """
    Write, for each curriculum word, how often it occurs in ``base_words``.

    One line per entry of ``curriculum_words`` is written to
    ``output_file`` (overwriting it), in list order: the word left-justified
    to 15 columns, `` | ``, then the count right-justified to 10 columns.
    """
    from collections import Counter

    # Tally once instead of rescanning base_words for every curriculum word.
    occurrences = Counter(base_words)
    # The context manager closes the file even if a write fails.
    with open(output_file, 'w') as output:
        output.writelines(
            "%-15s | %10s\n" % (curriculum_word, str(occurrences[curriculum_word]))
            for curriculum_word in curriculum_words
        )
# Run the pipeline; each stage relies on the module-level state
# populated by the stages before it, so the order matters.
for stage in (
    process_words_file,  # fill curriculum_words
    process_input,       # fill pos_tagged_array
    lemmatize_words,     # fill base_words
    analyze_input,       # write the report
):
    stage()
Ruby... Ruby, can you hear me?
Moli? Moli, where are you?
Moli?
Ruby, I've crashed.
Yeah... But where?
I'm hurt, Ruby. Can you find me?
Okay, I can see plants.
I can see rocks.
I can see water.
I can't see aliens... Am I alone here?
I'm hungry.
I have food!
Sandwiches. Fruit. Salad. And... chocolate!
Hmmm... I'm thirsty.
It's not bad.
I can't cross.
Hmmm...
Okay, think. Think!
My Molicom. I remember the password!
I have to use my Molicom.
My name is Ruby Rei.
I'm a space explorer.
This is Moli 3.
She's a robot.
And she's my best friend.
We love travelling.
And we love adventure!
My name is Ruby Rei.
And my life is... interesting.
act
be
begin
believe
break
call
can
change
choose
clean
come
cross
decide
do
drink
drive
drop
eat
end
enjoy
explore
fall
feel
find
finish
fish
fix
fly
follow
forget
get
give
glue
go
grow
guess
happen
hate
have
hear
help
hide
hit
hold
hop
hope
hurt
improve
join
jump
keep
kick
know
land
learn
let
lie
like
listen
live
look
love
act | 0
be | 6
begin | 0
believe | 0
break | 0
call | 0
can | 5
change | 0
choose | 0
clean | 0
come | 0
cross | 1
decide | 0
do | 0
drink | 0
drive | 0
drop | 0
eat | 0
end | 0
enjoy | 0
explore | 0
fall | 0
feel | 0
find | 1
finish | 0
fish | 0
fix | 0
fly | 0
follow | 0
forget | 0
get | 0
give | 0
glue | 0
go | 0
grow | 0
guess | 0
happen | 0
hate | 0
have | 2
hear | 1
help | 0
hide | 0
hit | 0
hold | 0
hop | 0
hope | 0
hurt | 1
improve | 0
join | 0
jump | 0
keep | 0
kick | 0
know | 0
land | 0
learn | 0
let | 0
lie | 0
like | 0
listen | 0
live | 0
look | 0
love | 2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment