Last active
November 1, 2017 13:02
-
-
Save amnrzv/596ba910524e0b1b4e8fa2167fd773bf to your computer and use it in GitHub Desktop.
Python NLTK vocabulary analysis example.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk | |
import re | |
from nltk.tokenize import word_tokenize | |
from nltk.stem import WordNetLemmatizer | |
# --- File locations (relative to the current working directory) ---
input_file = "./input.txt"      # text to analyse
words_file = "./words.txt"      # vocabulary list, one word per line
output_file = "./output.txt"    # report written by analyze_input()

# --- Shared state filled in by the processing steps below ---
curriculum_words = []   # words read from words_file
pos_tagged_array = []   # POS-tagged tokens collected from input_file
base_words = []         # lemmatized, lower-cased form of each tagged token

# One lemmatizer instance, reused for every token in lemmatize_words().
wordnet_lemmatizer = WordNetLemmatizer()
def process_words_file(path=None):
    """
    Read the words file and store the words in the global list.

    Each line is stripped of surrounding whitespace. Blank lines are
    skipped so that the vocabulary never contains an empty-string entry
    (which would otherwise show up as an empty row in the report).

    The global ``curriculum_words`` list is reset on every call, so
    calling this function repeatedly does not accumulate duplicates.

    Args:
        path: Optional path of the words file. When omitted, the
            module-level ``words_file`` is used.

    Returns:
        The list of words that was stored in ``curriculum_words``.
    """
    global curriculum_words
    if path is None:
        path = words_file

    curriculum_words = []
    with open(path) as curriculum_file:
        try:
            for line in curriculum_file:
                word = line.strip()
                if word:
                    curriculum_words.append(word)
        except Exception as e:
            # Best effort: report a read failure and keep whatever was
            # collected before the error occurred.
            print(e)
    return curriculum_words
def process_input():
    """
    Read the input file, tokenize each line and POS-tag the tokens.

    Every non-blank line of ``input_file`` is split with
    ``word_tokenize`` and tagged with ``nltk.pos_tag``. A tagged token
    is appended to the module-level ``pos_tagged_array`` only when its
    tag contains at least one word character, which discards tokens
    whose tag consists solely of punctuation.

    Note: ``pos_tagged_array`` is appended to, not reset, so results
    accumulate across calls.
    """
    # Raw string: in a normal literal '\w' is an invalid escape sequence
    # and triggers a warning on current Python versions. Compiled once
    # here instead of being rebuilt for every token.
    tag_has_word_char = re.compile(r'\w.*')

    with open(input_file) as input_text:
        try:
            for line in input_text:
                if not line.strip():
                    continue
                words = word_tokenize(line)
                for tag in nltk.pos_tag(words):
                    # tag[1] is the POS label; tag[0] is the token itself.
                    if tag_has_word_char.search(tag[1]):
                        pos_tagged_array.append(tag)
        except Exception as e:
            # Best effort: report the failure and keep what was tagged
            # before the error occurred.
            print(e)
def lemmatize_words():
    """
    Reduce every tagged token to its base form.

    Each entry of ``pos_tagged_array`` contributes its token (element 0),
    lower-cased and lemmatized as a verb ('v'); the results are appended
    to the module-level ``base_words`` list in the same order.
    """
    base_words.extend(
        wordnet_lemmatizer.lemmatize(tagged[0].lower(), 'v')
        for tagged in pos_tagged_array
    )
def analyze_input():
    """
    Count how often each curriculum word occurs among the base words
    and write the result to ``output_file``.

    One line is written per entry of ``curriculum_words``, in list
    order, formatted as the word left-aligned in 15 columns, `` | ``,
    and the count right-aligned in 10 columns. Words that never occur
    are reported with a count of 0.
    """
    # Local import keeps this function self-contained.
    from collections import Counter

    # Tally all base words in a single pass instead of rescanning the
    # whole list once per curriculum word.
    occurrences = Counter(base_words)

    # The context manager closes the file even if a write fails.
    with open(output_file, 'w') as output:
        for curriculum_word in curriculum_words:
            count = occurrences[curriculum_word]  # missing keys yield 0
            output.write("%-15s | %10s\n" % (curriculum_word, str(count)))
# Run the pipeline: load the vocabulary, tag the input text, reduce the
# tokens to base forms, then write the per-word counts.
for pipeline_step in (
    process_words_file,
    process_input,
    lemmatize_words,
    analyze_input,
):
    pipeline_step()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Ruby... Ruby, can you hear me? | |
Moli? Moli, where are you? | |
Moli? | |
Ruby, I've crashed. | |
Yeah... But where? | |
I'm hurt, Ruby. Can you find me? | |
Okay, I can see plants. | |
I can see rocks. | |
I can see water. | |
I can't see aliens... Am I alone here? | |
I'm hungry. | |
I have food! | |
Sandwiches. Fruit. Salad. And... chocolate! | |
Hmmm... I'm thirsty. | |
It's not bad. | |
I can't cross. | |
Hmmm... | |
Okay, think. Think! | |
My Molicom. I remember the password! | |
I have to use my Molicom. | |
My name is Ruby Rei. | |
I'm a space explorer. | |
This is Moli 3. | |
She's a robot. | |
And she's my best friend. | |
We love travelling. | |
And we love adventure! | |
My name is Ruby Rei. | |
And my life is... interesting. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
act | |
be | |
begin | |
believe | |
break | |
call | |
can | |
change | |
choose | |
clean | |
come | |
cross | |
decide | |
do | |
drink | |
drive | |
drop | |
eat | |
end | |
enjoy | |
explore | |
fall | |
feel | |
find | |
finish | |
fish | |
fix | |
fly | |
follow | |
forget | |
get | |
give | |
glue | |
go | |
grow | |
guess | |
happen | |
hate | |
have | |
hear | |
help | |
hide | |
hit | |
hold | |
hop | |
hope | |
hurt | |
improve | |
join | |
jump | |
keep | |
kick | |
know | |
land | |
learn | |
let | |
lie | |
like | |
listen | |
live | |
look | |
love |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
act | 0 | |
be | 6 | |
begin | 0 | |
believe | 0 | |
break | 0 | |
call | 0 | |
can | 5 | |
change | 0 | |
choose | 0 | |
clean | 0 | |
come | 0 | |
cross | 1 | |
decide | 0 | |
do | 0 | |
drink | 0 | |
drive | 0 | |
drop | 0 | |
eat | 0 | |
end | 0 | |
enjoy | 0 | |
explore | 0 | |
fall | 0 | |
feel | 0 | |
find | 1 | |
finish | 0 | |
fish | 0 | |
fix | 0 | |
fly | 0 | |
follow | 0 | |
forget | 0 | |
get | 0 | |
give | 0 | |
glue | 0 | |
go | 0 | |
grow | 0 | |
guess | 0 | |
happen | 0 | |
hate | 0 | |
have | 2 | |
hear | 1 | |
help | 0 | |
hide | 0 | |
hit | 0 | |
hold | 0 | |
hop | 0 | |
hope | 0 | |
hurt | 1 | |
improve | 0 | |
join | 0 | |
jump | 0 | |
keep | 0 | |
kick | 0 | |
know | 0 | |
land | 0 | |
learn | 0 | |
let | 0 | |
lie | 0 | |
like | 0 | |
listen | 0 | |
live | 0 | |
look | 0 | |
love | 2 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment