@jkff
Created December 22, 2017 06:43
import nltk
import pickle
import random
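# Note (an addition to the original gist): nltk.sent_tokenize, nltk.word_tokenize
# and nltk.pos_tag below need nltk data packages; if they are missing, run
# nltk.download('punkt') and nltk.download('averaged_perceptron_tagger') once.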
# I cleaned up the data manually in Vim.
with open('movie_lines.tsv', encoding='utf8') as f:
    lines = f.readlines()
random.shuffle(lines)
tagged = [
    # Split each line into sentences, split sentences into words, and tag each
    # word with its part of speech (POS).
    nltk.pos_tag(nltk.word_tokenize(sentence))
    for line in lines
    for sentence in nltk.sent_tokenize(line)]
# nltk is pretty slow, so it's a good idea to save the result and load it
# later to play with it without redoing the POS tagging.
pickle.dump(tagged, open('movie_lines_tagged.p', 'wb'))
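# Illustrative addition (not part of the original gist): in a later session the
# tagged data can be reloaded like this, assuming the same file name, so the
# slow tagging step does not have to be rerun.
# with open('movie_lines_tagged.p', 'rb') as f:
#     tagged = pickle.load(f)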
cleaned = [
    [word.lower()
     for word, pos in tagged_phrase
     # Exclude singular and plural proper nouns: they make up about 50% of
     # the unique words. We could do more cleaning, e.g. normalize word forms.
     # That depends on the learning goal and the language.
     if pos not in ('NNP', 'NNPS')]
    for tagged_phrase in tagged]
pickle.dump(cleaned, open('movie_lines_cleaned.p', 'wb'))
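# Illustrative sketch (an addition, not part of the original gist): one way to
# "normalize word forms", as mentioned above, is lemmatization with nltk's
# WordNetLemmatizer. This assumes the 'wordnet' data (and on newer nltk
# versions also 'omw-1.4') has been downloaded via nltk.download(); the
# names wordnet_pos and lemmatized are made up for this example.
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def wordnet_pos(tag):
    # Map Penn Treebank tags (JJ*, VB*, RB*, ...) to coarse WordNet POS classes;
    # everything else is treated as a noun, WordNetLemmatizer's default.
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

lemmatized = [
    [lemmatizer.lemmatize(word.lower(), wordnet_pos(pos))
     for word, pos in tagged_phrase
     if pos not in ('NNP', 'NNPS')]
    for tagged_phrase in tagged]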