@jkff
Created December 22, 2017 06:43
import nltk
import pickle
import random
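# Note (an addition to the original gist): nltk.sent_tokenize, nltk.word_tokenize
# and nltk.pos_tag below need nltk data packages; if they are missing, run
# nltk.download('punkt') and nltk.download('averaged_perceptron_tagger') once.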
# I cleaned up the data manually in Vim.
with open('movie_lines.tsv', encoding='utf8') as f:
    lines = f.readlines()
random.shuffle(lines)
tagged = [
    # Split each line into sentences, split sentences into words, and tag each
    # word with its part of speech (POS).
    nltk.pos_tag(nltk.word_tokenize(sentence))
    for line in lines
    for sentence in nltk.sent_tokenize(line)]
# nltk is pretty slow, so it's a good idea to save the result and load it
# later to play with it without redoing the POS tagging.
pickle.dump(tagged, open('movie_lines_tagged.p', 'wb'))
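# Illustrative addition (not part of the original gist): in a later session the
# tagged data can be reloaded like this, assuming the same file name, so the
# slow tagging step does not have to be rerun.
# with open('movie_lines_tagged.p', 'rb') as f:
#     tagged = pickle.load(f)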
cleaned = [
    [word.lower()
     for word, pos in tagged_phrase
     # Exclude singular and plural proper nouns: they make up about 50% of
     # the unique words. We could do more cleaning, e.g. normalize word forms.
     # That depends on the learning goal and the language.
     if pos not in ('NNP', 'NNPS')]
    for tagged_phrase in tagged]
pickle.dump(cleaned, open('movie_lines_cleaned.p', 'wb'))
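# Illustrative sketch (an addition, not part of the original gist): one way to
# "normalize word forms", as mentioned above, is lemmatization with nltk's
# WordNetLemmatizer. This assumes the 'wordnet' data (and on newer nltk
# versions also 'omw-1.4') has been downloaded via nltk.download(); the
# names wordnet_pos and lemmatized are made up for this example.
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def wordnet_pos(tag):
    # Map Penn Treebank tags (JJ*, VB*, RB*, ...) to coarse WordNet POS classes;
    # everything else is treated as a noun, WordNetLemmatizer's default.
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

lemmatized = [
    [lemmatizer.lemmatize(word.lower(), wordnet_pos(pos))
     for word, pos in tagged_phrase
     if pos not in ('NNP', 'NNPS')]
    for tagged_phrase in tagged]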