# chricke/preprocess_text.py

## preprocess_text.py
from collections import Counter

# Placeholder tokens substituted for punctuation marks and newlines, so that
# each mark survives whitespace splitting as a separate vocabulary entry.
tokenized_punctuation = {
    '.': '||Period||',
    ',': '||Comma||',
    '"': '||Quotation_Mark||',
    ';': '||Semicolon||',
    '!': '||Exclamation_Mark||',
    '?': '||Question_Mark||',
    '(': '||Left_Parentheses||',
    ')': '||Right_Parentheses||',
    '--': '||Dash||',
    '\n': '||Return||',
}

# NOTE(review): `clean_text` is not defined in this file. The join below
# requires an iterable of strings; confirm where it is supplied from.
text = "\n".join(clean_text)

# Surround every token with spaces so that split() below isolates it from
# the neighbouring words.
for key, token in tokenized_punctuation.items():
    text = text.replace(key, ' {} '.format(token))

# lower() runs after the substitution, so the placeholder tokens are
# lowercased as well (e.g. '||period||').
text = text.lower()
text = text.split()

# Vocabulary ordered by descending frequency: the most frequent word gets
# id 0. Words with equal counts keep their first-seen order (stable sort).
word_counts = Counter(text)
sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
int_to_vocab = {ii: word for ii, word in enumerate(sorted_vocab)}
vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}

# The whole corpus encoded as a list of integer word ids.
int_text = [vocab_to_int[word] for word in text]