Preprocess the data

Split the script into words, first replacing each punctuation symbol with a named token so that, for example, "bye!" and "bye" are not counted as different words; then build integer lookup tables over the vocabulary and encode the full text as a list of ids.
from collections import Counter

# Map each punctuation symbol to a named token so it is treated as its own word
tokenized_punctuation = {
    '.': '||Period||',
    ',': '||Comma||',
    '"': '||Quotation_Mark||',
    ';': '||Semicolon||',
    '!': '||Exclamation_Mark||',
    '?': '||Question_Mark||',
    '(': '||Left_Parentheses||',
    ')': '||Right_Parentheses||',
    '--': '||Dash||',
    '\n': '||Return||'
}

# clean_text is assumed to be the list of script lines produced earlier
text = "\n".join(clean_text)
for key, token in tokenized_punctuation.items():
    text = text.replace(key, ' {} '.format(token))

text = text.lower()
text = text.split()

# Build the vocabulary: the most frequent words get the lowest integer ids
word_counts = Counter(text)
sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
int_to_vocab = {ii: word for ii, word in enumerate(sorted_vocab)}
vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}

# Encode the full text as a list of integer ids
int_text = [vocab_to_int[word] for word in text]
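
A minimal sanity check of the resulting tables, assuming the block above has already run with a `clean_text` defined: encoding followed by decoding should reproduce the tokenized word list exactly.

# Round-trip check: ids -> words should match the tokenized text
decoded = [int_to_vocab[ii] for ii in int_text]
assert decoded == text

# Punctuation survives as its own (lowercased) tokens, e.g. '||period||'
print(int_text[:8])
print(decoded[:8])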