@parajain · Last active September 7, 2018
'''
Basic text-cleaning script: tokenization and punctuation removal.
'''
import re
import string

from nltk.tokenize import word_tokenize

sentences = [
    "A Socrates would perhaps have refused and died in the name of truth.",
    "Hume, however, was not going to allow the stupidity of others to cut his own life short, so he did what any sensible person should do: he went along with their request without any intention of keeping his promise.",
]

def filter_sentences(sentences):
    # Split each sentence into word/punctuation tokens.
    tokenized_sentences = [word_tokenize(s) for s in sentences]
    # Character class matching any single punctuation character.
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    tokenized_sentences_no_punctuation = []
    for s in tokenized_sentences:
        new_s = []
        for token in s:
            # Strip punctuation from the token; drop tokens that were punctuation only.
            new_token = regex.sub(u'', token)
            if new_token != u'':
                new_s.append(new_token.lower())
        tokenized_sentences_no_punctuation.append(new_s)
    print(tokenized_sentences_no_punctuation)
    return tokenized_sentences_no_punctuation

filter_sentences(sentences)
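If you would rather avoid the regex, an equivalent way to strip punctuation on Python 3 is a str.translate deletion table. This is a minimal sketch under that assumption; strip_punctuation is a hypothetical helper name, not part of the gist above.

import string

from nltk.tokenize import word_tokenize

# Translation table that deletes every punctuation character (assumes Python 3).
PUNCT_TABLE = str.maketrans('', '', string.punctuation)

def strip_punctuation(sentence):
    # Tokenize, strip punctuation, lowercase, and drop tokens that
    # consisted only of punctuation (e.g. ',' or '...').
    return [t.translate(PUNCT_TABLE).lower()
            for t in word_tokenize(sentence)
            if t.translate(PUNCT_TABLE)]

print(strip_punctuation("Hume, however, was not going to allow..."))
# ['hume', 'however', 'was', 'not', 'going', 'to', 'allow']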