@parajain · Last active September 7, 2018
'''
Basic text-cleaning script: tokenization and punctuation removal.
'''
import re
import string

from nltk.tokenize import word_tokenize

sentences = [
    "A Socrates would perhaps have refused and died in the name of truth.",
    "Hume, however, was not going to allow the stupidity of others to cut his own life short, so he did what any sensible person should do: he went along with their request without any intention of keeping his promise.",
]

def filter_sentences(sentences):
    # Split each sentence into word/punctuation tokens.
    tokenized_sentences = [word_tokenize(s) for s in sentences]
    # Character class matching any single punctuation character.
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    tokenized_sentences_no_punctuation = []
    for s in tokenized_sentences:
        new_s = []
        for token in s:
            # Strip punctuation from the token; drop tokens that were punctuation only.
            new_token = regex.sub(u'', token)
            if new_token != u'':
                new_s.append(new_token.lower())
        tokenized_sentences_no_punctuation.append(new_s)
    print(tokenized_sentences_no_punctuation)
    return tokenized_sentences_no_punctuation

filter_sentences(sentences)
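If you would rather avoid the regex, an equivalent way to strip punctuation on Python 3 is a str.translate deletion table. This is a minimal sketch under that assumption; strip_punctuation is a hypothetical helper name, not part of the gist above.

import string

from nltk.tokenize import word_tokenize

# Translation table that deletes every punctuation character (assumes Python 3).
PUNCT_TABLE = str.maketrans('', '', string.punctuation)

def strip_punctuation(sentence):
    # Tokenize, strip punctuation, lowercase, and drop tokens that
    # consisted only of punctuation (e.g. ',' or '...').
    return [t.translate(PUNCT_TABLE).lower()
            for t in word_tokenize(sentence)
            if t.translate(PUNCT_TABLE)]

print(strip_punctuation("Hume, however, was not going to allow..."))
# ['hume', 'however', 'was', 'not', 'going', 'to', 'allow']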