Last active
April 25, 2020 03:52
-
-
Save tcramm0nd/e328b8981c73220bd9f39dee2dc1c856 to your computer and use it in GitHub Desktop.
Basic NLP Text Preprocessing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def text_preprocessing(text):
    '''Basic NLP text cleaner.

    Lowercases the input, strips bracketed spans, URLs, HTML tags,
    punctuation, newlines, and digit-containing words, then tokenizes
    and removes English stopwords.

    Parameters
    ----------
    text : str
        The raw corpus/text to clean.

    Returns
    -------
    str
        The cleaned, space-joined tokens.

    Requires the following to be imported:
        re
        string
        nltk
        from nltk.corpus import stopwords
    '''
    # text cleaning — raw strings (r'...') avoid invalid-escape warnings
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)                # bracketed spans, e.g. [chorus]
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # URLs
    text = re.sub(r'<.*?>+', '', text)                 # HTML/XML tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # punctuation
    text = re.sub(r'\n', '', text)                     # newlines
    text = re.sub(r'\w*\d\w*', '', text)               # words containing digits
    # splits the text into tokens using a regular expression
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokenized_text = tokenizer.tokenize(text)
    # a set of English stopwords to remove (set gives O(1) membership tests)
    stop_words = set(stopwords.words('english'))
    # creates a new list of tokenized words without the stopwords
    no_stop_words = [w for w in tokenized_text if w not in stop_words]
    text = ' '.join(no_stop_words)
    return text
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.