# bdewilde/basic_text_cleaning.py

## basic_text_cleaning.py
import re

# Inline <script>/<style> elements are dropped together with their contents.
_SCRIPT_STYLE_RE = re.compile(
    r'<(script|style).*?>.*?(</\1>)', re.IGNORECASE | re.DOTALL
)
# HTML comments must go before ordinary tags: a comment may contain '>'.
_HTML_COMMENT_RE = re.compile(r'<!--(.*?)-->[\n]?', re.DOTALL)
# Whatever tags remain after the two passes above.
_HTML_TAG_RE = re.compile(r'<.*?>', re.DOTALL)
# Optional http/ftp/https scheme, a dotted host, then an optional path/query.
# Because the scheme is optional, any "word.word" token is treated as a URL.
_URL_RE = re.compile(
    r'((http|ftp|https):\/\/)?[\w\-_]+(\.[\w\-_]+)+'
    r'([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?'
)
_DIGIT_RE = re.compile(r'\d')
_WHITESPACE_RE = re.compile(r'\s+')


def _strip_html(markup):
    """Return *markup* with HTML tags, comments and script/style blocks removed.

    Pure-``re`` stand-in for ``nltk.clean_html``, which NLTK 3 replaced with
    a stub that raises ``NotImplementedError``.
    """
    stripped = _SCRIPT_STYLE_RE.sub('', markup.strip())
    stripped = _HTML_COMMENT_RE.sub('', stripped)
    # Tags become a space so that adjacent words do not fuse together.
    stripped = _HTML_TAG_RE.sub(' ', stripped)
    stripped = stripped.replace('&nbsp;', ' ')
    return stripped.strip()


def clean_text(text):
    """Normalize raw (possibly HTML) text for downstream text processing.

    Steps, in order: strip HTML markup, remove URL-like tokens, remove
    digits, drop non-ASCII characters, collapse whitespace runs to a single
    space, and lower-case the result.

    Args:
        text: The input string; may contain HTML markup.

    Returns:
        The cleaned, lower-cased string. A single leading or trailing space
        can remain where a URL or digit sat at the edge of the input.
    """
    text = _strip_html(text)
    # URLs are removed before digits: blanking digits first would split a
    # URL such as http://192.168.0.1/x into pieces the pattern cannot match.
    text = _URL_RE.sub(' ', text)
    text = _DIGIT_RE.sub(' ', text)
    # Non-ASCII characters are deleted outright, not replaced by a space.
    text = ''.join(char for char in text if ord(char) < 128)
    text = _WHITESPACE_RE.sub(' ', text)
    return text.lower()