# duttashi/cleaning_text_data_using_regex.py

## cleaning_text_data_using_regex.py
# Assumes the text data is already loaded in a DataFrame named `df`,
# with the raw text stored in its `text` column.
# Regular expressions are then used to clean that text.

import re

# Patterns are compiled once, up front, instead of being passed as strings on
# every row. All of them are raw strings so that `\S`, `\B` and `\w` reach the
# regex engine untouched (a non-raw '\s' triggers an invalid-escape warning).
_HANDLE_RE = re.compile(r'@\S+')      # Twitter handles: "@" up to the next whitespace
_HASHTAG_RE = re.compile(r'\B#\S+')   # hashtags: "#" not preceded by a word character
_URL_RE = re.compile(r'http\S+')      # URLs: "http" up to the next whitespace
_WORD_RE = re.compile(r'\w+')         # runs of word characters (letters, digits, "_")


def clean_text(text):
    """Return *text* stripped of handles, hashtags, URLs and special characters.

    The steps run in this order:

    1. remove Twitter handles (``@name``),
    2. remove hashtags (``#tag``),
    3. remove URLs (anything starting with ``http``),
    4. keep only runs of word characters, joined by a single space.

    Step 4 already yields single-space-separated tokens with no leading or
    trailing whitespace, so no separate "collapse multiple spaces" pass is
    needed afterwards.

    Args:
        text: The raw text of one row.

    Returns:
        The cleaned string. Values that are not ``str`` (for example missing
        values) are returned unchanged instead of raising ``TypeError``.
    """
    if not isinstance(text, str):
        return text
    for pattern in (_HANDLE_RE, _HASHTAG_RE, _URL_RE):
        text = pattern.sub('', text)
    return ' '.join(_WORD_RE.findall(text))


# Clean every row of the text column in a single pass.
df.text = df.text.apply(clean_text)