# mohdsanadzakirizvi/bert_clean.py

## bert_clean.py
import re

# clean text from noise
def clean_text(text: str) -> str:
    """Reduce *text* to lowercase ASCII letters, apostrophes and spaces.

    Every character that is not an ASCII letter (``a-z`` / ``A-Z``) or an
    apostrophe is replaced by a single space, one space per replaced
    character, so the result has the same length as the input. The text is
    then lowercased.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    # Anything outside ASCII letters and the apostrophe becomes a space. This
    # already covers non-ASCII (Unicode) characters, so a separate
    # "strip non-ASCII" pass afterwards could never match anything and is
    # intentionally not performed.
    text = re.sub(r'[^a-zA-Z\']', ' ', text)

    # convert to lowercase to maintain consistency
    return text.lower()

# Run clean_text over train.tweet via .apply and store the result under the
# 'clean_text' key of train.
# NOTE(review): `train` is not defined in the visible code; presumably a
# pandas DataFrame with a `tweet` column loaded earlier. Confirm.
train['clean_text'] = train.tweet.apply(clean_text)
	import re

	# clean text from noise
	def clean_text(text: str) -> str:
	    """Reduce *text* to lowercase ASCII letters, apostrophes and spaces.

	    Every character that is not an ASCII letter (``a-z`` / ``A-Z``) or an
	    apostrophe is replaced by a single space, one space per replaced
	    character, so the result has the same length as the input. The text
	    is then lowercased.

	    Args:
	        text: The raw string to clean.

	    Returns:
	        The cleaned, lowercased string.
	    """
	    # Anything outside ASCII letters and the apostrophe becomes a space.
	    # This already covers non-ASCII (Unicode) characters, so a separate
	    # "strip non-ASCII" pass afterwards could never match anything and is
	    # intentionally not performed.
	    text = re.sub(r'[^a-zA-Z\']', ' ', text)

	    # convert to lowercase to maintain consistency
	    return text.lower()

	# Run clean_text over train.tweet via .apply and store the result under the
	# 'clean_text' key of train.
	# NOTE(review): `train` is not defined in the visible code; presumably a
	# pandas DataFrame with a `tweet` column loaded earlier. Confirm.
	train['clean_text'] = train.tweet.apply(clean_text)