Skip to content

Instantly share code, notes, and snippets.

@gamingflexer
Created August 28, 2022 14:47
Show Gist options
  • Save gamingflexer/44b22a186680201a5cc178c9c043fa01 to your computer and use it in GitHub Desktop.
Save gamingflexer/44b22a186680201a5cc178c9c043fa01 to your computer and use it in GitHub Desktop.
Bunch of Cleaning Functions | ML & Backend Dev
import os,re,string,json,emoji,csv
import numpy as np
import pandas as pd
def clean_text(text):
    '''Normalise a raw string for downstream text processing.

    Steps, in order:
      1. Convert emoji to their ":name:" form and drop every ":...:" span.
      2. Lowercase the text.
      3. Drop anything enclosed in square brackets.
      4. Strip HTML markup, URLs and any leftover "<...>" tags.
      5. Delete newlines (without inserting a space in their place).
      6. Drop every word that contains a digit.
      7. Collapse each run of characters other than letters and
         ? . ! , ¿ ' into a single space.

    Note that the sentence punctuation ? . ! , ¿ ' is kept, not removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    '''
    text = emoji.demojize(text)
    # demojize renders each emoji as ":name:"; this removes those placeholders.
    # It also removes any other text sitting between two colons on one line.
    text = re.sub(r'\:(.*?)\:', '', text)
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    # NOTE(review): BeautifulSoup is not among the imports visible at the top
    # of this file -- confirm `from bs4 import BeautifulSoup` exists somewhere,
    # otherwise this line raises NameError.
    text = BeautifulSoup(text, 'lxml').get_text()
    # Patterns are raw strings so that \S, \w, \d, \. and \[ reach the regex
    # engine untouched instead of being invalid string escape sequences.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Everything except letters and ? . ! , ¿ ' becomes a single space.
    text = re.sub(r"[^a-zA-Z?.!,¿']+", " ", text)
    return text
def clean_contractions(text, mapping):
    '''Expand contractions, then strip ASCII punctuation and extra spaces.

    Curly and accent-style apostrophes are first normalised to a plain "'"
    so that keys of `mapping` can match. Every key found in the text is
    replaced by its value, in the mapping's iteration order. Afterwards all
    characters in `string.punctuation` are deleted, so of the marks handled
    by the spacing step only "¿" can still be present.

    Args:
        text: The string to process.
        mapping: Dict of contraction -> expanded form.

    Returns:
        The processed string.
    '''
    apostrophe_table = str.maketrans({"’": "'", "‘": "'", "´": "'", "`": "'"})
    result = text.translate(apostrophe_table)

    for contraction, expansion in mapping.items():
        if "" + contraction + "" in result:
            result = result.replace("" + contraction + "", "" + expansion + "")

    # Delete every ASCII punctuation character.
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    result = re.sub(punctuation_class, '', result)

    # Pad the remaining sentence marks with spaces, then squeeze runs of
    # spaces (and double quotes) down to a single space.
    result = re.sub(r"([?.!,¿])", r" \1 ", result)
    return re.sub(r'[" "]+', " ", result)
def clean_special_chars(text, punct, mapping):
    '''Normalise special characters and isolate punctuation marks.

    Three passes run in this order:
      1. Each key of `mapping` is replaced by its value.
      2. Each item of `punct` is surrounded by single spaces.
      3. A fixed set of leftovers is rewritten: zero-width space -> space,
         "…" -> " ... ", BOM removed, and two Hindi words removed.

    Args:
        text: The string to process.
        punct: Iterable of punctuation strings to pad with spaces.
        mapping: Dict of special character -> replacement.

    Returns:
        The processed string.
    '''
    for source, target in mapping.items():
        text = text.replace(source, target)

    for mark in punct:
        text = text.replace(mark, f' {mark} ')

    leftovers = (
        ('\u200b', ' '),
        ('…', ' ... '),
        ('\ufeff', ''),
        ('करना', ''),
        ('है', ''),
    )
    for source, target in leftovers:
        text = text.replace(source, target)
    return text
def correct_spelling(x, dic):
    '''Replace each known misspelling in `x` with its correction.

    Replacement is plain substring substitution, applied once per entry in
    the dictionary's iteration order.

    Args:
        x: The string to correct.
        dic: Dict of misspelling -> correct spelling.

    Returns:
        The corrected string.
    '''
    corrected = x
    for typo, fix in dic.items():
        corrected = corrected.replace(typo, fix)
    return corrected
def remove_space(text):
    '''Trim the text and collapse every whitespace run to a single space.'''
    words = text.strip().split()
    return " ".join(words)
def text_preprocessing_pipeline(text):
    '''Run the full cleaning chain on one string and return the result.

    Stages, in order: clean_text, clean_contractions, clean_special_chars,
    correct_spelling, remove_space. The lookup tables contraction_mapping,
    punct, punct_mapping and mispell_dict are read from module scope.
    '''
    cleaned = clean_text(text)
    expanded = clean_contractions(cleaned, contraction_mapping)
    normalised = clean_special_chars(expanded, punct, punct_mapping)
    respelled = correct_spelling(normalised, mispell_dict)
    return remove_space(respelled)
# hexcode
def hexcode_clean(text):
    '''Remove every non-ASCII character from `text`.

    Args:
        text: The string to clean.

    Returns:
        `text` with all characters outside the range U+0000-U+007F deleted.
    '''
    # The original line omitted the re.sub call and returned a 3-tuple of
    # (pattern, replacement, text) instead of the cleaned string.
    return re.sub(r'[^\x00-\x7f]', r'', text)
# Delete .json, .csv and .png files found anywhere under a directory tree
def cleandir(path, exts=('.json', '.csv', '.png')):
    '''Delete files with the given extensions anywhere under `path`.

    The directory tree is walked recursively. Extension matching is
    case-insensitive. Directories themselves are never removed.

    Args:
        path: Root directory to clean.
        exts: A single extension or an iterable of extensions (each with its
            leading dot) to delete. Defaults to .json, .csv and .png.

    Raises:
        OSError: If a matching file cannot be removed.
    '''
    if isinstance(exts, str):
        exts = (exts,)
    # Build the suffix tuple once; file names are lowercased before the
    # comparison, so the suffixes must be lowercase as well.
    suffixes = tuple(ext.lower() for ext in exts)

    for root, _dirs, files in os.walk(path):
        for filename in files:
            if filename.lower().endswith(suffixes):
                os.remove(os.path.join(root, filename))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment