# gamingflexer/preprocess.py

## preprocess.py
import os,re,string,json,emoji,csv
import numpy as np
import pandas as pd

def clean_text(text):
    """Normalise raw text into lowercase words plus basic punctuation.

    Steps, in order: emoji are converted to ``:alias:`` form and dropped, the
    text is lowercased, ``[...]`` spans, HTML markup, links, newlines and
    tokens containing digits are removed, and finally every run of characters
    other than ASCII letters and ``? . ! , ¿ '`` is collapsed into one space.

    Args:
        text: Value to clean; it is converted with ``str()`` first.

    Returns:
        The cleaned string.
    """
    # BeautifulSoup is used below but is not imported at the top of this
    # file, so it is brought into scope here.
    from bs4 import BeautifulSoup

    # Coerce first so non-string input (e.g. NaN) never reaches demojize.
    text = emoji.demojize(str(text))
    # Emoji aliases contain no whitespace; excluding whitespace and ':' keeps
    # ordinary prose that merely sits between two colons.
    text = re.sub(r':[^\s:]+:', '', text)
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    # Strip HTML: parse out the text first, then drop any leftover tags below.
    text = BeautifulSoup(text, 'lxml').get_text()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # NOTE(review): newlines are deleted, not replaced by a space, so words on
    # adjacent lines are joined - confirm this is intended.
    text = text.replace('\n', '')
    text = re.sub(r'\w*\d\w*', '', text)
    # Replace everything except a-z, A-Z, ".", "?", "!", ",", "¿", "'" by a space.
    text = re.sub(r"[^a-zA-Z?.!,¿']+", " ", text)
    return text


def clean_contractions(text, mapping):
    """Expand contractions in ``text`` and strip ASCII punctuation.

    Args:
        text: String to process.
        mapping: Maps a contraction (e.g. ``"don't"``) to its expansion.

    Returns:
        The processed string, with runs of spaces collapsed into one space.
    """
    # Fold typographic apostrophe variants into the plain ASCII apostrophe.
    for quote in ("’", "‘", "´", "`"):
        text = text.replace(quote, "'")

    # Plain substring replacement; the expansion is only looked up on a hit.
    for contraction in mapping.keys():
        if "" + contraction + "" in text:
            text = text.replace("" + contraction + "", "" + mapping[contraction] + "")

    # Drop every character listed in string.punctuation.
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    text = re.sub(punctuation_class, '', text)

    # Pad sentence punctuation with spaces. "?", ".", "!" and "," were already
    # removed above, so in practice only "¿" can still match here.
    text = re.sub(r"([?.!,¿])", r" \1 ", text)
    return re.sub(r'[" "]+', " ", text)


def clean_special_chars(text, punct, mapping):
    """Normalise special characters and isolate punctuation marks.

    Args:
        text: String to process.
        punct: Iterable of punctuation strings to surround with spaces.
        mapping: Maps a special character to its replacement.

    Returns:
        The processed string.
    """
    for original, replacement in mapping.items():
        text = text.replace(original, replacement)

    # Put a space on each side of every listed punctuation mark.
    for mark in punct:
        text = text.replace(mark, f' {mark} ')

    # Fixed clean-ups applied last: zero-width space, ellipsis, byte-order
    # mark and two Devanagari tokens.
    leftovers = (
        ('\u200b', ' '),
        ('…', ' ... '),
        ('\ufeff', ''),
        ('करना', ''),
        ('है', ''),
    )
    for target, substitute in leftovers:
        text = text.replace(target, substitute)

    return text


def correct_spelling(x, dic):
    """Apply every misspelling -> correction pair in ``dic`` to ``x``.

    Replacement is by plain substring, so a key is also replaced when it
    occurs inside a longer word.

    Args:
        x: String to correct.
        dic: Maps a misspelling to its corrected form.

    Returns:
        The corrected string.
    """
    corrected = x
    for misspelling, fix in dic.items():
        corrected = corrected.replace(misspelling, fix)
    return corrected


def remove_space(text):
    """Trim ``text`` and collapse each run of whitespace into one space.

    Args:
        text: String to tidy.

    Returns:
        The whitespace-separated words of ``text`` joined by single spaces.
    """
    words = text.strip().split()
    return " ".join(words)


def text_preprocessing_pipeline(text):
    """Run ``text`` through every cleaning stage and return the result.

    Stages, in order: ``clean_text``, ``clean_contractions``,
    ``clean_special_chars``, ``correct_spelling``, ``remove_space``.

    Reads the module-level names ``contraction_mapping``, ``punct``,
    ``punct_mapping`` and ``mispell_dict``.
    NOTE(review): these are not defined in the visible part of this file -
    confirm they exist before this function is called.

    Args:
        text: Raw text to clean.

    Returns:
        The fully cleaned string.
    """
    cleaned = clean_text(text)
    expanded = clean_contractions(cleaned, contraction_mapping)
    normalised = clean_special_chars(expanded, punct, punct_mapping)
    corrected = correct_spelling(normalised, mispell_dict)
    return remove_space(corrected)


# hexcode
def hexcode_clean(text):
    """Remove every non-ASCII character from ``text``.

    Args:
        text: String to filter.

    Returns:
        ``text`` without any character outside the range U+0000-U+007F.
    """
    # The substitution has to go through re.sub; a bare parenthesised triple
    # would only build a tuple and leave the text untouched.
    return re.sub(r'[^\x00-\x7f]', r'', text)

#clean temp dir for files
def cleandir(path):
    """Delete all ``.json``, ``.csv`` and ``.png`` files below ``path``.

    The directory tree is walked recursively and the extension check ignores
    case. Directories themselves are left in place.

    Args:
        path: Root directory to clean.
    """
    temp_suffixes = ('.json', '.csv', '.png')
    for folder, _subdirs, filenames in os.walk(path):
        for filename in filenames:
            if not filename.lower().endswith(temp_suffixes):
                continue
            os.remove(os.path.join(folder, filename))
	import os,re,string,json,emoji,csv
	import numpy as np
	import pandas as pd

	def clean_text(text):
	    """Normalise raw text into lowercase words plus basic punctuation.

	    Steps, in order: emoji are converted to ``:alias:`` form and dropped,
	    the text is lowercased, ``[...]`` spans, HTML markup, links, newlines
	    and tokens containing digits are removed, and finally every run of
	    characters other than ASCII letters and ``? . ! , ¿ '`` is collapsed
	    into one space.

	    Args:
	        text: Value to clean; it is converted with ``str()`` first.

	    Returns:
	        The cleaned string.
	    """
	    # BeautifulSoup is used below but is not imported at the top of this
	    # file, so it is brought into scope here.
	    from bs4 import BeautifulSoup

	    # Coerce first so non-string input (e.g. NaN) never reaches demojize.
	    text = emoji.demojize(str(text))
	    # Emoji aliases contain no whitespace; excluding whitespace and ':'
	    # keeps ordinary prose that merely sits between two colons.
	    text = re.sub(r':[^\s:]+:', '', text)
	    text = text.lower()
	    text = re.sub(r'\[.*?\]', '', text)
	    # Strip HTML: parse out the text first, then drop leftover tags below.
	    text = BeautifulSoup(text, 'lxml').get_text()
	    # "|" must stay unescaped so "www." links are matched as an alternative.
	    text = re.sub(r'https?://\S+|www\.\S+', '', text)
	    text = re.sub(r'<.*?>+', '', text)
	    # NOTE(review): newlines are deleted, not replaced by a space, so words
	    # on adjacent lines are joined - confirm this is intended.
	    text = text.replace('\n', '')
	    # The "*" quantifiers make the whole digit-bearing token disappear.
	    text = re.sub(r'\w*\d\w*', '', text)
	    # Replace everything except a-z, A-Z, ".", "?", "!", ",", "¿", "'" by a space.
	    text = re.sub(r"[^a-zA-Z?.!,¿']+", " ", text)
	    return text


	def clean_contractions(text, mapping):
	    """Expand contractions in ``text`` and strip ASCII punctuation.

	    Args:
	        text: String to process.
	        mapping: Maps a contraction (e.g. ``"don't"``) to its expansion.

	    Returns:
	        The processed string, with runs of spaces collapsed into one space.
	    """
	    # Fold typographic apostrophe variants into the plain ASCII apostrophe.
	    for quote in ("’", "‘", "´", "`"):
	        text = text.replace(quote, "'")

	    # Plain substring replacement; the expansion is only looked up on a hit.
	    for contraction in mapping.keys():
	        if contraction in text:
	            text = text.replace(contraction, mapping[contraction])

	    # Drop every character listed in string.punctuation.
	    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

	    # Pad sentence punctuation with spaces. "?", ".", "!" and "," were
	    # already removed above, so in practice only "¿" can still match here.
	    text = re.sub(r"([?.!,¿])", r" \1 ", text)
	    text = re.sub(r'[" "]+', " ", text)
	    return text


	def clean_special_chars(text, punct, mapping):
	    """Normalise special characters and isolate punctuation marks.

	    Args:
	        text: String to process.
	        punct: Iterable of punctuation strings to surround with spaces.
	        mapping: Maps a special character to its replacement.

	    Returns:
	        The processed string.
	    """
	    for p in mapping:
	        text = text.replace(p, mapping[p])

	    # Put a space on each side of every listed punctuation mark.
	    for p in punct:
	        text = text.replace(p, f' {p} ')

	    # Fixed clean-ups applied last: zero-width space, ellipsis, byte-order
	    # mark and two Devanagari tokens.
	    specials = {'\u200b': ' ', '…': ' ... ',
	                '\ufeff': '', 'करना': '', 'है': ''}
	    for s in specials:
	        text = text.replace(s, specials[s])

	    return text


	def correct_spelling(x, dic):
	    """Apply every misspelling -> correction pair in ``dic`` to ``x``.

	    Replacement is by plain substring, so a key is also replaced when it
	    occurs inside a longer word.

	    Args:
	        x: String to correct.
	        dic: Maps a misspelling to its corrected form.

	    Returns:
	        The corrected string.
	    """
	    for word in dic.keys():
	        x = x.replace(word, dic[word])
	    return x


	def remove_space(text):
	    """Trim ``text`` and collapse each run of whitespace into one space.

	    Args:
	        text: String to tidy.

	    Returns:
	        The whitespace-separated words of ``text`` joined by single spaces.
	    """
	    return " ".join(text.strip().split())


	def text_preprocessing_pipeline(text):
	    """Run ``text`` through every cleaning stage and return the result.

	    Stages, in order: ``clean_text``, ``clean_contractions``,
	    ``clean_special_chars``, ``correct_spelling``, ``remove_space``.

	    Reads the module-level names ``contraction_mapping``, ``punct``,
	    ``punct_mapping`` and ``mispell_dict``.
	    NOTE(review): these are not defined in the visible part of this file -
	    confirm they exist before this function is called.

	    Args:
	        text: Raw text to clean.

	    Returns:
	        The fully cleaned string.
	    """
	    text = clean_text(text)
	    text = clean_contractions(text, contraction_mapping)
	    text = clean_special_chars(text, punct, punct_mapping)
	    text = correct_spelling(text, mispell_dict)
	    text = remove_space(text)
	    return text


	# hexcode
	def hexcode_clean(text):
	    """Remove every non-ASCII character from ``text``.

	    Args:
	        text: String to filter.

	    Returns:
	        ``text`` without any character outside the range U+0000-U+007F.
	    """
	    # The substitution has to go through re.sub; a bare parenthesised
	    # triple would only build a tuple and leave the text untouched.
	    return re.sub(r'[^\x00-\x7f]', r'', text)

	#clean temp dir for files
	def cleandir(path):
	    """Delete all ``.json``, ``.csv`` and ``.png`` files below ``path``.

	    The directory tree is walked recursively and the extension check
	    ignores case. Directories themselves are left in place.

	    Args:
	        path: Root directory to clean.
	    """
	    exts = ('.json', '.csv', '.png')
	    for root, dirs, files in os.walk(path):
	        for currentFile in files:
	            if currentFile.lower().endswith(exts):
	                os.remove(os.path.join(root, currentFile))