Skip to content

Instantly share code, notes, and snippets.

@deshwalmahesh
Last active January 8, 2022 10:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save deshwalmahesh/69d9d3a5fdf2192f2b280eca4b56d8f1 to your computer and use it in GitHub Desktop.
Save deshwalmahesh/69d9d3a5fdf2192f2b280eca4b56d8f1 to your computer and use it in GitHub Desktop.
Clean the LaTeX. Use a dict with the tokenization method for faster execution
def process(sentence, use_lemmetization: bool = False, use_stemming: bool = False,
            add_pos: bool = False, remove_length: bool = False):
    """Clean a (possibly MathPix/LaTeX) sentence down to plain lowercase words.

    BUG FIX: the flags were written as `use_lemmetization:False` — that is a
    type ANNOTATION of the literal False, not a default value, so every flag
    had to be passed explicitly. They now default to False (backward compatible).

    Args:
        sentence: raw text; non-strings (e.g. NaN) yield ''.
        use_lemmetization: lemmatize tokens (uses module-level `lemmetizer`).
        use_stemming: stem tokens (uses module-level `stemmer`).
        add_pos: append "_<POS tag>" to each token via nltk.pos_tag.
        remove_length: drop tokens whose length is <= this value
            (False behaves as 0, i.e. keep everything non-empty).
    Returns:
        Cleaned, space-joined string, or '' for non-string/empty input.
    """
    if not isinstance(sentence, str):
        return ''
    a = sentence.lower()
    # collapse newlines into spaces
    a = re.sub(r"\n+", " ", a)
    # drop MathPix math spans \( ... \) but keep the words inside \text{...}
    a = re.sub(r'\s*\\+\((.*?)\\+\)',
               lambda x: " ".join(re.findall(r'\\text\s*{([^{}]*)}', x.group(1))), a)
    # strip any remaining backslash commands (old MathPix API output)
    a = re.sub(r"(\\[^ ]+)", ' ', a)
    a = re.sub(r'[^a-zA-Z]', ' ', a)      # keep letters only
    a = re.sub(r"\s+", " ", a)            # squeeze repeated whitespace
    a = a.strip()
    if not a:
        return ''
    tokens = a.split(' ')
    if use_lemmetization:
        tokens = [lemmetizer.lemmatize(token) for token in tokens]
    if use_stemming:
        tokens = [stemmer.stem(token) for token in tokens]
    if add_pos:
        tokens = [token + "_" + tag for token, tag in nltk.pos_tag(tokens)]
    if remove_length:
        tokens = [t for t in tokens if len(t) > remove_length]
    return ' '.join(tokens)
def convert_numbers(x):
    """Mask digit runs in *x* with '#' placeholders, widest pattern first
    (5+ digits -> '#####', 4 -> '####', 3 -> '###', 2 -> '##')."""
    if not re.search(r'\d', x):
        return x
    for pattern, mask in ((r'[0-9]{5,}', '#####'),
                          (r'[0-9]{4}', '####'),
                          (r'[0-9]{3}', '###'),
                          (r'[0-9]{2}', '##')):
        x = re.sub(pattern, mask, x)
    return x
def convert_latex(input_str: str, to_unicode: bool = True, remove_numbers: bool = True):
    """
    Convert the LaTeX parts of a string to unicode text, e.g. \\frac {3 x } {2 } -> 3 x / 2.
    Non-LaTeX characters are left untouched; newlines survive via a sentinel token.

    Args:
        input_str: input with LaTeX & non-LaTeX characters; non-strings yield ''.
        to_unicode: whether to run the LaTeX -> unicode conversion (pylatexenc).
        remove_numbers: drop digits — they add little signal for classification.
    Returns:
        Converted string with spaces removed and lowercased.
    """
    if not isinstance(input_str, str):
        return ''
    # BUG FIX: was r'\triangle', which the regex engine reads as TAB + 'riangle'
    # (\t is a regex escape even in a raw string); the backslash must be escaped.
    input_str = re.sub(r'\\triangle', chr(9651), input_str)
    input_str = re.sub(r'\\frac', '/', input_str)
    input_str = re.sub(r'\\text\s*{([^{}]*)}', '', input_str)  # drop \text{...} content
    if remove_numbers:
        input_str = re.sub(r'[0-9]', '', input_str)
    if to_unicode:
        # map control characters to their raw escapes so the parser sees them literally
        raw_map = {8: r'\b', 7: r'\a', 12: r'\f', 10: r'\n', 13: r'\r', 9: r'\t', 11: r'\v'}
        input_str = input_str.replace("\n", " !#! ")  # sentinel keeps newlines through the parser
        input_str = LatexNodes2Text().latex_to_text(
            r''.join(i if ord(i) > 32 else raw_map.get(ord(i), i) for i in input_str))
        input_str = input_str.replace(" !#! ", "\n")
    return input_str.replace(' ', '').lower()
def convert_latex(input_str: str, to_unicode: bool = True, remove_numbers: bool = True):
    """Map LaTeX commands in a string to single unicode glyphs, then flatten the
    result (spaces/newlines removed, lowercased).

    Non-string input (pure numbers read as int, or NaN floats) yields ''.
    """
    if not isinstance(input_str, str):
        return ''
    input_str = re.sub(r'\\text\s*{([^{}]*)}', '', input_str)  # drop \text{...} content
    if remove_numbers:
        input_str = re.sub(r'[0-9]', '', input_str)
    if to_unicode:
        # Ordered pairs: longer commands precede their prefixes (\sinh before \sin).
        substitutions = (
            (r'\\bigodot', chr(416)),        # unique O-like glyph
            (r'\\hline', chr(713)),          # horizontal line
            (r'\\overline', chr(727)),
            (r'\\underline', chr(717)),
            (r'\\overbrace', chr(752)),      # upward-pointing mark
            (r'\\underbrace', chr(751)),     # downward-pointing mark
            (r'\\overrightarrow', chr(754)), # rightward-pointing mark
            (r'\\longdiv', chr(10188)),      # long division
            (r'\\jmath', chr(567)),          # dotless j
            (r'\\imath', chr(305)),          # dotless i
            (r'\\sqrt', chr(8730)),
            (r'\\Re', chr(344)),             # R-like symbol
            (r'\\triangle', chr(9651)),
            (r'\\frac', '/'),
            (r'\\widetilde', chr(771)),
            (r'\\widehat', chr(770)),
            (r'\\Varangle', chr(8736)),      # angle symbol
            (r'\\neg', chr(172)),            # negation sign
            (r'\\begin', chr(705)),          # stand-in begin marker
            (r'\\end', chr(704)),            # stand-in end marker
            (r'\\min', chr(707)),
            (r'\\max', chr(706)),
            (r'\\exp', chr(281)),            # exponential
            (r'\\lg', chr(315)),             # binary logarithm
            (r'\\ln', chr(317)),             # natural logarithm
            (r'\\log', chr(319)),            # base-10 logarithm
            (r'\\lim', chr(321)),            # limit
            (r'\\arg', chr(478)),            # stand-in for ARG
            (r'\\S$', chr(167)),             # section sign; $-anchored so \Sigma is untouched
            # trigonometric commands
            (r'\\sinh', chr(525)),
            (r'\\sin', chr(524)),
            (r'\\cosh', chr(527)),
            (r'\\cos', chr(526)),
            (r'\\tanh', chr(555)),
            (r'\\tan', chr(554)),
            (r'\\cot', chr(556)),
            (r'\\sec', chr(557)),
            (r'\\csc', chr(558)),
            (r'\\arcsin', chr(559)),
            (r'\\arccos', chr(560)),
            (r'\\arctan', chr(561)),
        )
        for pattern, glyph in substitutions:
            input_str = re.sub(pattern, glyph, input_str)
        input_str = LatexNodes2Text().latex_to_text(input_str)
    return input_str.replace('\n', '').replace(' ', '').lower()
def top_n(pipeline, x_test, y_test, n = 5):
    """Fraction of samples whose true label is among the model's n highest-probability classes."""
    scores = pipeline.predict_proba(x_test)
    best = np.argsort(scores, axis = 1)[:, -n:]
    hits = [1 if y_test[i] in best[i] else 0 for i in range(len(best))]
    return np.mean(np.array(hits))
def get_coefs(word, *arr):
    """Split one embedding-file row into (word, float32 vector)."""
    return word, np.asarray(arr, dtype='float32')
def load_embedding_dict(path: str, skip_first_line: bool = False):
    '''
    Read an embedding text file into a {word: vector} dictionary.

    args:
        path: location of the embedding file (one "word v1 v2 ..." row per line)
        skip_first_line: word2vec-style files start with a "[words dims]" header
            that must be skipped
    out:
        dict mapping each word to its float32 numpy vector, e.g. {'word': [0.3, 0.1, ...]}
    '''
    with open(path) as fh:
        rows = fh.readlines()
    # bool slices as 0/1, so the header line is skipped exactly when requested
    return dict(get_coefs(*row.split(" ")) for row in rows[skip_first_line:])
def sentence_to_mean_vect(sentence: str, dim: int = 300):
    '''
    Mean embedding vector over the words of a sentence.

    Args:
        sentence: whitespace-separated words.
        dim: dimensionality of the zero vector used for out-of-vocabulary words
            (generalized from the previously hard-coded 300).
    Returns:
        np.ndarray of shape (dim,): element-wise mean of the word vectors.
    '''
    # NOTE(review): relies on a module-level `embeddings_dict` {word: vector};
    # vectors are assumed to have length `dim` — confirm against the loader.
    vectors = [embeddings_dict.get(word) for word in sentence.split(' ')]
    vectors = [v if v is not None else np.zeros(dim) for v in vectors]
    return np.mean(vectors, axis=0)
def cross_entropy(predictions, targets):
    """Mean cross-entropy between predicted probabilities and (one-hot) targets.

    Args:
        predictions: (N, C) array of class probabilities.
        targets: (N, C) array, typically one-hot.
    Returns:
        float: -sum(targets * log(predictions)) / N.
    """
    n_samples = predictions.shape[0]
    # clip away from zero so a zero probability on a target class
    # yields a large finite loss instead of inf/nan from log(0)
    safe_preds = np.clip(predictions, 1e-12, None)
    return -np.sum(targets * np.log(safe_preds)) / n_samples
import pandas as pd
import re
import numpy as np
def fetch_old_data(path, subject):
    """Load the legacy CSV and normalise it to the new column schema.

    Args:
        path: CSV file path.
        subject: value of the 'Subject' column to keep (e.g. 'Maths').
    Returns:
        DataFrame with eng->text, q_id->_id, chapter->CHAPTER, Subject->SUBJECT
        renamed and the last two columns dropped.
    """
    frame = pd.read_csv(path)
    # .copy() so the in-place rename below operates on a real frame,
    # not a filtered view (avoids SettingWithCopyWarning)
    frame = frame[frame['Subject'] == subject].copy()
    frame.rename(columns={'eng': 'text', 'q_id': '_id',
                          'chapter': 'CHAPTER', 'Subject': 'SUBJECT'}, inplace=True)
    # NOTE(review): assumes the two trailing columns are always the unwanted ones
    return frame.iloc[:, :-2]
def new_data_merge(path, sheet_num):
    '''
    Load one sheet of the manually-reviewed Excel file and normalise it.

    Keeps only reviewed, problem-free rows, derives a stable `_id` from the
    crop URL when missing, and applies the reviewers' chapter/subject
    corrections in place.

    NOTE(review): depends on a module-level `ref` mapping (raw chapter name ->
    NCERT chapter name) defined elsewhere — a chapter missing from `ref`
    raises KeyError. Column names (including the trailing space in
    'Correct Chapter ') must match the sheet exactly — confirm against the file.
    '''
    bio = pd.read_excel(path, sheet_num)
    bio.rename(columns = {'chapter':'CHAPTER', 'Subject':'SUBJECT','crop_url':'question_url'}, inplace = True)
    if '_id' not in bio.columns:
        # second-to-last URL path segment serves as the question id
        bio['_id'] = bio['question_url'].apply(lambda x: x.split('/')[-2])
    bio.drop_duplicates(subset=['text'],inplace=True)
    bio = bio[~((bio['text'].isna())&(bio['latex'].isna()))] # Drop rows with neither text nor latex
    bio = bio[~bio['Done on'].isna()] # Keep only rows that have been reviewed
    bio = bio[bio['Problems? notPCMB, noText, can\'tPredictChapter'].isna()] # Keep only problem-free rows
    bio['CHAPTER'] = bio['CHAPTER'].apply(lambda x: ref[x]) # Map to NCERT Chapter Names
    for index in bio.index: # Apply reviewer corrections row by row
        if not pd.isna(bio.loc[index,'Correct Chapter ']):
            bio.loc[index,'CHAPTER'] = bio.loc[index,'Correct Chapter ']
        if not pd.isna(bio.loc[index,'Correct Subject']):
            bio.loc[index,'SUBJECT'] = bio.loc[index,'Correct Subject']
    return bio
def process(sentence, use_lemmetization: bool = False, use_stemming: bool = False,
            add_pos: bool = False, remove_length: bool = False):
    """Clean a (possibly MathPix/LaTeX) sentence down to plain lowercase words.

    BUG FIX: the flags were written as `use_lemmetization:False` — that is a
    type ANNOTATION of the literal False, not a default value, so every flag
    had to be passed explicitly. They now default to False (backward compatible).

    Args:
        sentence: raw text; non-strings (e.g. NaN) yield ''.
        use_lemmetization: lemmatize tokens (uses module-level `lemmetizer`).
        use_stemming: stem tokens (uses module-level `stemmer`).
        add_pos: append "_<POS tag>" to each token via nltk.pos_tag.
        remove_length: drop tokens whose length is <= this value
            (False behaves as 0, i.e. keep everything non-empty).
    Returns:
        Cleaned, space-joined string, or '' for non-string/empty input.
    """
    if not isinstance(sentence, str):
        return ''
    a = sentence.lower()
    # collapse newlines into spaces
    a = re.sub(r"\n+", " ", a)
    # drop MathPix math spans \( ... \) but keep the words inside \text{...}
    a = re.sub(r'\s*\\+\((.*?)\\+\)',
               lambda x: " ".join(re.findall(r'\\text\s*{([^{}]*)}', x.group(1))), a)
    # strip any remaining backslash commands (old MathPix API output)
    a = re.sub(r"(\\[^ ]+)", ' ', a)
    a = re.sub(r'[^a-zA-Z]', ' ', a)      # keep letters only
    a = re.sub(r"\s+", " ", a)            # squeeze repeated whitespace
    a = a.strip()
    if not a:
        return ''
    tokens = a.split(' ')
    if use_lemmetization:
        tokens = [lemmetizer.lemmatize(token) for token in tokens]
    if use_stemming:
        tokens = [stemmer.stem(token) for token in tokens]
    if add_pos:
        tokens = [token + "_" + tag for token, tag in nltk.pos_tag(tokens)]
    if remove_length:
        tokens = [t for t in tokens if len(t) > remove_length]
    return ' '.join(tokens)
def convert_numbers(x):
    """Mask digit runs in *x* with '#' placeholders, widest pattern first
    (5+ digits -> '#####', 4 -> '####', 3 -> '###', 2 -> '##')."""
    if not re.search(r'\d', x):
        return x
    for pattern, mask in ((r'[0-9]{5,}', '#####'),
                          (r'[0-9]{4}', '####'),
                          (r'[0-9]{3}', '###'),
                          (r'[0-9]{2}', '##')):
        x = re.sub(pattern, mask, x)
    return x
def convert_latex(input_str: str, to_unicode: bool = True, remove_numbers: bool = True):
    """
    Convert the LaTeX parts of a string to unicode text, e.g. \\frac {3 x } {2 } -> 3 x / 2.
    Non-LaTeX characters are left untouched; newlines survive via a sentinel token.

    Args:
        input_str: input with LaTeX & non-LaTeX characters; non-strings yield ''.
        to_unicode: whether to run the LaTeX -> unicode conversion (pylatexenc).
        remove_numbers: drop digits — they add little signal for classification.
    Returns:
        Converted string with spaces removed and lowercased.
    """
    if not isinstance(input_str, str):
        return ''
    # BUG FIX: was r'\triangle', which the regex engine reads as TAB + 'riangle'
    # (\t is a regex escape even in a raw string); the backslash must be escaped.
    input_str = re.sub(r'\\triangle', chr(9651), input_str)
    input_str = re.sub(r'\\frac', '/', input_str)
    input_str = re.sub(r'\\text\s*{([^{}]*)}', '', input_str)  # drop \text{...} content
    if remove_numbers:
        input_str = re.sub(r'[0-9]', '', input_str)
    if to_unicode:
        # map control characters to their raw escapes so the parser sees them literally
        raw_map = {8: r'\b', 7: r'\a', 12: r'\f', 10: r'\n', 13: r'\r', 9: r'\t', 11: r'\v'}
        input_str = input_str.replace("\n", " !#! ")  # sentinel keeps newlines through the parser
        input_str = LatexNodes2Text().latex_to_text(
            r''.join(i if ord(i) > 32 else raw_map.get(ord(i), i) for i in input_str))
        input_str = input_str.replace(" !#! ", "\n")
    return input_str.replace(' ', '').lower()
def convert_latex(input_str: str, to_unicode: bool = True, remove_numbers: bool = True):
    """Map LaTeX commands in a string to single unicode glyphs, then flatten the
    result (spaces/newlines removed, lowercased).

    Non-string input (pure numbers read as int, or NaN floats) yields ''.
    """
    if not isinstance(input_str, str):
        return ''
    input_str = re.sub(r'\\text\s*{([^{}]*)}', '', input_str)  # drop \text{...} content
    if remove_numbers:
        input_str = re.sub(r'[0-9]', '', input_str)
    if to_unicode:
        # Ordered pairs: longer commands precede their prefixes (\sinh before \sin).
        substitutions = (
            (r'\\bigodot', chr(416)),        # unique O-like glyph
            (r'\\hline', chr(713)),          # horizontal line
            (r'\\overline', chr(727)),
            (r'\\underline', chr(717)),
            (r'\\overbrace', chr(752)),      # upward-pointing mark
            (r'\\underbrace', chr(751)),     # downward-pointing mark
            (r'\\overrightarrow', chr(754)), # rightward-pointing mark
            (r'\\longdiv', chr(10188)),      # long division
            (r'\\jmath', chr(567)),          # dotless j
            (r'\\imath', chr(305)),          # dotless i
            (r'\\sqrt', chr(8730)),
            (r'\\Re', chr(344)),             # R-like symbol
            (r'\\triangle', chr(9651)),
            (r'\\frac', '/'),
            (r'\\widetilde', chr(771)),
            (r'\\widehat', chr(770)),
            (r'\\Varangle', chr(8736)),      # angle symbol
            (r'\\neg', chr(172)),            # negation sign
            (r'\\begin', chr(705)),          # stand-in begin marker
            (r'\\end', chr(704)),            # stand-in end marker
            (r'\\min', chr(707)),
            (r'\\max', chr(706)),
            (r'\\exp', chr(281)),            # exponential
            (r'\\lg', chr(315)),             # binary logarithm
            (r'\\ln', chr(317)),             # natural logarithm
            (r'\\log', chr(319)),            # base-10 logarithm
            (r'\\lim', chr(321)),            # limit
            (r'\\arg', chr(478)),            # stand-in for ARG
            (r'\\S$', chr(167)),             # section sign; $-anchored so \Sigma is untouched
            # trigonometric commands
            (r'\\sinh', chr(525)),
            (r'\\sin', chr(524)),
            (r'\\cosh', chr(527)),
            (r'\\cos', chr(526)),
            (r'\\tanh', chr(555)),
            (r'\\tan', chr(554)),
            (r'\\cot', chr(556)),
            (r'\\sec', chr(557)),
            (r'\\csc', chr(558)),
            (r'\\arcsin', chr(559)),
            (r'\\arccos', chr(560)),
            (r'\\arctan', chr(561)),
        )
        for pattern, glyph in substitutions:
            input_str = re.sub(pattern, glyph, input_str)
        input_str = LatexNodes2Text().latex_to_text(input_str)
    return input_str.replace('\n', '').replace(' ', '').lower()
def top_n(pipeline, x_test, y_test, n = 5):
    """Fraction of samples whose true label is among the model's n highest-probability classes."""
    scores = pipeline.predict_proba(x_test)
    best = np.argsort(scores, axis = 1)[:, -n:]
    hits = [1 if y_test[i] in best[i] else 0 for i in range(len(best))]
    return np.mean(np.array(hits))
def get_coefs(word, *arr):
    """Split one embedding-file row into (word, float32 vector)."""
    return word, np.asarray(arr, dtype='float32')
def load_embedding_dict(path: str, skip_first_line: bool = False):
    '''
    Read an embedding text file into a {word: vector} dictionary.

    args:
        path: location of the embedding file (one "word v1 v2 ..." row per line)
        skip_first_line: word2vec-style files start with a "[words dims]" header
            that must be skipped
    out:
        dict mapping each word to its float32 numpy vector, e.g. {'word': [0.3, 0.1, ...]}
    '''
    with open(path) as fh:
        rows = fh.readlines()
    # bool slices as 0/1, so the header line is skipped exactly when requested
    return dict(get_coefs(*row.split(" ")) for row in rows[skip_first_line:])
def sentence_to_mean_vect(sentence: str, dim: int = 300):
    '''
    Mean embedding vector over the words of a sentence.

    Args:
        sentence: whitespace-separated words.
        dim: dimensionality of the zero vector used for out-of-vocabulary words
            (generalized from the previously hard-coded 300).
    Returns:
        np.ndarray of shape (dim,): element-wise mean of the word vectors.
    '''
    # NOTE(review): relies on a module-level `embeddings_dict` {word: vector};
    # vectors are assumed to have length `dim` — confirm against the loader.
    vectors = [embeddings_dict.get(word) for word in sentence.split(' ')]
    vectors = [v if v is not None else np.zeros(dim) for v in vectors]
    return np.mean(vectors, axis=0)
def cross_entropy(predictions, targets):
    """Mean cross-entropy between predicted probabilities and (one-hot) targets.

    Args:
        predictions: (N, C) array of class probabilities.
        targets: (N, C) array, typically one-hot.
    Returns:
        float: -sum(targets * log(predictions)) / N.
    """
    n_samples = predictions.shape[0]
    # clip away from zero so a zero probability on a target class
    # yields a large finite loss instead of inf/nan from log(0)
    safe_preds = np.clip(predictions, 1e-12, None)
    return -np.sum(targets * np.log(safe_preds)) / n_samples
def get_coefs(word, *arr):
    """Return (word, float32 vector) from a word followed by its coefficient strings."""
    return word, np.asarray(arr, dtype='float32')
# Contraction -> expansion lookup used by replace_contractions().
# NOTE: "'re" deliberately maps to " are" (leading space) because the key
# matches mid-word, right after the pronoun.
mispell_dict = {
    "aren't": "are not", "can't": "cannot", "couldn't": "could not",
    "couldnt": "could not", "didn't": "did not", "doesn't": "does not",
    "doesnt": "does not", "don't": "do not", "hadn't": "had not",
    "hasn't": "has not", "haven't": "have not", "havent": "have not",
    "he'd": "he would", "he'll": "he will", "he's": "he is",
    "i'd": "I would", "i'll": "I will", "i'm": "I am",
    "isn't": "is not", "it's": "it is", "it'll": "it will",
    "i've": "I have", "let's": "let us", "mightn't": "might not",
    "mustn't": "must not", "shan't": "shall not", "she'd": "she would",
    "she'll": "she will", "she's": "she is", "shouldn't": "should not",
    "shouldnt": "should not", "that's": "that is", "thats": "that is",
    "there's": "there is", "theres": "there is", "they'd": "they would",
    "they'll": "they will", "they're": "they are", "theyre": "they are",
    "they've": "they have", "we'd": "we would", "we're": "we are",
    "weren't": "were not", "we've": "we have", "what'll": "what will",
    "what're": "what are", "what's": "what is", "what've": "what have",
    "where's": "where is", "who'd": "who would", "who'll": "who will",
    "who're": "who are", "who's": "who is", "who've": "who have",
    "won't": "will not", "wouldn't": "would not", "you'd": "you would",
    "you'll": "you will", "you're": "you are", "you've": "you have",
    "'re": " are", "wasn't": "was not",
    # BUG FIX: was " will" (the pronoun was dropped); expand fully to "we will"
    "we'll": "we will", "tryin'": "trying",
}
def _get_mispell(mispell_dict):
mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
return mispell_dict, mispell_re
def replace_contractions(text):
    """Expand every known contraction in *text* using the module-level mispell_dict."""
    lookup, pattern = _get_mispell(mispell_dict)
    return pattern.sub(lambda m: lookup[m.group(0)], text)
def insert_space_fun(sentence):
    '''
    Add a space around special characters so "x+y +-=y \\latex" becomes
    "x + y  + - = y  \\latex".

    Characters from several non-Latin scripts (used in fill-in-the-blanks)
    are replaced with a space; a backslash gets a space before it; any other
    non-alphanumeric char (except space, backslash and '.') is padded on both
    sides; everything else passes through unchanged.

    PERF FIX: accumulate into a list and join once instead of quadratic
    string `+=`; compute ord() once per character instead of five times.
    '''
    # codepoint ranges replaced by a space
    non_latin = (range(2304, 2425),    # Devanagari (Hindi)
                 range(2561, 2679),    # Gurmukhi
                 range(1542, 1792),    # Arabic
                 range(19968, 40944),  # CJK (Mandarin)
                 range(2432, 2556))    # Bengali
    parts = []
    for ch in sentence:
        cp = ord(ch)
        if any(cp in block for block in non_latin):
            parts.append(' ')
        elif ch == '\\':
            parts.append(' ' + ch)
        elif (not ch.isalnum()) and (ch not in (' ', '\\', '.')):
            parts.append(' ' + ch + ' ')
        else:
            parts.append(ch)
    return ''.join(parts)
def insert_space_re(sentence):
    # Regex variant of insert_space_fun: pads characters outside
    # [.a-zA-Z space backslash] with spaces via zero-width insertions.
    # Insert a space at any position not already after a space and not
    # followed by an allowed char (letter, dot, space, backslash).
    sentence = re.sub(r'(?<! )(?![.a-zA-Z \\])', ' ', sentence)
    # Insert a space at any non-start position whose previous char is not
    # allowed and that is not already followed by a space.
    sentence = re.sub(r'(?<!^)(?<![.a-zA-Z \\])(?! )', ' ', sentence) # preserves abbreviations but may harm sentence separators
    return sentence
def clean_text_latex(string, remove_stop = False, stop_words = None, remove_single_length = False,
                     remove_special = False, special_replacement = ' SPL ', number_replacement = ' NUM '):
    """Clean mixed plain-text / LaTeX input for both kinds of content.

    Pipeline: normalise control chars -> collapse dotted abbreviations
    (M.I.T -> MIT) -> lowercase -> mask numbers -> expand contractions ->
    strip/replace special characters -> pad specials with spaces ->
    squeeze whitespace -> optional stop-word / single-letter filtering.

    FIX: the number regex was a non-raw string containing the invalid escape
    '\\.' (SyntaxWarning on modern Python); it is now a raw string.
    Removed the block of commented-out dead code.

    Args:
        string: input text; non-strings yield ''.
        remove_stop: drop words found in `stop_words` (must be provided then).
        stop_words: container of stop words.
        remove_single_length: drop single-letter alphabetic tokens.
        remove_special: replace every non-[a-zA-Z\\ ] char with
            `special_replacement`; otherwise keep a whitelist of math symbols.
        special_replacement: token substituted for special characters.
        number_replacement: token substituted for integer/decimal numbers.
    Returns:
        Cleaned string ('' for non-string input).
    """
    if not isinstance(string, str):
        return ''
    # control / zero-width characters become plain spaces
    string = re.sub(r'[\n\r\t\u200b\x96]', ' ', string)
    # collapse dotted abbreviations (M.I.T -> MIT, I.I.T. -> IIT); other '.' -> space
    string = re.sub(r'\b((?:[A-Z]\.)+)\.?|\.',
                    lambda x: x.group(1).replace('.', '') if x.group(1) else ' ', string)
    string = string.lower()
    # integers and decimals -> placeholder token (raw string fixes the '\.' escape)
    string = re.sub(r'\d+(?:\.\d+)?', number_replacement, string)
    # contractions are lower-case keyed, so expand after lower() and before
    # special-character removal (apostrophes are still intact here)
    string = replace_contractions(string)
    if remove_special:
        string = re.sub(r"[^a-zA-Z\\ ]", special_replacement, string)
    else:
        string = re.sub(r"[^a-zA-Z \\.\^+~:/'|%><₹*ા]", ' ', string)
    string = re.sub(r'\\\s+', ' ', string)  # drop dangling backslashes
    string = insert_space_fun(string)
    string = re.sub(r'\s+', ' ', string)    # squeeze repeated whitespace
    if remove_stop or remove_single_length:
        kept = []
        for word in string.split(' '):
            if remove_stop and (word in stop_words):
                continue
            if remove_single_length and (len(word) < 2) and word.isalpha():
                continue
            kept.append(word)
        string = ' '.join(kept)
    return string
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment