Skip to content

Instantly share code, notes, and snippets.

@aurora1625
Last active March 27, 2022 23:26
Show Gist options
  • Save aurora1625/d3de56f1fa00582e3ae355d6ede15b4b to your computer and use it in GitHub Desktop.
Save aurora1625/d3de56f1fa00582e3ae355d6ede15b4b to your computer and use it in GitHub Desktop.
#text preprocessing
import re
from sklearn.feature_extraction import text
# English stop words from scikit-learn, materialised as a list.
stopwords = [*text.ENGLISH_STOP_WORDS]
# Two-character literal escapes (backslash followed by n, r or t) that survive
# in exported text and should be treated as whitespace.
_LITERAL_ESCAPE_RE = re.compile(r"\\[nrt]")
# E-mail addresses, including the obfuscated "name@host dot com" spelling.
_EMAIL_RE = re.compile(
    r"([a-zA-Z0-9!#$%&'*+\/=?^_`{|}~-]+(?:\.[a-zA-Z0-9!#$%&'*+\/=?^_`{|}~-]+)*"
    r"(@)"
    r"(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?(\.|\sdot\s))+"
    r"[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?)"
)
# Seat references such as "SEATS 12AB" / "seat 12A" (case-insensitive).
_SEATS_RE = re.compile(r"SEATS ?\d{1,2}[A-K]{2,3}", re.I)
_SEAT_RE = re.compile(r"SEAT ?\d{1,2}[A-K]", re.I)
# Booking references: exactly six upper-case letters/digits as a whole token.
# The word boundaries stop the pattern from eating the first six characters
# of longer upper-case words (e.g. "SINGAPORE" -> "ORE").
_PNR_RE = re.compile(r"\b[A-Z0-9]{6}\b")
# Flight numbers with the SQ / MI carrier prefixes, e.g. "SQ 321", "MI12".
_FLIGHT_RE = re.compile(r"(?:SQ|MI) ?\d{1,4}")
# Any whitespace-delimited token that contains at least one digit.
_NUMERIC_TOKEN_RE = re.compile(r"\S*\d+\S*")
# Every character that is not a word character (punctuation, whitespace, ...).
_NON_WORD_RE = re.compile(r"\W")


def preprocess(ss: str, stop_words=None) -> str:
    """Normalise free text for downstream vectorisation.

    The text is cleaned in this order:

    1. literal escaped newline / carriage-return / tab sequences become spaces;
    2. e-mail addresses are removed;
    3. seat references are removed (case-insensitive);
    4. six-character upper-case booking references (PNRs) are removed;
    5. SQ / MI flight numbers are removed;
    6. every token containing a digit is removed;
    7. remaining non-word characters become spaces;
    8. the text is lower-cased, whitespace is collapsed and stop words
       are dropped.

    Args:
        ss: The raw text to clean.
        stop_words: Optional iterable of lower-case words to drop. When
            omitted, the module-level ``stopwords`` list is used.

    Returns:
        The cleaned, lower-cased text with tokens separated by single spaces.
    """
    # A set gives O(1) membership tests; it is rebuilt on every call so that
    # later changes to the module-level list are still honoured.
    stop_set = set(stopwords if stop_words is None else stop_words)

    # Must run before punctuation stripping: once the backslash is gone the
    # leftover "n"/"r"/"t" would stay glued to the following word.
    ss = _LITERAL_ESCAPE_RE.sub(" ", ss)
    ss = _EMAIL_RE.sub("", ss)
    ss = _SEATS_RE.sub(" ", ss)
    ss = _SEAT_RE.sub(" ", ss)
    # PNR and flight-number patterns are case-sensitive, so they have to run
    # before the text is lower-cased.
    ss = _PNR_RE.sub(" ", ss)
    ss = _FLIGHT_RE.sub(" ", ss)
    ss = _NUMERIC_TOKEN_RE.sub(" ", ss)
    ss = _NON_WORD_RE.sub(" ", ss)

    # NOTE: lemmatisation was disabled in the original pipeline and is
    # intentionally not performed here.
    tokens = ss.lower().split()
    return " ".join(tok for tok in tokens if tok not in stop_set)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment