Skip to content

Instantly share code, notes, and snippets.

@satkr7
Last active December 25, 2021 11:48
Show Gist options
  • Save satkr7/7d66e00bc2db9742a6c77cbf206af3f9 to your computer and use it in GitHub Desktop.
Save satkr7/7d66e00bc2db9742a6c77cbf206af3f9 to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
import os
import gc
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
# Ordered (old, new) literal substitutions applied to the lower-cased text.
# Order matters: a more specific pattern must run before the generic one that
# would otherwise consume part of it (",000,000" before ",000", "won't" and
# "can't" before "n't", "what's"/"it's"/"he's"/"she's" before "'s").
_REPLACEMENTS = (
    (",000,000", "m"),
    (",000", "k"),
    ("′", "'"),
    ("’", "'"),
    ("won't", "will not"),
    ("cannot", "can not"),
    ("can't", "can not"),
    ("n't", " not"),
    ("what's", "what is"),
    ("it's", "it is"),
    ("'ve", " have"),
    ("i'm", "i am"),
    ("'re", " are"),
    ("he's", "he is"),
    ("she's", "she is"),
    ("'s", " own"),
    ("%", " percent "),
    ("₹", " rupee "),
    ("$", " dollar "),
    ("€", " euro "),
    ("'ll", " will"),
)

# Digit runs ending in six / three zeros; group 1 is the leading digits that
# must be kept in front of the "m" / "k" suffix.
_MILLIONS_RE = re.compile(r"([0-9]+)000000")
_THOUSANDS_RE = re.compile(r"([0-9]+)000")

# Any single non-word character (raw string: "\W" is not a valid str escape).
_NON_WORD_RE = re.compile(r"\W")


def preprocess(x):
    """Normalize a piece of text into a lower-cased, punctuation-free string.

    Steps, in order:

    1. Convert ``x`` to ``str`` and lower-case it.
    2. Apply the literal substitutions in ``_REPLACEMENTS`` (comma-grouped
       thousands/millions, typographic apostrophes, English contractions,
       percent and currency symbols).
    3. Abbreviate un-grouped round numbers: ``"5000000"`` -> ``"5m"``,
       ``"5000"`` -> ``"5k"``.
    4. Replace every non-word character with a single space.
    5. Run the NLTK Porter stemmer on the resulting string.
    6. Pass the result through BeautifulSoup (``lxml`` parser) and return
       its text content.

    Args:
        x: Any object; it is converted with ``str()`` first, so non-string
            values (e.g. ``None`` or ``float('nan')``) are processed as
            their string representation.

    Returns:
        The normalized text as a ``str``.
    """
    text = str(x).lower()
    for old, new in _REPLACEMENTS:
        text = text.replace(old, new)

    # Keep the leading digits via the \1 back-reference; without it the
    # number itself is thrown away and only the suffix survives.
    text = _MILLIONS_RE.sub(r"\1m", text)
    text = _THOUSANDS_RE.sub(r"\1k", text)

    text = _NON_WORD_RE.sub(" ", text)

    # NOTE(review): stem() receives the whole string here, not individual
    # tokens. NLTK documents stem() as operating on a single word, so
    # per-token stemming was presumably intended -- confirm before changing,
    # since any downstream features were built on the current output.
    text = PorterStemmer().stem(text)

    # NOTE(review): every non-word character (including '<', '>' and '&')
    # has already been replaced by a space above, so no markup is left for
    # the parser to remove. Kept in place to preserve the existing output;
    # if HTML stripping is actually wanted it has to run before step 4.
    return BeautifulSoup(text, "lxml").get_text()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment