Skip to content

Instantly share code, notes, and snippets.

@amankharwal
Created December 1, 2020 04:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amankharwal/c844508474bec2b0d0724eeb7382bbcf to your computer and use it in GitHub Desktop.
Save amankharwal/c844508474bec2b0d0724eeb7382bbcf to your computer and use it in GitHub Desktop.
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
# Domain-specific words to ignore, in addition to NLTK's English stop words.
# NOTE(review): "six" is absent from the number words below - confirm whether
# that omission is intentional.
new_words = [
    "fig", "figure", "image", "sample", "using",
    "show", "result", "large",
    "also", "one", "two", "three",
    "four", "five", "seven", "eight", "nine",
]
# Merge both sources (de-duplicated through a set) and expose them as a list.
stop_words = list(set(stopwords.words('english')) | set(new_words))
def pre_process(text):
    """Normalise a raw document into a space-separated string of lemmas.

    The text is lowercased, tag-like spans are replaced, runs of digits and
    non-word characters are collapsed to a single space, and the result is
    split on whitespace. Stop words and tokens shorter than three characters
    are dropped, the remaining tokens are lemmatised with WordNet, and any
    lemma that is itself a stop word is dropped as well.

    Args:
        text: Raw document text as a ``str``.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # The module-level ``stop_words`` is a list; a set gives O(1) membership
    # tests instead of a linear scan for every token.
    stop_set = frozenset(stop_words)

    text = text.lower()
    # Replace anything that looks like a markup tag; the " <> " placeholder is
    # itself wiped out by the non-word substitution on the next line.
    text = re.sub(r"</?.*?>", " <> ", text)
    # Collapse digits, punctuation and whitespace into single spaces.
    text = re.sub(r"(\d|\W)+", " ", text)

    # Filter on the surface form first so that stop words such as "does" are
    # removed before the lemmatiser can turn them into non-stop-word lemmas.
    tokens = [
        word for word in text.split()
        if word not in stop_set and len(word) >= 3
    ]

    lemmatizer = WordNetLemmatizer()
    lemmas = (lemmatizer.lemmatize(word) for word in tokens)
    # Filter again after lemmatising: inflected forms such as "figures" or
    # "results" only match the stop list once reduced to their lemma.
    return ' '.join(lemma for lemma in lemmas if lemma not in stop_set)
# Clean every paper's text; pre_process is passed directly as the callable.
docs = df['paper_text'].apply(pre_process)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment