Skip to content

Instantly share code, notes, and snippets.

@egemenzeytinci
Last active November 26, 2019 21:28
Show Gist options
  • Save egemenzeytinci/b6a12bf30e2ea1dbbf036c9d90b666f1 to your computer and use it in GitHub Desktop.
Save egemenzeytinci/b6a12bf30e2ea1dbbf036c9d90b666f1 to your computer and use it in GitHub Desktop.
Preprocessing steps in Python
from nltk.corpus import stopwords
from stemming.porter2 import stem
import nltk
import re
import string
# Download the NLTK resources at import time; nltk.download() is called once
# per resource name.
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)

# English stop-word list used by the preprocessing step below.
default_stopwords = stopwords.words('english')
def preproccessing(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    Steps, in order: strip HTML tags, drop backslashes, quotes and digits,
    lowercase, replace punctuation with spaces, remove English stop words
    and stem the remaining tokens.

    Args:
        text: Input string, possibly containing HTML markup.

    Returns:
        The surviving stems joined by single spaces; an empty string when
        nothing survives.
    """
    # Replace tags with a space (not with nothing) so that text from adjacent
    # elements, e.g. "<li>one</li><li>two</li>", is not fused into one token.
    text = re.sub(r'<.*?>', ' ', text)

    # Remove the characters [\], ['] and ["] as well as every run of digits.
    text = re.sub(r"[\\'\"]|\d+", '', text)

    # Convert text to lowercase.
    text = text.lower()

    # Replace punctuation characters with spaces so they act as separators.
    replace_punctuation = str.maketrans(string.punctuation,
                                        ' ' * len(string.punctuation))
    text = text.translate(replace_punctuation)

    # A set gives constant-time membership tests for the per-token checks.
    stop_words = set(default_stopwords)

    words = []
    # split() with no argument splits on any whitespace run (spaces, tabs,
    # newlines) and never yields empty tokens.
    for token in text.split():
        # Check the raw token first: stemming changes the spelling of many
        # stop words ("because" -> "becaus", "very" -> "veri"), so testing
        # only the stem would let them through.
        if token in stop_words:
            continue
        # Stemming (removing ed, es etc.).
        stemmed = stem(token)
        # Also drop tokens whose stem is a stop word, which keeps every
        # removal the stem-only filter used to perform.
        if stemmed and stemmed not in stop_words:
            words.append(stemmed)

    return ' '.join(words)
def main():
    """Run a small self-check of preproccessing() on an HTML sample.

    Raises:
        AssertionError: If the processed text differs from the expected
            output.
    """
    test = "<html>Added and cutted!!!!!!!!\\\\\'</html>"
    expected = 'ad cut'
    processed = preproccessing(test)
    # Raise explicitly instead of using a bare ``assert``: assert statements
    # are removed under ``python -O``, which would silently skip the check,
    # and this way the failure message shows the actual value.
    if processed != expected:
        raise AssertionError(
            'expected {!r}, got {!r}'.format(expected, processed))
# Run the self-check only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment