Last active
November 26, 2019 21:28
-
-
Save egemenzeytinci/b6a12bf30e2ea1dbbf036c9d90b666f1 to your computer and use it in GitHub Desktop.
Preprocessing steps in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.corpus import stopwords | |
from stemming.porter2 import stem | |
import nltk | |
import re | |
import string | |
# Fetch the NLTK resources this script asks for (a no-op when they are
# already present locally), then load the English stop-word list once.
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)

default_stopwords = stopwords.words('english')
def preproccessing(text: str) -> str:
    """Normalize raw text into a space-separated string of stemmed tokens.

    Steps, in order:
      1. strip HTML-like tags (anything between ``<`` and the next ``>``);
      2. delete backslashes, single quotes, double quotes and digits;
      3. trim surrounding whitespace and lowercase;
      4. replace every ``string.punctuation`` character with a space;
      5. split on any whitespace, drop stop words, stem what is left, and
         drop tokens whose stem is itself a stop word.

    Args:
        text: The raw input string.

    Returns:
        The surviving stems joined by single spaces; an empty string when
        nothing survives.
    """
    # remove html tags
    text = re.sub(r'<.*?>', '', text)
    # remove the characters [\], ['], ["] and all digits; these are plain
    # deletions, so one character class is equivalent to separate passes
    text = re.sub(r"[\\'\"\d]", "", text)
    # convert text to lowercase
    text = text.strip().lower()
    # replace punctuation characters with spaces
    replace_punctuation = str.maketrans(string.punctuation,
                                        ' ' * len(string.punctuation))
    text = text.translate(replace_punctuation)

    # Set lookup instead of scanning the stop-word list once per token.
    stop_words = frozenset(default_stopwords)

    words = []
    # split() with no argument splits on tabs/newlines as well as spaces
    # and never yields empty tokens.
    for token in text.split():
        # Filter on the surface form first: stemming rewrites some stop
        # words (e.g. "very" -> "veri", "because" -> "becaus"), which
        # would otherwise slip past a stem-only stop-word check.
        if token in stop_words:
            continue
        # stemming (removing ed, es etc.)
        stemmed = stem(token)
        # Also drop tokens that only become stop words once stemmed.
        if stemmed and stemmed not in stop_words:
            words.append(stemmed)
    return ' '.join(words)
def main():
    """Smoke-test ``preproccessing`` on one small HTML-wrapped sample."""
    sample = "<html>Added and cutted!!!!!!!!\\\\\'</html>"
    result = preproccessing(sample)
    assert result == 'ad cut'


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment