Created
October 31, 2020 14:34
-
-
Save elyasha/b40bae96570fc70cdddcde5056333258 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Demo script: run the same sample passage through a Porter stemmer and a
# WordNet lemmatizer, then print both token lists for comparison.

# regex for removing punctuation!
import re
# nltk preprocessing magic
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# grabbing a part of speech function:
from part_of_speech import get_part_of_speech

text = "So many squids are jumping out of suitcases these days that you can barely go anywhere without seeing one burst forth from a tightly packed valise. I went to the dentist the other day, and sure enough I saw an angry one jump out of my dentist's bag within minutes of arriving. She hardly even noticed."

# Collapse every run of non-word characters (punctuation AND whitespace) into
# a single space. The pattern is a raw string so that `\W` reaches the regex
# engine untouched instead of being parsed as an (invalid) string escape.
# Note that apostrophes are non-word characters too, so "dentist's" becomes
# the two pieces "dentist" and "s".
cleaned = re.sub(r'\W+', ' ', text)

# Split the cleaned text into word tokens.
tokenized = word_tokenize(cleaned)

# Stem every token with the Porter stemmer.
stemmer = PorterStemmer()
stemmed = [stemmer.stem(token) for token in tokenized]

## -- CHANGE these -- ##
# Lemmatize every token, passing the value returned by get_part_of_speech as
# the lemmatizer's second (part-of-speech) argument.
# NOTE(review): get_part_of_speech lives in the local part_of_speech module,
# which is not shown here -- presumably it returns a WordNet POS tag for the
# given word; confirm against that module.
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(token, get_part_of_speech(token)) for token in tokenized]

print("Stemmed text:")
print(stemmed)
print("\nLemmatized text:")
print(lemmatized)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment