Last active
January 2, 2020 17:16
-
-
Save khuyentran1401/ecf2a5b2b1e4c3436bb72e6f172ae511 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk
from nltk.tokenize import RegexpTokenizer

# Fetch only the NLTK data this script relies on. A bare nltk.download()
# opens the interactive downloader, which stalls non-interactive runs.
# "punkt"/"punkt_tab" back word_tokenize (which one is needed depends on the
# installed NLTK release), "stopwords" backs the stopword list and "wordnet"
# backs the lemmatizer.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(resource, quiet=True)

# NOTE(review): `text` is not defined anywhere in this snippet; it is assumed
# to be a str holding the raw document, set before this point -- confirm.
# Keep only runs of word characters (dropping punctuation), then rebuild a
# single space-separated string for the later tokenization step.
tokenizer = RegexpTokenizer(r'\w+')
text = ' '.join(tokenizer.tokenize(text))
# Tokenize words
from nltk.tokenize import word_tokenize

tokenized_word = word_tokenize(text)

# Lowercase every token so later comparisons and counts are case-insensitive.
tokenized_word = [word.lower() for word in tokenized_word]
# Remove stopwords
from nltk.corpus import stopwords

# A set gives O(1) membership tests in the filter below.
stop_words = set(stopwords.words('english'))
print(stop_words)

# Keep only the tokens that are not English stopwords.
filtered_word = [word for word in tokenized_word if word not in stop_words]
# Stemming the word
from nltk.stem import PorterStemmer

ps = PorterStemmer()
stemmed_words = [ps.stem(w) for w in filtered_word]

# See how stemming works on a few sample words.
for word in ['thinking', 'felt', 'asked', 'challenging', 'devoted']:
    print(ps.stem(word))
# Try with lemmatization and compare it with stemming
from nltk.stem.wordnet import WordNetLemmatizer

lem = WordNetLemmatizer()
# 'v' tells the lemmatizer to treat every token as a verb.
lem_words = [lem.lemmatize(w, 'v') for w in filtered_word]

# Same sample words as the stemming demo, for a side-by-side comparison.
for word in ['thinking', 'felt', 'asked', 'challenging', 'devoted']:
    print(lem.lemmatize(word, 'v'))
from nltk.probability import FreqDist

# Count how often each lemmatized word occurs and keep the 20 most frequent
# as (word, count) pairs.
fdist = FreqDist(lem_words)
most_common = fdist.most_common(20)
# Printed explicitly: a bare expression is only displayed inside a notebook.
print(most_common)
# matplotlib must be installed before running this step. Install it from a
# shell with `pip install matplotlib` (or `%pip install matplotlib` in a
# notebook cell); it is not a Python statement.
import matplotlib.pyplot as plt

# Split the (word, count) pairs into x and y series for plotting.
words = [word for word, _ in most_common]
counts = [count for _, count in most_common]

plt.figure(figsize=(20, 5))
plt.plot(words, counts)
plt.xlabel('Words')
plt.ylabel('Frequency')
# Render the figure when run as a plain script (harmless in a notebook).
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment