Skip to content

Instantly share code, notes, and snippets.

@khuyentran1401
Last active January 2, 2020 17:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save khuyentran1401/ecf2a5b2b1e4c3436bb72e6f172ae511 to your computer and use it in GitHub Desktop.
Save khuyentran1401/ecf2a5b2b1e4c3436bb72e6f172ae511 to your computer and use it in GitHub Desktop.
# Requires NLTK. Download only the specific resources this script uses;
# nltk.download() with no arguments launches the interactive GUI downloader,
# which blocks non-interactive runs.
import nltk
nltk.download('punkt')      # tokenizer models (word_tokenize)
nltk.download('stopwords')  # English stopword list
nltk.download('wordnet')    # WordNet lemmatizer dictionary

from nltk.tokenize import RegexpTokenizer

# NOTE(review): `text` is assumed to be defined earlier as the raw corpus
# string — this script never defines it; confirm against the caller.
# Keep only word characters (\w+), dropping punctuation, then rebuild a
# single space-separated string.
tokenizer = RegexpTokenizer(r'\w+')
text = ' '.join(tokenizer.tokenize(text))
# Tokenize words
from nltk.tokenize import word_tokenize

# Split the cleaned text into word tokens and normalize each to lowercase.
tokenized_word = [token.lower() for token in word_tokenize(text)]
# Remove stopwords
from nltk.corpus import stopwords

# Set membership is O(1) per lookup; print shows which words get dropped.
stop_words = set(stopwords.words('english'))
print(stop_words)

# Keep only tokens that are not English stopwords (comprehension replaces
# the original manual append loop).
filtered_word = [word for word in tokenized_word if word not in stop_words]
# Stemming the word
from nltk.stem import PorterStemmer

ps = PorterStemmer()
# Reduce each filtered token to its Porter stem (comprehension replaces
# the original append loop).
stemmed_words = [ps.stem(w) for w in filtered_word]

# See how stemming works on a few sample words
for word in ['thinking', 'felt', 'asked', 'challenging', 'devoted']:
    print(ps.stem(word))
# Try with lemmatization and compare it with stemming
from nltk.stem.wordnet import WordNetLemmatizer

lem = WordNetLemmatizer()
# 'v' = lemmatize each token as a verb, so inflected forms collapse to the
# base lemma (e.g. 'felt' -> 'feel'); comprehension replaces the append loop.
lem_words = [lem.lemmatize(w, 'v') for w in filtered_word]

# Same sample words as the stemming demo above, for side-by-side comparison.
for word in ['thinking', 'felt', 'asked', 'challenging', 'devoted']:
    print(lem.lemmatize(word, 'v'))
from nltk.probability import FreqDist

# Frequency distribution of the lemmatized words. The original ended with a
# bare `most_common` expression, which only displays output in a notebook
# REPL — in a plain script it is a no-op, so print it explicitly.
fdist = FreqDist(lem_words)
most_common = fdist.most_common(20)
print(most_common)
# `pip install matplotlib` is a shell command, not Python — it is a
# SyntaxError here. Run it in a terminal (or `%pip install matplotlib`
# in a notebook) before executing this script.
import matplotlib.pyplot as plt

# Plot the frequency of the 20 most common lemmas.
plt.figure(figsize=(20, 5))
words = [pair[0] for pair in most_common]
counts = [pair[1] for pair in most_common]
plt.plot(words, counts)
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.show()  # original omitted this; needed outside interactive backends
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment