Skip to content

Instantly share code, notes, and snippets.

@nithyadurai87
Created December 24, 2018 12:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nithyadurai87/491e5e6f9c009ebd88912e71ef9363a4 to your computer and use it in GitHub Desktop.
Save nithyadurai87/491e5e6f9c009ebd88912e71ef9363a4 to your computer and use it in GitHub Desktop.
"""
import nltk
nltk.download()
"""
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
def lemmatize(token, tag):
    """Return the WordNet lemma of ``token`` when ``tag`` marks a noun or verb.

    Args:
        token: The word to lemmatize.
        tag: Part-of-speech tag for ``token``. Only its first character is
            inspected, case-insensitively.

    Returns:
        The lemma produced by ``WordNetLemmatizer`` when the tag starts with
        ``n`` or ``v``; otherwise ``token`` unchanged. An empty tag also
        returns ``token`` unchanged.
    """
    # Only the leading letter of the tag is used, and it is passed straight
    # through as the lemmatizer's part-of-speech argument. Slicing (rather
    # than indexing) keeps an empty tag from raising IndexError.
    pos = tag[:1].lower()
    if pos in ('n', 'v'):
        return WordNetLemmatizer().lemmatize(token, pos)
    return token
# Toy corpus: each string is treated as one document.
corpus = [
    'Bird is a Peacock Bird',
    'Peacock dances very well',
    'It eats variety of seeds',
    'Cumin seed was eaten by it once',
]

# Print the dense token-count matrix twice: first with the default
# vectorizer, then with stop_words='english'.
for vectorizer in (CountVectorizer(), CountVectorizer(stop_words='english')):
    print(vectorizer.fit_transform(corpus).todense())

# Stem a single word.
print(PorterStemmer().stem('seeds'))

# Lemmatize the same word first as a verb, then as a noun.
for pos in ('v', 'n'):
    print(WordNetLemmatizer().lemmatize('gathering', pos))
# Stem every token of every document, keeping one list of stems per document.
stemmer = PorterStemmer()  # one instance reused for all tokens
s_lines = [
    [stemmer.stem(token) for token in word_tokenize(document)]
    for document in corpus
]
print('Stemmed:', s_lines)
# Tag each document's tokens with their part of speech; every entry is the
# list of (token, tag) pairs for one document.
tagged_corpus = [pos_tag(word_tokenize(document)) for document in corpus]

# Lemmatize each token using its tag, keeping one list per document.
l_lines = [
    [lemmatize(token, tag) for token, tag in document]
    for document in tagged_corpus
]
print('Lemmatized:', l_lines)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment