Skip to content

Instantly share code, notes, and snippets.

@dinob0t
Created August 31, 2016 20:09
Show Gist options
  • Save dinob0t/866e2da26de2583a2eaa83af1f29d836 to your computer and use it in GitHub Desktop.
Save dinob0t/866e2da26de2583a2eaa83af1f29d836 to your computer and use it in GitHub Desktop.
import nltk
from nltk.corpus import stopwords
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
def get_tokens(no_punctuation):
    """Tokenize *no_punctuation* into a list of word tokens.

    The parameter name suggests punctuation is expected to have been
    stripped by the caller already -- confirm at call sites.
    """
    return nltk.word_tokenize(no_punctuation)
def remove_stop_words(tokens):
    """Return *tokens* with English stop words filtered out.

    The stop-word list is materialized into a set exactly once; the
    original called ``stopwords.words('english')`` (a corpus read)
    inside the comprehension condition, re-reading the corpus and
    doing a linear membership scan for every single token.
    """
    stop = set(stopwords.words('english'))
    return [w for w in tokens if w not in stop]
def get_most_common(body, n, remove_stop=True):
    """Return the *n* most frequent tokens of *body* as (token, count) pairs.

    *body* is tokenized with :func:`get_tokens`; when *remove_stop* is
    true, English stop words are dropped before counting.
    """
    words = get_tokens(body)
    if remove_stop:
        words = remove_stop_words(words)
    counts = Counter(words)
    return counts.most_common(n)
# Fit a TF-IDF model over `corpus` and print terms ranked by IDF,
# rarest (most distinctive) terms first.
# NOTE(review): `corpus` is not defined anywhere in this file -- it must
# come from the surrounding session/notebook. Confirm before running.
tfidf = TfidfVectorizer(tokenizer=get_tokens, stop_words='english')
tds = tfidf.fit_transform(corpus)
# Public `idf_` attribute instead of the original's private
# `tfidf._tfidf.idf_` -- both hold the fitted inverse-document-frequency
# vector, but the underscored path is an internal implementation detail.
idf = tfidf.idf_
# Parenthesized print is valid on both Python 2 and Python 3 (a
# single-argument `print(x)` behaves identically on both).
# NOTE(review): `get_feature_names` was removed in scikit-learn 1.2 in
# favour of `get_feature_names_out` -- kept as-is for old versions; verify
# against the installed sklearn.
print(sorted(zip(tfidf.get_feature_names(), idf), key=lambda x: x[1], reverse=True))
from nltk.corpus import stopwords
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
def get_tokens(no_punctuation):
    """Split *no_punctuation* into word tokens via NLTK.

    Caller is presumably expected to have removed punctuation first
    (per the parameter name) -- verify at call sites.
    """
    return nltk.word_tokenize(no_punctuation)
def remove_stop_words(tokens):
    """Filter English stop words out of *tokens* and return the rest.

    Builds the stop-word set once up front; the original evaluated
    ``stopwords.words('english')`` inside the comprehension, re-reading
    the corpus and doing an O(m) list scan per token.
    """
    stop = set(stopwords.words('english'))
    return [w for w in tokens if w not in stop]
def get_most_common(body, n, remove_stop=True):
    """Tokenize *body* and return its *n* most common (token, count) pairs.

    Stop words are excluded from the count unless *remove_stop* is false.
    """
    toks = get_tokens(body)
    return Counter(
        remove_stop_words(toks) if remove_stop else toks
    ).most_common(n)
# Fit TF-IDF over `corpus`, then print every term paired with its IDF,
# sorted so the rarest terms come first.
# NOTE(review): `corpus` is undefined in this file -- it must be supplied
# by the surrounding session/notebook. Confirm before running.
tfidf = TfidfVectorizer(tokenizer=get_tokens, stop_words='english')
tds = tfidf.fit_transform(corpus)
# `idf_` is the supported public accessor for the fitted IDF vector;
# the original reached into the private `tfidf._tfidf.idf_`.
idf = tfidf.idf_
# Parenthesized so the statement works on Python 2 and 3 alike.
# NOTE(review): `get_feature_names` is gone in scikit-learn >= 1.2
# (use `get_feature_names_out`) -- left unchanged for old versions;
# verify against the installed sklearn.
print(sorted(zip(tfidf.get_feature_names(), idf), key=lambda x: x[1], reverse=True))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment