Skip to content

Instantly share code, notes, and snippets.

Created October 16, 2017 05:50
Show Gist options
  • Save shivadvg19/60aad0078c8db13fafa55b7758a266ed to your computer and use it in GitHub Desktop.
Save shivadvg19/60aad0078c8db13fafa55b7758a266ed to your computer and use it in GitHub Desktop.
Python 3 implementation of bag-of-words and TF-IDF models for text processing
''' Feature extraction using bag-of-words and TF-IDF. This code is a Python 3 implementation of the source code included in "Text Analytics with Python". '''
# Toy training corpus: four short documents. Bound to the name CORPUS because
# the rest of the script refers to it by that name.
CORPUS = [
    'the sky is blue',
    'sky is blue and sky is beautiful',
    'the beautiful sky is so blue',
    'i love blue cheese',
]

# A single unseen document, used to show how fitted models transform new text.
new_doc = ['loving this blue sky today']
from sklearn.feature_extraction.text import CountVectorizer
def bow_extractor(corpus, ngram_range=(1, 1)):
    """Fit a bag-of-words model on ``corpus`` and vectorize it.

    Args:
        corpus: Iterable of raw document strings.
        ngram_range: ``(min_n, max_n)`` tuple forwarded to
            ``CountVectorizer``; defaults to ``(1, 1)``.

    Returns:
        A ``(vectorizer, features)`` tuple: the fitted ``CountVectorizer``
        and the matrix returned by its ``fit_transform`` call.
    """
    vectorizer = CountVectorizer(min_df=1, ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features
# Fit the bag-of-words model on the training corpus and print the dense counts.
bow_vectorizer, bow_features = bow_extractor(CORPUS)
# toarray() yields a plain ndarray; todense() would return the legacy
# np.matrix type, which recent scikit-learn releases refuse as estimator input.
features = bow_features.toarray()
print(features)

# Vectorize the unseen document with the vocabulary learned above.
new_doc_features = bow_vectorizer.transform(new_doc).toarray()
print(new_doc_features)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method on older installations. The names are
# materialized as a plain list of str so the printed form stays a list.
feature_names = [
    str(name)
    for name in (
        bow_vectorizer.get_feature_names_out()
        if hasattr(bow_vectorizer, 'get_feature_names_out')
        else bow_vectorizer.get_feature_names()
    )
]
print(feature_names)
# Function to display features as a data frame
import pandas as pd
def display_features(features, feature_names):
    """Print a feature matrix as a table with one labelled column per feature.

    Args:
        features: 2-D array-like of values accepted by ``pandas.DataFrame``.
        feature_names: Column labels, one per column of ``features``.

    Returns:
        None. The table is written to standard output.
    """
    df = pd.DataFrame(data=features, columns=feature_names)
    print(df)
# Show the dense bag-of-words matrix of the training corpus as a table whose
# columns are labelled with the vectorizer's feature names.
display_features(features, feature_names)
from sklearn.feature_extraction.text import TfidfTransformer
def tfidf_transformer(bow_matrix):
    """Fit a TF-IDF transformer on a bag-of-words count matrix.

    Args:
        bow_matrix: Document-term count matrix, such as the one produced by
            ``CountVectorizer.fit_transform``.

    Returns:
        A ``(transformer, tfidf_matrix)`` tuple: the fitted
        ``TfidfTransformer`` and the matrix returned by its ``fit_transform``.
    """
    # NOTE(review): the original call was cut off after norm='l2'. smooth_idf
    # and use_idf are spelled out at scikit-learn's default values, which also
    # match the smoothed-idf, L2-normalized manual computation later in this
    # script -- confirm against the upstream source.
    transformer = TfidfTransformer(norm='l2', smooth_idf=True, use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tfidf_matrix
import numpy as np
#from feature_extractors import tfidf_transformer
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method on older installations.
feature_names = [
    str(name)
    for name in (
        bow_vectorizer.get_feature_names_out()
        if hasattr(bow_vectorizer, 'get_feature_names_out')
        else bow_vectorizer.get_feature_names()
    )
]

# build tfidf transformer and show train corpus tfidf features
tfidf_trans, tdidf_features = tfidf_transformer(bow_features)
features = np.round(tdidf_features.toarray(), 2)
display_features(features, feature_names)

# transform the new document with the idf weights learned from the corpus;
# np.asarray strips any np.matrix wrapper from new_doc_features, because
# recent scikit-learn releases reject np.matrix input.
nd_tfidf = tfidf_trans.transform(np.asarray(new_doc_features))
nd_features = np.round(nd_tfidf.toarray(), 2)
display_features(nd_features, feature_names)
import scipy.sparse as sp
from numpy.linalg import norm
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method on older installations.
feature_names = [
    str(name)
    for name in (
        bow_vectorizer.get_feature_names_out()
        if hasattr(bow_vectorizer, 'get_feature_names_out')
        else bow_vectorizer.get_feature_names()
    )
]

# compute term frequency as a plain float ndarray (no np.matrix intermediate)
tf = bow_features.toarray().astype('float64')
# show term frequencies
display_features(tf, feature_names)

# document frequency per term: the CSC index pointer differences give the
# number of stored entries in each column of the count matrix
df = np.diff(sp.csc_matrix(bow_features, copy=True).indptr)
df = 1 + df  # to smoothen idf later
display_features([df], feature_names)

# compute inverse document frequencies (document count smoothed by one, too)
total_docs = 1 + len(CORPUS)
idf = 1.0 + np.log(float(total_docs) / df)
display_features([np.round(idf, 2)], feature_names)

# compute idf diagonal matrix
total_features = bow_features.shape[1]
idf_diag = sp.spdiags(idf, diags=0, m=total_features, n=total_features)
idf = idf_diag.toarray()
print(np.round(idf, 2))

# compute tfidf feature matrix; "@" is an explicit matrix product, so the
# result no longer depends on idf being an np.matrix (where "*" means matmul)
tfidf = tf @ idf
# show tfidf feature matrix
display_features(np.round(tfidf, 2), feature_names)

# compute L2 norms
norms = norm(tfidf, axis=1)
# print norms for each document
print("L2 Normal form", np.round(norms, 2))

# compute normalized tfidf (each row divided by its own norm)
norm_tfidf = tfidf / norms[:, None]
# show final tfidf feature matrix
display_features(np.round(norm_tfidf, 2), feature_names)

# compute new doc term freqs from bow freqs
nd_tf = np.array(new_doc_features, dtype='float64')
# compute tfidf using idf matrix from train corpus
nd_tfidf = nd_tf @ idf
nd_norms = norm(nd_tfidf, axis=1)
norm_nd_tfidf = nd_tfidf / nd_norms[:, None]
# show new_doc tfidf feature vector
display_features(np.round(norm_nd_tfidf, 2), feature_names)
# Implement a generic function that computes TF-IDF feature vectors directly
# from the raw documents themselves.
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_extractor(corpus, ngram_range=(1, 1)):
    """Fit a TF-IDF vectorizer on raw documents and vectorize them.

    Args:
        corpus: Iterable of raw document strings.
        ngram_range: ``(min_n, max_n)`` tuple forwarded to
            ``TfidfVectorizer``; defaults to ``(1, 1)``.

    Returns:
        A ``(vectorizer, features)`` tuple: the fitted ``TfidfVectorizer``
        and the matrix returned by its ``fit_transform`` call.
    """
    # NOTE(review): the original call was cut off after min_df=1. The
    # remaining options are spelled out at scikit-learn's default values, and
    # ngram_range is forwarded because the parameter was otherwise unused --
    # confirm against the upstream source.
    vectorizer = TfidfVectorizer(
        min_df=1,
        norm='l2',
        smooth_idf=True,
        use_idf=True,
        ngram_range=ngram_range,
    )
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features
# Fit the TF-IDF vectorizer on the raw training corpus, then tabulate the
# feature matrix rounded to two decimals.
tfidf_vectorizer, tdidf_features = tfidf_extractor(CORPUS)
train_dense = tdidf_features.todense()
display_features(np.round(train_dense, 2), feature_names)

# Apply the fitted vectorizer to the unseen document and tabulate its
# rounded feature vector the same way.
nd_tfidf = tfidf_vectorizer.transform(new_doc)
nd_dense = nd_tfidf.todense()
display_features(np.round(nd_dense, 2), feature_names)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment