@shivadvg19
Created October 16, 2017 05:50
Python 3 implementation of Bag of Words and TF-IDF models for text processing
'''Feature extraction using Bag of Words and TF-IDF.
Python 3 implementation of source code included in "Text Analytics with Python".'''
CORPUS = [
    'the sky is blue',
    'sky is blue and sky is beautiful',
    'the beautiful sky is so blue',
    'i love blue cheese'
]
new_doc = ['loving this blue sky today']
from sklearn.feature_extraction.text import CountVectorizer
def bow_extractor(corpus, ngram_range=(1, 1)):
    vectorizer = CountVectorizer(min_df=1, ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features
bow_vectorizer, bow_features = bow_extractor(CORPUS)
features = bow_features.todense()
print(features)
new_doc_features = bow_vectorizer.transform(new_doc)
new_doc_features = new_doc_features.todense()
print(new_doc_features)
# in newer scikit-learn releases, get_feature_names() was replaced by get_feature_names_out()
feature_names = bow_vectorizer.get_feature_names()
print(feature_names)
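# Note: with CountVectorizer's default token pattern (lowercased tokens of two or
# more word characters), the single-character token 'i' is dropped, so the learned
# vocabulary for this corpus should be:
# ['and', 'beautiful', 'blue', 'cheese', 'is', 'love', 'sky', 'so', 'the']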
# Function to display features as a data frame
import pandas as pd
def display_features(features, feature_names):
    df = pd.DataFrame(data=features, columns=feature_names)
    print(df)
display_features(features, feature_names)
from sklearn.feature_extraction.text import TfidfTransformer
def tfidf_transformer(bow_matrix):
    transformer = TfidfTransformer(norm='l2',
                                   smooth_idf=True,
                                   use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tfidf_matrix
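# With use_idf=True and smooth_idf=True, TfidfTransformer computes
#   idf(t) = ln((1 + n_docs) / (1 + df(t))) + 1
# multiplies each term frequency by it, and (norm='l2') scales every row to unit
# Euclidean length. The manual computation further below mirrors these same steps.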
import numpy as np
feature_names = bow_vectorizer.get_feature_names()
# build tfidf transformer and show train corpus tfidf features
tfidf_trans, tfidf_features = tfidf_transformer(bow_features)
features = np.round(tfidf_features.todense(), 2)
display_features(features, feature_names)
nd_tfidf = tfidf_trans.transform(new_doc_features)
nd_features = np.round(nd_tfidf.todense(), 2)
display_features(nd_features, feature_names)
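# Terms of the new document that are not in the training vocabulary ('loving',
# 'this', 'today') are simply ignored by transform(), so only the 'blue' and
# 'sky' columns of the new-document vector are non-zero.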
import scipy.sparse as sp
from numpy.linalg import norm
feature_names = bow_vectorizer.get_feature_names()
# compute term frequency
tf = bow_features.todense()
tf = np.array(tf, dtype='float64')
# show term frequencies
display_features(tf, feature_names)
# document frequencies: the diff of the CSC matrix's column index pointer gives
# the number of non-zero entries (i.e. documents) per term column
df = np.diff(sp.csc_matrix(bow_features, copy=True).indptr)
df = 1 + df  # add 1 to smooth the idf computed below
display_features([df], feature_names)
# compute inverse document frequencies
total_docs = 1 + len(CORPUS)
idf = 1.0 + np.log(float(total_docs) / df)
display_features([np.round(idf, 2)], feature_names)
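# Worked example of the smoothed idf values above (total_docs = 5):
#   'blue'   appears in all 4 documents -> df = 4 + 1 = 5 -> idf = 1 + ln(5/5) = 1.00
#   'sky'    appears in 3 documents     -> df = 3 + 1 = 4 -> idf = 1 + ln(5/4) ≈ 1.22
#   'cheese' appears in 1 document      -> df = 1 + 1 = 2 -> idf = 1 + ln(5/2) ≈ 1.92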
# compute idf diagonal matrix
total_features = bow_features.shape[1]
idf_diag = sp.spdiags(idf, diags=0, m=total_features, n=total_features)
idf = idf_diag.todense()
print(np.round(idf, 2))
# compute tfidf feature matrix: since idf is a dense diagonal matrix, * performs
# a matrix product that scales every term column of tf by that term's idf
tfidf = tf * idf
# show tfidf feature matrix
display_features(np.round(tfidf, 2), feature_names)
# compute L2 norms
norms = norm(tfidf, axis=1)
# print norms for each document
print("L2 Normal form",np.round(norms, 2))
# compute normalized tfidf
norm_tfidf = tfidf / norms[:, None]
# show final tfidf feature matrix
display_features(np.round(norm_tfidf, 2), feature_names)
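# Because the same smoothed idf formula and l2 normalization are used, this
# manually computed matrix should match (up to rounding) the TfidfTransformer
# output displayed earlier.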
# compute new doc term freqs from bow freqs
nd_tf = new_doc_features
nd_tf = np.array(nd_tf, dtype='float64')
# compute tfidf using idf matrix from train corpus
nd_tfidf = nd_tf*idf
nd_norms = norm(nd_tfidf, axis=1)
norm_nd_tfidf = nd_tfidf / nd_norms[:, None]
# show new_doc tfidf feature vector
display_features(np.round(norm_nd_tfidf, 2), feature_names)
# A generic function that computes the tfidf-based feature vectors directly
# from the raw documents themselves.
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_extractor(corpus, ngram_range=(1, 1)):
    vectorizer = TfidfVectorizer(min_df=1,
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True,
                                 ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features
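# TfidfVectorizer is equivalent to running CountVectorizer followed by
# TfidfTransformer, so it learns the same vocabulary as bow_vectorizer here and
# feature_names can be reused below when displaying its output.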
# build tfidf vectorizer and get training corpus feature vectors
tfidf_vectorizer, tfidf_features = tfidf_extractor(CORPUS)
display_features(np.round(tfidf_features.todense(), 2), feature_names)
# get tfidf feature vector for the new document
nd_tfidf = tfidf_vectorizer.transform(new_doc)
display_features(np.round(nd_tfidf.todense(), 2), feature_names)
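# Optional sanity check (a minimal sketch, assuming bow_vectorizer and
# tfidf_vectorizer learn identical vocabularies, as they do for this corpus):
# the hand-rolled vector for the new document should match the TfidfVectorizer output.
print(np.allclose(norm_nd_tfidf, nd_tfidf.todense()))  # expected: True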