# shivadvg19/Bow_tfidf.py

## Bow_tfidf.py
'''Feature extraction using TF-IDF. This code is a Python 3 implementation of the source code included in "Text Analytics with Python".'''

# Toy training corpus: each string is one document.
CORPUS = [
    "the sky is blue",
    "sky is blue and sky is beautiful",
    "the beautiful sky is so blue",
    "i love blue cheese",
]
# Unseen document, vectorized later with models fitted on CORPUS.
new_doc = ["loving this blue sky today"]

from sklearn.feature_extraction.text import CountVectorizer

def bow_extractor(corpus, ngram_range=(1,1)):
    """Fit a bag-of-words model on ``corpus`` and vectorize it.

    Args:
        corpus: Iterable of raw text documents.
        ngram_range: ``(min_n, max_n)`` bounds forwarded to
            ``CountVectorizer``; the default ``(1, 1)`` uses single words.

    Returns:
        A ``(vectorizer, features)`` tuple: the fitted ``CountVectorizer``
        and the document-term count matrix it produced for ``corpus``.
    """
    bow_model = CountVectorizer(ngram_range=ngram_range, min_df=1)
    term_counts = bow_model.fit_transform(corpus)
    return bow_model, term_counts

# Fit the bag-of-words model on the training corpus and print the dense
# document-term count matrix.
bow_vectorizer, bow_features = bow_extractor(CORPUS)
features = bow_features.todense()
print(features)

# Vectorize the unseen document with the model fitted on CORPUS.
new_doc_features = bow_vectorizer.transform(new_doc)
new_doc_features = new_doc_features.todense()
print(new_doc_features)

# Vocabulary terms in column order. ``get_feature_names`` was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer ``get_feature_names_out``
# and fall back only on older releases. ``list`` keeps the printed output
# in the same list form as before.
if hasattr(bow_vectorizer, 'get_feature_names_out'):
    feature_names = list(bow_vectorizer.get_feature_names_out())
else:
    feature_names = bow_vectorizer.get_feature_names()
print(feature_names)

# Function to display features as a data frame
import pandas as pd
def display_features(features, feature_names):
    """Print a feature matrix as a table with one labelled column per feature.

    Args:
        features: 2-D array-like of feature values (rows of the table).
        feature_names: Column labels, one per feature column.

    Returns:
        None. The table is written to standard output.
    """
    table = pd.DataFrame(features, columns=feature_names)
    print(table)

# Show the training-corpus term counts under their term labels.
display_features(features=features, feature_names=feature_names)

from sklearn.feature_extraction.text import TfidfTransformer
def tfidf_transformer(bow_matrix):
    """Fit a TF-IDF transformer on term counts and transform them.

    Args:
        bow_matrix: Document-term count matrix to fit on and transform.

    Returns:
        A ``(transformer, tfidf_matrix)`` tuple: the fitted
        ``TfidfTransformer`` (L2 norm, smoothed IDF enabled) and the
        TF-IDF weighted matrix computed from ``bow_matrix``.
    """
    tfidf_settings = {'norm': 'l2', 'smooth_idf': True, 'use_idf': True}
    tfidf_model = TfidfTransformer(**tfidf_settings)
    weighted = tfidf_model.fit_transform(bow_matrix)
    return tfidf_model, weighted

import numpy as np
#from feature_extractors import tfidf_transformer
# Vocabulary terms in column order. ``get_feature_names`` was removed in
# scikit-learn 1.2; fall back to it only on releases that predate
# ``get_feature_names_out``.
if hasattr(bow_vectorizer, 'get_feature_names_out'):
    feature_names = list(bow_vectorizer.get_feature_names_out())
else:
    feature_names = bow_vectorizer.get_feature_names()
# build tfidf transformer and show train corpus tfidf features
tfidf_trans, tdidf_features = tfidf_transformer(bow_features)
features = np.round(tdidf_features.todense(), 2)
display_features(features, feature_names)

# new_doc_features is an np.matrix (the result of .todense()); recent
# scikit-learn releases reject np.matrix input with a TypeError, so hand
# the transformer a plain ndarray holding the same counts.
nd_tfidf = tfidf_trans.transform(np.asarray(new_doc_features))
nd_features = np.round(nd_tfidf.todense(), 2)
display_features(nd_features, feature_names)

import scipy.sparse as sp
from numpy.linalg import norm
# Manual TF-IDF derivation for the training corpus, step by step.

# Vocabulary terms in column order. ``get_feature_names`` was removed in
# scikit-learn 1.2; fall back to it only on releases that predate
# ``get_feature_names_out``.
if hasattr(bow_vectorizer, 'get_feature_names_out'):
    feature_names = list(bow_vectorizer.get_feature_names_out())
else:
    feature_names = bow_vectorizer.get_feature_names()

# compute term frequency
tf = bow_features.todense()
tf = np.array(tf, dtype='float64')
# show term frequencies
display_features(tf, feature_names)

# document frequency per term: in CSC format the differences between
# consecutive indptr entries are the number of stored entries per column
df = np.diff(sp.csc_matrix(bow_features, copy=True).indptr)
df = 1 + df # to smoothen idf later

display_features([df], feature_names)


# compute inverse document frequencies
total_docs = 1 + len(CORPUS)  # +1 mirrors the smoothing added to df above
idf = 1.0 + np.log(float(total_docs) / df)

display_features([np.round(idf, 2)], feature_names)

# compute idf diagonal matrix
total_features = bow_features.shape[1]
idf_diag = sp.spdiags(idf, diags=0, m=total_features, n=total_features)
# NOTE: from here on ``idf`` is the dense diagonal np.matrix, not the vector
idf = idf_diag.todense()

print(np.round(idf, 2))

# compute tfidf feature matrix
# ``idf`` is an np.matrix, so ``*`` is a matrix product (each term column of
# ``tf`` is scaled by its idf weight), not an element-wise multiplication
tfidf = tf * idf
# show tfidf feature matrix
display_features(np.round(tfidf, 2), feature_names)

# compute L2 norms
norms = norm(tfidf, axis=1)
# print norms for each document
print("L2 Normal form",np.round(norms, 2))

# compute normalized tfidf
# norms[:, None] turns the 1-D norms into a column so each row is divided
# by its own norm
norm_tfidf = tfidf / norms[:, None]
# show final tfidf feature matrix
display_features(np.round(norm_tfidf, 2), feature_names)

# Term counts of the new document as a float64 ndarray.
nd_tf = np.array(new_doc_features, dtype='float64')

# Weight the counts with the idf matrix built from the training corpus,
# then divide each row by its L2 norm.
nd_tfidf = nd_tf * idf
nd_norms = norm(nd_tfidf, axis=1)
norm_nd_tfidf = nd_tfidf / nd_norms[:, np.newaxis]
# show new_doc tfidf feature vector
display_features(np.round(norm_nd_tfidf, decimals=2), feature_names)

# Implement a generic function that computes TF-IDF feature vectors
# directly from the raw documents themselves.

from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_extractor(corpus, ngram_range=(1,1)):
    """Fit a TF-IDF vectorizer on raw documents and vectorize them.

    Args:
        corpus: Iterable of raw text documents.
        ngram_range: ``(min_n, max_n)`` bounds forwarded to
            ``TfidfVectorizer``; the default ``(1, 1)`` uses single words.

    Returns:
        A ``(vectorizer, features)`` tuple: the fitted ``TfidfVectorizer``
        (L2 norm, smoothed IDF enabled) and the TF-IDF matrix it produced
        for ``corpus``.
    """
    tfidf_model = TfidfVectorizer(
        ngram_range=ngram_range,
        min_df=1,
        norm='l2',
        smooth_idf=True,
        use_idf=True,
    )
    weighted = tfidf_model.fit_transform(corpus)
    return tfidf_model, weighted

# Fit the TF-IDF vectorizer on the raw training corpus and display the
# resulting feature vectors, rounded to two decimals.
tfidf_vectorizer, tdidf_features = tfidf_extractor(corpus=CORPUS)
display_features(np.round(tdidf_features.todense(), decimals=2), feature_names)

# Transform the unseen document with the fitted vectorizer and display it.
nd_tfidf = tfidf_vectorizer.transform(new_doc)
display_features(np.round(nd_tfidf.todense(), decimals=2), feature_names)
	'''Feature extraction using TF-IDF. This code is a Python 3 implementation of the source code included in "Text Analytics with Python".'''

	# Toy training corpus: each string is one document.
	CORPUS = [
		"the sky is blue",
		"sky is blue and sky is beautiful",
		"the beautiful sky is so blue",
		"i love blue cheese",
	]
	# Unseen document, vectorized later with models fitted on CORPUS.
	new_doc = ["loving this blue sky today"]

	from sklearn.feature_extraction.text import CountVectorizer

	def bow_extractor(corpus, ngram_range=(1,1)):
		"""Fit a bag-of-words model on ``corpus`` and vectorize it.

		Args:
			corpus: Iterable of raw text documents.
			ngram_range: ``(min_n, max_n)`` bounds forwarded to
				``CountVectorizer``; the default ``(1, 1)`` uses single words.

		Returns:
			A ``(vectorizer, features)`` tuple: the fitted ``CountVectorizer``
			and the document-term count matrix it produced for ``corpus``.
		"""
		# The body must be indented under the def; it had lost its indentation.
		vectorizer = CountVectorizer(min_df=1, ngram_range=ngram_range)
		features = vectorizer.fit_transform(corpus)
		return vectorizer, features

	# Fit the bag-of-words model on the training corpus and print the dense
	# document-term count matrix.
	bow_vectorizer, bow_features = bow_extractor(CORPUS)
	features = bow_features.todense()
	print(features)

	# Vectorize the unseen document with the model fitted on CORPUS.
	new_doc_features = bow_vectorizer.transform(new_doc)
	new_doc_features = new_doc_features.todense()
	print(new_doc_features)

	# Vocabulary terms in column order. ``get_feature_names`` was deprecated in
	# scikit-learn 1.0 and removed in 1.2, so prefer ``get_feature_names_out``
	# and fall back only on older releases. ``list`` keeps the printed output
	# in the same list form as before.
	if hasattr(bow_vectorizer, 'get_feature_names_out'):
		feature_names = list(bow_vectorizer.get_feature_names_out())
	else:
		feature_names = bow_vectorizer.get_feature_names()
	print(feature_names)

	# Function to display features as a data frame
	import pandas as pd
	def display_features(features, feature_names):
		"""Print a feature matrix as a table with one labelled column per feature.

		Args:
			features: 2-D array-like of feature values (rows of the table).
			feature_names: Column labels, one per feature column.

		Returns:
			None. The table is written to standard output.
		"""
		# The body must be indented under the def; it had lost its indentation.
		df = pd.DataFrame(data=features,
						  columns=feature_names)
		print(df)

	# Show the training-corpus term counts under their term labels.
	display_features(features=features, feature_names=feature_names)

	from sklearn.feature_extraction.text import TfidfTransformer
	def tfidf_transformer(bow_matrix):
		"""Fit a TF-IDF transformer on term counts and transform them.

		Args:
			bow_matrix: Document-term count matrix to fit on and transform.

		Returns:
			A ``(transformer, tfidf_matrix)`` tuple: the fitted
			``TfidfTransformer`` (L2 norm, smoothed IDF enabled) and the
			TF-IDF weighted matrix computed from ``bow_matrix``.
		"""
		# The body must be indented under the def; it had lost its indentation.
		transformer = TfidfTransformer(norm='l2',
									   smooth_idf=True,
									   use_idf=True)
		tfidf_matrix = transformer.fit_transform(bow_matrix)
		return transformer, tfidf_matrix

	import numpy as np
	#from feature_extractors import tfidf_transformer
	# Vocabulary terms in column order. ``get_feature_names`` was removed in
	# scikit-learn 1.2; fall back to it only on releases that predate
	# ``get_feature_names_out``.
	if hasattr(bow_vectorizer, 'get_feature_names_out'):
		feature_names = list(bow_vectorizer.get_feature_names_out())
	else:
		feature_names = bow_vectorizer.get_feature_names()
	# build tfidf transformer and show train corpus tfidf features
	tfidf_trans, tdidf_features = tfidf_transformer(bow_features)
	features = np.round(tdidf_features.todense(), 2)
	display_features(features, feature_names)

	# new_doc_features is an np.matrix (the result of .todense()); recent
	# scikit-learn releases reject np.matrix input with a TypeError, so hand
	# the transformer a plain ndarray holding the same counts.
	nd_tfidf = tfidf_trans.transform(np.asarray(new_doc_features))
	nd_features = np.round(nd_tfidf.todense(), 2)
	display_features(nd_features, feature_names)

	import scipy.sparse as sp
	from numpy.linalg import norm
	# Manual TF-IDF derivation for the training corpus, step by step.

	# Vocabulary terms in column order. ``get_feature_names`` was removed in
	# scikit-learn 1.2; fall back to it only on releases that predate
	# ``get_feature_names_out``.
	if hasattr(bow_vectorizer, 'get_feature_names_out'):
		feature_names = list(bow_vectorizer.get_feature_names_out())
	else:
		feature_names = bow_vectorizer.get_feature_names()

	# compute term frequency
	tf = bow_features.todense()
	tf = np.array(tf, dtype='float64')
	# show term frequencies
	display_features(tf, feature_names)

	# document frequency per term: in CSC format the differences between
	# consecutive indptr entries are the number of stored entries per column
	df = np.diff(sp.csc_matrix(bow_features, copy=True).indptr)
	df = 1 + df # to smoothen idf later

	display_features([df], feature_names)


	# compute inverse document frequencies
	total_docs = 1 + len(CORPUS)  # +1 mirrors the smoothing added to df above
	idf = 1.0 + np.log(float(total_docs) / df)

	display_features([np.round(idf, 2)], feature_names)

	# compute idf diagonal matrix
	total_features = bow_features.shape[1]
	idf_diag = sp.spdiags(idf, diags=0, m=total_features, n=total_features)
	# NOTE: from here on ``idf`` is the dense diagonal np.matrix, not the vector
	idf = idf_diag.todense()

	print(np.round(idf, 2))

	# compute tfidf feature matrix
	# ``idf`` is an np.matrix, so ``*`` is a matrix product (each term column of
	# ``tf`` is scaled by its idf weight), not an element-wise multiplication
	tfidf = tf * idf
	# show tfidf feature matrix
	display_features(np.round(tfidf, 2), feature_names)

	# compute L2 norms
	norms = norm(tfidf, axis=1)
	# print norms for each document
	print("L2 Normal form",np.round(norms, 2))

	# compute normalized tfidf
	# norms[:, None] turns the 1-D norms into a column so each row is divided
	# by its own norm
	norm_tfidf = tfidf / norms[:, None]
	# show final tfidf feature matrix
	display_features(np.round(norm_tfidf, 2), feature_names)

	# Term counts of the new document as a float64 ndarray.
	nd_tf = np.array(new_doc_features, dtype='float64')

	# Weight the counts with the idf matrix built from the training corpus,
	# then divide each row by its L2 norm.
	nd_tfidf = nd_tf * idf
	nd_norms = norm(nd_tfidf, axis=1)
	norm_nd_tfidf = nd_tfidf / nd_norms[:, np.newaxis]
	# show new_doc tfidf feature vector
	display_features(np.round(norm_nd_tfidf, decimals=2), feature_names)

	# Implement a generic function that computes TF-IDF feature vectors
	# directly from the raw documents themselves.

	from sklearn.feature_extraction.text import TfidfVectorizer
	def tfidf_extractor(corpus, ngram_range=(1,1)):
		"""Fit a TF-IDF vectorizer on raw documents and vectorize them.

		Args:
			corpus: Iterable of raw text documents.
			ngram_range: ``(min_n, max_n)`` bounds forwarded to
				``TfidfVectorizer``; the default ``(1, 1)`` uses single words.

		Returns:
			A ``(vectorizer, features)`` tuple: the fitted ``TfidfVectorizer``
			(L2 norm, smoothed IDF enabled) and the TF-IDF matrix it produced
			for ``corpus``.
		"""
		# The body must be indented under the def; it had lost its indentation.
		vectorizer = TfidfVectorizer(min_df=1,
									 norm='l2',
									 smooth_idf=True,
									 use_idf=True,
									 ngram_range=ngram_range)
		features = vectorizer.fit_transform(corpus)
		return vectorizer, features

	# Fit the TF-IDF vectorizer on the raw training corpus and display the
	# resulting feature vectors, rounded to two decimals.
	tfidf_vectorizer, tdidf_features = tfidf_extractor(corpus=CORPUS)
	display_features(np.round(tdidf_features.todense(), decimals=2), feature_names)

	# Transform the unseen document with the fitted vectorizer and display it.
	nd_tfidf = tfidf_vectorizer.transform(new_doc)
	display_features(np.round(nd_tfidf.todense(), decimals=2), feature_names)