Created
October 16, 2017 05:50
-
-
Save shivadvg19/60aad0078c8db13fafa55b7758a266ed to your computer and use it in GitHub Desktop.
Python 3 implementation of bag-of-words and TF-IDF models for text processing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Feature extraction using TF-IDF.

This code is a Python 3 implementation of source code included in
"Text Analytics with Python".
"""

# Small training corpus: one raw text document per list entry.
CORPUS = [
    'the sky is blue',
    'sky is blue and sky is beautiful',
    'the beautiful sky is so blue',
    'i love blue cheese',
]

# A single document that is not part of CORPUS.
new_doc = ['loving this blue sky today']
from sklearn.feature_extraction.text import CountVectorizer | |
def bow_extractor(corpus, ngram_range=(1, 1)):
    """Fit a bag-of-words model on ``corpus`` and vectorize it.

    Args:
        corpus: Iterable of raw text documents.
        ngram_range: ``(min_n, max_n)`` tuple forwarded unchanged to
            ``CountVectorizer``.

    Returns:
        A ``(vectorizer, features)`` tuple: the fitted ``CountVectorizer``
        and the matrix returned by its ``fit_transform`` call.
    """
    vectorizer = CountVectorizer(min_df=1, ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features
# Fit the bag-of-words model on the training corpus and show the counts.
bow_vectorizer, bow_features = bow_extractor(CORPUS)
# toarray() yields a plain ndarray; todense() returns the discouraged
# np.matrix type, which recent scikit-learn releases reject as input.
features = bow_features.toarray()
print(features)

# Vectorize the unseen document with the vocabulary learned from CORPUS.
new_doc_features = bow_vectorizer.transform(new_doc)
new_doc_features = new_doc_features.toarray()
print(new_doc_features)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(bow_vectorizer, 'get_feature_names_out'):
    feature_names = bow_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = bow_vectorizer.get_feature_names()
print(feature_names)
# Function to display features as a data frame | |
import pandas as pd | |
def display_features(features, feature_names):
    """Print ``features`` as a pandas DataFrame labelled by feature name.

    Args:
        features: 2-D data accepted by ``pandas.DataFrame`` (for example a
            list of rows or a NumPy array).
        feature_names: Column labels, one per column of ``features``.

    Returns:
        None. The table is written to standard output.
    """
    df = pd.DataFrame(data=features, columns=feature_names)
    print(df)
# Show the bag-of-words features computed above as a labelled table.
display_features(features, feature_names)
from sklearn.feature_extraction.text import TfidfTransformer | |
def tfidf_transformer(bow_matrix):
    """Fit a TF-IDF transformer on a bag-of-words count matrix.

    Args:
        bow_matrix: Term-count matrix, e.g. the features returned by
            ``CountVectorizer.fit_transform``.

    Returns:
        A ``(transformer, tfidf_matrix)`` tuple: the fitted
        ``TfidfTransformer`` (L2 norm, smoothed IDF) and the matrix returned
        by its ``fit_transform`` call.
    """
    transformer = TfidfTransformer(norm='l2',
                                   smooth_idf=True,
                                   use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tfidf_matrix
import numpy as np | |
#from feature_extractors import tfidf_transformer | |
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(bow_vectorizer, 'get_feature_names_out'):
    feature_names = bow_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = bow_vectorizer.get_feature_names()

# build tfidf transformer and show train corpus tfidf features
tfidf_trans, tdidf_features = tfidf_transformer(bow_features)
features = np.round(tdidf_features.toarray(), 2)
display_features(features, feature_names)

# Transform the new document's counts with the IDF weights learned above.
# np.asarray() drops an np.matrix wrapper if new_doc_features still has one;
# recent scikit-learn releases raise TypeError on np.matrix input.
nd_tfidf = tfidf_trans.transform(np.asarray(new_doc_features))
nd_features = np.round(nd_tfidf.toarray(), 2)
display_features(nd_features, feature_names)
import scipy.sparse as sp | |
from numpy.linalg import norm | |
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(bow_vectorizer, 'get_feature_names_out'):
    feature_names = bow_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = bow_vectorizer.get_feature_names()

# compute term frequency (plain float ndarray rather than np.matrix)
tf = bow_features.toarray().astype('float64')
# show term frequencies
display_features(tf, feature_names)

# document frequency per term: in CSC layout, consecutive indptr differences
# give the number of stored entries in each column
df = np.diff(sp.csc_matrix(bow_features, copy=True).indptr)
df = 1 + df  # to smoothen idf later
display_features([df], feature_names)

# compute inverse document frequencies (document count smoothed by one too)
total_docs = 1 + len(CORPUS)
idf = 1.0 + np.log(float(total_docs) / df)
display_features([np.round(idf, 2)], feature_names)

# compute idf diagonal matrix; from here on `idf` is the dense diagonal form
total_features = bow_features.shape[1]
idf_diag = sp.spdiags(idf, diags=0, m=total_features, n=total_features)
idf = idf_diag.toarray()
print(np.round(idf, 2))

# compute tfidf feature matrix; `@` makes the matrix product explicit instead
# of relying on np.matrix overloading `*`
tfidf = tf @ idf
# show tfidf feature matrix
display_features(np.round(tfidf, 2), feature_names)

# compute L2 norms
norms = norm(tfidf, axis=1)
# print norms for each document
print("L2 Normal form", np.round(norms, 2))

# compute normalized tfidf; an all-zero row keeps its zeros instead of
# turning into NaN through a division by zero
safe_norms = np.where(norms == 0, 1.0, norms)
norm_tfidf = tfidf / safe_norms[:, None]
# show final tfidf feature matrix
display_features(np.round(norm_tfidf, 2), feature_names)

# compute new doc term freqs from bow freqs
nd_tf = np.array(new_doc_features, dtype='float64')
# compute tfidf using idf matrix from train corpus
nd_tfidf = nd_tf @ idf
nd_norms = norm(nd_tfidf, axis=1)
# a document sharing no term with the training vocabulary has a zero norm
safe_nd_norms = np.where(nd_norms == 0, 1.0, nd_norms)
norm_nd_tfidf = nd_tfidf / safe_nd_norms[:, None]
# show new_doc tfidf feature vector
display_features(np.round(norm_nd_tfidf, 2), feature_names)
# Implement a generic function that computes the TF-IDF feature vectors
# directly from the raw documents themselves.
from sklearn.feature_extraction.text import TfidfVectorizer | |
def tfidf_extractor(corpus, ngram_range=(1, 1)):
    """Fit a TF-IDF vectorizer on raw documents and vectorize them.

    Args:
        corpus: Iterable of raw text documents.
        ngram_range: ``(min_n, max_n)`` tuple forwarded unchanged to
            ``TfidfVectorizer``.

    Returns:
        A ``(vectorizer, features)`` tuple: the fitted ``TfidfVectorizer``
        (L2 norm, smoothed IDF) and the matrix returned by its
        ``fit_transform`` call.
    """
    vectorizer = TfidfVectorizer(min_df=1,
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True,
                                 ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features
# build tfidf vectorizer and get training corpus feature vectors
tfidf_vectorizer, tdidf_features = tfidf_extractor(CORPUS)

# Label the columns with this vectorizer's own vocabulary instead of names
# taken from a different vectorizer. get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() when it exists.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
display_features(np.round(tdidf_features.toarray(), 2), tfidf_feature_names)

# get tfidf feature vector for the new document
nd_tfidf = tfidf_vectorizer.transform(new_doc)
display_features(np.round(nd_tfidf.toarray(), 2), tfidf_feature_names)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment