madawei2699/calculates-similarity-distribution-of-website-by-tf-idf.py

## calculates-similarity-distribution-of-website-by-tf-idf.py
import gensim
from gensim import corpora,models
from gensim.matutils import corpus2dense
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import datetime
import random
# Base directory for all file I/O in this script: `urls.txt` and the `text/`
# inputs are read from it, and the CSV/PNG outputs are written into it.
root_dir = './mySpider/'
def get_documents(names):
    """Load the text file for each name and split it into lowercase tokens.

    Each name ``n`` is read from ``root_dir + 'text/' + n + '.html.txt'``.
    Returns one list of lowercased, whitespace-separated words per name,
    in the same order as ``names``.
    """
    tokenized = []
    for name in names:
        path = root_dir + 'text/' + name + '.html.txt'
        with open(path, 'r') as handle:
            raw_text = handle.read()
        tokenized.append(raw_text.lower().split())
    return tokenized
def get_urls():
    """Return the page name taken from every URL listed in ``urls.txt``.

    The file ``root_dir + 'urls.txt'`` is read line by line. For each line
    the second-to-last ``/``-separated segment is returned, which is the
    last path component when the URL ends with a trailing slash.

    Blank lines are skipped: for a blank line ``split('/')`` yields a single
    element, so indexing ``[-2]`` would raise ``IndexError``.
    """
    names = []
    with open(root_dir + 'urls.txt', 'r') as url_file:
        for line in url_file:
            url = line.replace('\n', '')
            if not url.strip():
                continue
            names.append(url.split('/')[-2])
    return names
def calc_tfidf(texts):
    """Build a dense TF-IDF matrix from tokenized documents.

    ``texts`` is a list of token lists. A gensim dictionary and bag-of-words
    corpus are built from it, weighted with a TF-IDF model, and converted to
    a dense array whose transpose is returned as a ``np.matrix``.
    """
    vocab = corpora.Dictionary(texts)
    bow_corpus = [vocab.doc2bow(tokens) for tokens in texts]
    weighted = models.TfidfModel(bow_corpus)[bow_corpus]
    dense = corpus2dense(weighted, len(vocab))
    # NOTE(review): corpus2dense presumably lays documents out as columns,
    # hence the transpose to get one row per document - confirm.
    return np.asmatrix(dense.T)
def get_index(value):
    """Return the index of the similarity bin that ``value`` falls into.

    The bins have upper bounds 0.1, 0.2, ..., 1.0. The result is the index
    (0-9) of the first bound that is greater than or equal to ``value``, so
    anything at or below 0.1 (negatives included) lands in bin 0.

    A value that exceeds 1.0 by no more than a small rounding tolerance is
    placed in the last bin, so a similarity such as 1.0000000000000002 is
    still counted. Returns -1 for larger values and for NaN.
    """
    # Constant bounds: no need to rebuild them with numpy on every call.
    upper_bounds = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)
    # Wide enough to absorb single-precision rounding around 1.0.
    tolerance = 1e-6
    for idx, bound in enumerate(upper_bounds):
        if value <= bound:
            return idx
    if value <= upper_bounds[-1] + tolerance:
        return len(upper_bounds) - 1
    return -1
def calc_cosine_similarity(tfidf, colums):
    """Compute the pairwise cosine similarity of every row of ``tfidf``.

    Each pair above the diagonal is computed once with ``cosine_similarity``
    and mirrored below the diagonal; the diagonal itself is set to 1.

    Returns a 4-tuple:
      * a DataFrame of the full similarity table, labelled on both axes
        with ``colums``;
      * a list of 10 counts, one per bin as assigned by ``get_index``
        (pairs for which it returns -1 are not counted);
      * the list of bin labels 0.1, 0.2, ..., 1.0;
      * the flat list of all above-diagonal similarities.

    A progress line with the timestamp, row number and full row is printed
    after each row.
    """
    bin_labels = [float("%0.2f" % edge) for edge in np.linspace(0.1, 1, 10)]
    bin_counts = [0] * 10
    pair_scores = []
    table = []
    total = len(tfidf)
    for i in range(total):
        # Below the diagonal: reuse the values already computed for row j.
        current = [table[j][i] for j in range(i)]
        current.append(1)
        # Above the diagonal: compute, bin and record each new pair.
        # NOTE(review): assumes tfidf[i] is a 2-D single-row slice (as with
        # np.matrix), which is what cosine_similarity is given here - confirm.
        for j in range(i + 1, total):
            score = cosine_similarity(tfidf[i], tfidf[j])[0][0]
            current.append(score)
            bucket = get_index(score)
            if bucket != -1:
                bin_counts[bucket] += 1
            pair_scores.append(score)
        table.append(current)
        print("time: [" + str(datetime.datetime.now()) + "], x: " + str(i) + ", sim: " + str(table[i]))
    frame = pd.DataFrame(data=table, index=colums, columns=colums)
    return frame, bin_counts, bin_labels, pair_scores
def draw_heatmap(df):
    """Draw the part of ``df`` below the diagonal as a heatmap and save it.

    ``df`` is the square similarity table. The image is written to
    ``root_dir + 'hasoffer_heatmap.png'``.
    """
    # Generate a mask for the upper triangle (diagonal included).
    # The builtin ``bool`` is used because the ``np.bool`` alias was removed
    # in NumPy 1.24 and raises AttributeError there.
    mask = np.zeros_like(df, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    # Set up the matplotlib figure
    fig, ax = plt.subplots(figsize=(30, 30))
    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(df, mask=mask, cmap=cmap, vmax=1, center=0,
                square=True, linewidths=.1, cbar_kws={"shrink": .05}, ax=ax)
    fig.savefig(root_dir + 'hasoffer_heatmap.png')
def draw_barplot(data, columns):
    """Save a bar chart with one bar per similarity bin.

    ``data`` holds the bar heights ("counts") and ``columns`` the matching
    x-axis labels ("sim"). The image is written to
    ``root_dir + 'hasoffer_barplot.png'``.
    """
    # Set the style before creating the axes: seaborn styles are applied
    # when axes are created, so setting it afterwards would not affect
    # this chart.
    sns.set(style="whitegrid")
    fig, ax = plt.subplots()
    frame = pd.DataFrame({"counts": data, "sim": columns})
    sns.barplot(x="sim", y="counts", data=frame, ax=ax)
    fig.savefig(root_dir + 'hasoffer_barplot.png')
def draw_histogram(data):
    """Save a shaded density curve plus rug plot of ``data``.

    The image is written to ``root_dir + 'hasoffer_histogram.png'``.
    """
    out_path = root_dir + 'hasoffer_histogram.png'
    # A new figure is opened before the seaborn settings are changed; this
    # order is kept on purpose so the rendered output stays the same.
    plt.subplots()
    sns.set(color_codes=True)
    sns.kdeplot(data, shade=True, cut=0)
    sns.rugplot(data)
    plt.savefig(out_path)
# Driver: sample pages, compute their pairwise similarity, then write the CSV
# and the three charts.
# The sample size is capped at the number of available names because
# random.sample raises ValueError when asked for more items than exist.
all_names = get_urls()
names = random.sample(all_names, min(1300, len(all_names)))
df, counts, hist_columns, tfidf_list = calc_cosine_similarity(calc_tfidf(get_documents(names)), names)
df.to_csv(root_dir + 'hasoffer_page_simular.csv')
draw_heatmap(df)
draw_barplot(counts, hist_columns)
draw_histogram(tfidf_list)
	import gensim
	from gensim import corpora,models
	from gensim.matutils import corpus2dense
	from sklearn.metrics.pairwise import cosine_similarity
	import pandas as pd
	import numpy as np
	import seaborn as sns
	import matplotlib.pyplot as plt
	import os
	import datetime
	import random
	# Base directory for all file I/O in this script: `urls.txt` and the `text/`
	# inputs are read from it, and the CSV/PNG outputs are written into it.
	root_dir = './mySpider/'
	def get_documents(names):
	    """Load the text file for each name and split it into lowercase tokens.

	    Each name ``n`` is read from ``root_dir + 'text/' + n + '.html.txt'``.
	    Returns one list of lowercased, whitespace-separated words per name,
	    in the same order as ``names``.
	    """
	    tokenized = []
	    for name in names:
	        path = root_dir + 'text/' + name + '.html.txt'
	        with open(path, 'r') as handle:
	            raw_text = handle.read()
	        tokenized.append(raw_text.lower().split())
	    return tokenized
	def get_urls():
	    """Return the page name taken from every URL listed in ``urls.txt``.

	    The file ``root_dir + 'urls.txt'`` is read line by line. For each line
	    the second-to-last ``/``-separated segment is returned, which is the
	    last path component when the URL ends with a trailing slash.

	    Blank lines are skipped: for a blank line ``split('/')`` yields a single
	    element, so indexing ``[-2]`` would raise ``IndexError``.
	    """
	    names = []
	    with open(root_dir + 'urls.txt', 'r') as url_file:
	        for line in url_file:
	            url = line.replace('\n', '')
	            if not url.strip():
	                continue
	            names.append(url.split('/')[-2])
	    return names
	def calc_tfidf(texts):
	    """Build a dense TF-IDF matrix from tokenized documents.

	    ``texts`` is a list of token lists. A gensim dictionary and bag-of-words
	    corpus are built from it, weighted with a TF-IDF model, and converted to
	    a dense array whose transpose is returned as a ``np.matrix``.
	    """
	    vocab = corpora.Dictionary(texts)
	    bow_corpus = [vocab.doc2bow(tokens) for tokens in texts]
	    weighted = models.TfidfModel(bow_corpus)[bow_corpus]
	    dense = corpus2dense(weighted, len(vocab))
	    # NOTE(review): corpus2dense presumably lays documents out as columns,
	    # hence the transpose to get one row per document - confirm.
	    return np.asmatrix(dense.T)
	def get_index(value):
	    """Return the index of the similarity bin that ``value`` falls into.

	    The bins have upper bounds 0.1, 0.2, ..., 1.0. The result is the index
	    (0-9) of the first bound that is greater than or equal to ``value``, so
	    anything at or below 0.1 (negatives included) lands in bin 0.

	    A value that exceeds 1.0 by no more than a small rounding tolerance is
	    placed in the last bin, so a similarity such as 1.0000000000000002 is
	    still counted. Returns -1 for larger values and for NaN.
	    """
	    # Constant bounds: no need to rebuild them with numpy on every call.
	    upper_bounds = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)
	    # Wide enough to absorb single-precision rounding around 1.0.
	    tolerance = 1e-6
	    for idx, bound in enumerate(upper_bounds):
	        if value <= bound:
	            return idx
	    if value <= upper_bounds[-1] + tolerance:
	        return len(upper_bounds) - 1
	    return -1
	def calc_cosine_similarity(tfidf, colums):
	    """Compute the pairwise cosine similarity of every row of ``tfidf``.

	    Each pair above the diagonal is computed once with ``cosine_similarity``
	    and mirrored below the diagonal; the diagonal itself is set to 1.

	    Returns a 4-tuple:
	      * a DataFrame of the full similarity table, labelled on both axes
	        with ``colums``;
	      * a list of 10 counts, one per bin as assigned by ``get_index``
	        (pairs for which it returns -1 are not counted);
	      * the list of bin labels 0.1, 0.2, ..., 1.0;
	      * the flat list of all above-diagonal similarities.

	    A progress line with the timestamp, row number and full row is printed
	    after each row.
	    """
	    bin_labels = [float("%0.2f" % edge) for edge in np.linspace(0.1, 1, 10)]
	    bin_counts = [0] * 10
	    pair_scores = []
	    table = []
	    total = len(tfidf)
	    for i in range(total):
	        # Below the diagonal: reuse the values already computed for row j.
	        current = [table[j][i] for j in range(i)]
	        current.append(1)
	        # Above the diagonal: compute, bin and record each new pair.
	        # NOTE(review): assumes tfidf[i] is a 2-D single-row slice (as with
	        # np.matrix), which is what cosine_similarity is given here - confirm.
	        for j in range(i + 1, total):
	            score = cosine_similarity(tfidf[i], tfidf[j])[0][0]
	            current.append(score)
	            bucket = get_index(score)
	            if bucket != -1:
	                bin_counts[bucket] += 1
	            pair_scores.append(score)
	        table.append(current)
	        print("time: [" + str(datetime.datetime.now()) + "], x: " + str(i) + ", sim: " + str(table[i]))
	    frame = pd.DataFrame(data=table, index=colums, columns=colums)
	    return frame, bin_counts, bin_labels, pair_scores
	def draw_heatmap(df):
	    """Draw the part of ``df`` below the diagonal as a heatmap and save it.

	    ``df`` is the square similarity table. The image is written to
	    ``root_dir + 'hasoffer_heatmap.png'``.
	    """
	    # Generate a mask for the upper triangle (diagonal included).
	    # The builtin ``bool`` is used because the ``np.bool`` alias was removed
	    # in NumPy 1.24 and raises AttributeError there.
	    mask = np.zeros_like(df, dtype=bool)
	    mask[np.triu_indices_from(mask)] = True
	    # Set up the matplotlib figure
	    fig, ax = plt.subplots(figsize=(30, 30))
	    # Generate a custom diverging colormap
	    cmap = sns.diverging_palette(220, 10, as_cmap=True)
	    # Draw the heatmap with the mask and correct aspect ratio
	    sns.heatmap(df, mask=mask, cmap=cmap, vmax=1, center=0,
	                square=True, linewidths=.1, cbar_kws={"shrink": .05}, ax=ax)
	    fig.savefig(root_dir + 'hasoffer_heatmap.png')
	def draw_barplot(data, columns):
	    """Save a bar chart with one bar per similarity bin.

	    ``data`` holds the bar heights ("counts") and ``columns`` the matching
	    x-axis labels ("sim"). The image is written to
	    ``root_dir + 'hasoffer_barplot.png'``.
	    """
	    # Set the style before creating the axes: seaborn styles are applied
	    # when axes are created, so setting it afterwards would not affect
	    # this chart.
	    sns.set(style="whitegrid")
	    fig, ax = plt.subplots()
	    frame = pd.DataFrame({"counts": data, "sim": columns})
	    sns.barplot(x="sim", y="counts", data=frame, ax=ax)
	    fig.savefig(root_dir + 'hasoffer_barplot.png')
	def draw_histogram(data):
	    """Save a shaded density curve plus rug plot of ``data``.

	    The image is written to ``root_dir + 'hasoffer_histogram.png'``.
	    """
	    out_path = root_dir + 'hasoffer_histogram.png'
	    # A new figure is opened before the seaborn settings are changed; this
	    # order is kept on purpose so the rendered output stays the same.
	    plt.subplots()
	    sns.set(color_codes=True)
	    sns.kdeplot(data, shade=True, cut=0)
	    sns.rugplot(data)
	    plt.savefig(out_path)
	# Driver: sample pages, compute their pairwise similarity, then write the CSV
	# and the three charts.
	# The sample size is capped at the number of available names because
	# random.sample raises ValueError when asked for more items than exist.
	all_names = get_urls()
	names = random.sample(all_names, min(1300, len(all_names)))
	df, counts, hist_columns, tfidf_list = calc_cosine_similarity(calc_tfidf(get_documents(names)), names)
	df.to_csv(root_dir + 'hasoffer_page_simular.csv')
	draw_heatmap(df)
	draw_barplot(counts, hist_columns)
	draw_histogram(tfidf_list)