Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save madawei2699/03b777db40b5461b05d94bc99e29f878 to your computer and use it in GitHub Desktop.
Save madawei2699/03b777db40b5461b05d94bc99e29f878 to your computer and use it in GitHub Desktop.
TF-IDF计算网页相似度
import gensim
from gensim import corpora,models
from gensim.matutils import corpus2dense
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import datetime
import random
# Base directory for every file this script touches: the inputs
# (urls.txt, text/<name>.html.txt) and the generated CSV / PNG outputs.
root_dir = './mySpider/'
def get_documents(names):
    """Load the text dump of each named page as a list of lowercase tokens.

    Every name is resolved to ``<root_dir>text/<name>.html.txt``.  The file
    content is lowercased and split on whitespace.

    Returns:
        One token list per entry of ``names``, in the same order.
    """
    tokenized = []
    for name in names:
        path = root_dir + 'text/' + name + '.html.txt'
        with open(path, 'r') as handle:
            tokenized.append(handle.read().lower().split())
    return tokenized
def get_urls():
    """Return the page name of every URL listed in ``<root_dir>urls.txt``.

    The file is read one URL per line.  The page name is the second-to-last
    ``/``-separated component of the line, i.e. the last path segment of a
    URL that ends with a trailing slash.

    Blank lines (including a trailing one at the end of the file) are
    skipped; previously they raised ``IndexError``.

    Returns:
        A list of page names in file order.
    """
    page_names = []
    with open(root_dir + 'urls.txt', 'r') as url_file:
        for line in url_file:
            url = line.strip()
            if not url:
                # Nothing to split; ''.split('/')[-2] would raise IndexError.
                continue
            page_names.append(url.split('/')[-2])
    return page_names
def calc_tfidf(texts):
    """Turn tokenized documents into a dense TF-IDF matrix.

    Args:
        texts: list of token lists, one per document.

    Returns:
        An ``np.matrix`` obtained by transposing the dense corpus, so that
        indexing it with a single integer yields one document's weights as
        a 2-D row (the caller relies on that).
    """
    vocabulary = corpora.Dictionary(texts)
    bow_corpus = [vocabulary.doc2bow(tokens) for tokens in texts]
    weighted_corpus = models.TfidfModel(bow_corpus)[bow_corpus]
    # The dense corpus is transposed before being wrapped, so documents end
    # up along the first axis of the result.
    dense = corpus2dense(weighted_corpus, len(vocabulary))
    return np.asmatrix(dense.T)
def get_index(value):
    """Return the index of the first bucket whose upper bound is >= ``value``.

    The bucket upper bounds are 0.1, 0.2, ..., 1.0 (indices 0 through 9).

    Returns:
        The bucket index, or -1 when no bound satisfies ``value <= bound``.
    """
    upper_bounds = [float("%0.2f"%i) for i in np.linspace(0.1,1,10)]
    return next(
        (pos for pos, bound in enumerate(upper_bounds) if value <= bound),
        -1,
    )
def calc_cosine_similarity(tfidf, colums):
    """Compute the pairwise cosine similarity between the rows of ``tfidf``.

    Args:
        tfidf: 2-D array-like (``np.matrix`` or ``ndarray``) with one row
            per document.
        colums: one label per row of ``tfidf``; used as both the index and
            the columns of the returned DataFrame.

    Returns:
        A 4-tuple ``(df, count_list, index_label, tfidf_list)``:

        * ``df`` - square, symmetric DataFrame of similarities with 1 on
          the diagonal.
        * ``count_list`` - for each bucket, the number of distinct document
          pairs whose similarity falls into it.  A pair lands in the first
          bucket ``i`` with ``similarity <= index_label[i]``; pairs above
          the last bound are not counted.
        * ``index_label`` - the bucket upper bounds 0.1, 0.2, ..., 1.0.
        * ``tfidf_list`` - similarity of every distinct pair ``(x, y)`` with
          ``x < y``, ordered by ``x`` and then ``y``.
    """
    index_label = [float("%0.2f" % i) for i in np.linspace(0.1, 1, 10)]

    # Recent scikit-learn releases reject np.matrix input, so hand it a
    # plain ndarray regardless of what the caller passed.
    vectors = np.asarray(tfidf)
    n_docs = len(vectors)

    if n_docs > 1:
        # A single call computes every pair at once; calling it once per
        # pair costs O(n^2) Python-level calls and dominates the runtime.
        full = np.asarray(cosine_similarity(vectors), dtype=float)
    else:
        # With zero or one document there are no pairs to compare.
        full = np.zeros((n_docs, n_docs))

    # Distinct pairs (x < y), in row-major order.
    pair_rows, pair_cols = np.triu_indices(n_docs, k=1)
    pair_sims = full[pair_rows, pair_cols]

    # Mirror the upper triangle onto the lower one so the table is exactly
    # symmetric, and pin the diagonal to 1.
    sim_matrix = np.eye(n_docs)
    sim_matrix[pair_rows, pair_cols] = pair_sims
    sim_matrix[pair_cols, pair_rows] = pair_sims

    # side='left' yields the first bucket i with value <= index_label[i];
    # values above the last bound get index len(index_label) and are dropped.
    n_buckets = len(index_label)
    buckets = np.searchsorted(index_label, pair_sims, side='left')
    count_list = np.bincount(buckets[buckets < n_buckets], minlength=n_buckets).tolist()
    tfidf_list = pair_sims.tolist()

    # Per-row log line kept so the script's console output keeps its format.
    for x, row in enumerate(sim_matrix.tolist()):
        print("time: [" + str(datetime.datetime.now()) + "], x: " + str(x) + ", sim: " + str(row))

    return pd.DataFrame(data=sim_matrix, index=colums, columns=colums), count_list, index_label, tfidf_list
def draw_heatmap(df):
    """Save a heatmap of ``df`` to ``<root_dir>hasoffer_heatmap.png``.

    Only the strict lower triangle is drawn: the diagonal and the upper
    triangle are masked out.

    Args:
        df: square table of pairwise similarities.
    """
    # Mask the diagonal and the upper triangle.  The builtin ``bool`` is used
    # because the ``np.bool`` alias no longer exists in current NumPy.
    mask = np.zeros(np.shape(df), dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    # Set up the matplotlib figure
    fig, ax = plt.subplots(figsize=(30, 30))
    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(df, mask=mask, cmap=cmap, vmax=1, center=0,
                square=True, linewidths=.1, cbar_kws={"shrink": .05}, ax=ax)
    fig.savefig(root_dir + 'hasoffer_heatmap.png')
def draw_barplot(data, columns):
    """Save a bar chart to ``<root_dir>hasoffer_barplot.png``.

    Args:
        data: bar heights, plotted on the "counts" axis.
        columns: bar labels, plotted on the "sim" axis; same length as
            ``data``.
    """
    # The theme has to be active before the axes are created; setting it
    # afterwards leaves this figure unstyled.
    sns.set(style="whitegrid")
    fig, ax = plt.subplots()
    df = pd.DataFrame({"counts": data, "sim": columns})
    sns.barplot(x="sim", y="counts", data=df, ax=ax)
    fig.savefig(root_dir + 'hasoffer_barplot.png')
def draw_histogram(data):
    """Save a density plot of ``data`` to ``<root_dir>hasoffer_histogram.png``.

    Despite the name, this draws a shaded kernel density estimate with a rug
    plot of the individual values, not a binned histogram.

    Args:
        data: sequence of numeric values.
    """
    # The theme has to be active before the axes are created; setting it
    # afterwards leaves this figure unstyled.
    sns.set(color_codes=True)
    fig, ax = plt.subplots()
    # NOTE(review): newer seaborn releases spell ``shade`` as ``fill``; the
    # original keyword is kept so the seaborn version this script was written
    # against keeps working - confirm the target version before switching.
    sns.kdeplot(data, shade=True, cut=0, ax=ax)
    sns.rugplot(data, ax=ax)
    fig.savefig(root_dir + 'hasoffer_histogram.png')
# Pipeline: sample pages -> TF-IDF -> pairwise cosine similarity -> CSV + plots.
available_names = get_urls()
# random.sample raises ValueError when asked for more items than exist, so
# cap the sample at the number of URLs actually available.
names = random.sample(available_names, min(1300, len(available_names)))
df, counts, hist_columns, tfidf_list = calc_cosine_similarity(calc_tfidf(get_documents(names)), names)
df.to_csv(root_dir + 'hasoffer_page_simular.csv')
draw_heatmap(df)
draw_barplot(counts, hist_columns)
draw_histogram(tfidf_list)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment