import re
import os
import sqlite3
import unicodedata
import numpy as np
from string import digits
from gensim import utils
from gensim import models
from sklearn.externals import joblib
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
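
# Keyword <-> URL maps, populated once from the browsing history.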
KEYWORDS_TO_URLS = {}
URLS_TO_KEYWORDS = {}
PATH_TO_CORPUS = "corpus"
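
# Streams the per-cluster keyword files in `dirname` one at a time so
# gensim can re-iterate the corpus without loading it all into memory.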
class IterableSentences(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for filename in os.listdir(self.dirname):
            with open(os.path.join(self.dirname, filename), "r") as f:
                yield utils.simple_preprocess(f.read())
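
# Firefox keeps browsing history in places.sqlite (moz_historyvisits
# joined to moz_places); pull every (visit_date, url, title) row,
# skipping a few high-frequency domains that would dominate the keywords.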
def get_filtered_history_visits():
    conn = sqlite3.connect('places.sqlite')
    c = conn.cursor()
    rows = c.execute(
        "SELECT DISTINCT visit_date, url, title "
        "FROM moz_historyvisits "
        "LEFT JOIN moz_places "
        "ON moz_historyvisits.place_id = moz_places.id "
        "WHERE url NOT LIKE '%facebook%' "
        "AND url NOT LIKE '%google%' "
        "AND url NOT LIKE '%mozilla%' "
        "AND url NOT LIKE '%amazon%' "
        "AND url NOT LIKE '%messenger%' "
        "AND url NOT LIKE '%.toronto.edu%' "
        "AND url NOT LIKE '%127.0.0.1%' "
        "ORDER BY visit_date")
    # visit_date is in microseconds (PRTime); divide it down to a coarse
    # time unit so the 1-D clustering below works on small numbers.
    return np.array([(np.array(row[0] / float(1000000000) / float(60)), row[1], row[2])
                     for row in rows])
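
# Split a visit's URL and title into keyword tokens, normalizing away
# accents and digits first.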
def get_history_visit_keywords(visit):
    url = unicodedata.normalize('NFKD', visit[1]).encode('ascii', 'ignore').translate(None, digits)
    title = visit[2]
    if title is not None:
        title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').translate(None, digits)
    url_keywords = re.split(r'//|/|=&|=|&|\?|\.|\+|-| ', url)
    title_keywords = []
    if title is not None:
        title_keywords = re.split(r'//|/|=&|=|&|\?|\.|\+|-| ', title)
    keywords = url_keywords + title_keywords
    return ' '.join(keywords).split()  # join then split drops empty strings
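
# Record which URLs each keyword appeared in.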
def map_keywords_to_url(keywords, url):
    for keyword in keywords:
        if keyword not in KEYWORDS_TO_URLS:
            KEYWORDS_TO_URLS[keyword] = set()
        KEYWORDS_TO_URLS[keyword].add(url)
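
# Build both global keyword/URL maps from the full visit history.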
def generate_keyword_url_mappings(history):
    for visit in history:
        url = visit[1]
        keywords = get_history_visit_keywords(visit)
        map_keywords_to_url(keywords, url)
        if url not in URLS_TO_KEYWORDS:
            URLS_TO_KEYWORDS[url] = []
        URLS_TO_KEYWORDS[url] += keywords
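
# Load the cached KMeans model if present; otherwise cluster the 1-D
# visit timestamps into 2000 groups, which act as rough browsing sessions.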
def get_or_generate_model(visits):
    try:
        return joblib.load('datetime_clustered_model.pkl')
    except IOError:
        # No cached model yet: fit KMeans on the visit timestamps.
        data = np.array([np.array(visit[0]) for visit in visits])
        kmeans = KMeans(init='k-means++', n_clusters=2000, n_init=10)
        kmeans.fit(data.reshape(-1, 1))
        joblib.dump(kmeans, 'datetime_clustered_model.pkl')
        return kmeans
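
# Treat each timestamp cluster as one document (the concatenated keywords
# of its visits) and compute a tf-idf matrix over those documents.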
def generate_document_salient_keywords(model, visits):
    try:
        tf_idf = joblib.load('tf_idf_model.pkl')
        features = joblib.load('features.pkl')
        return tf_idf, features
    except IOError:
        cluster_to_keywords = {}
        labels = model.labels_
        for i in xrange(len(set(labels))):
            cluster_mask = np.zeros_like(labels, dtype=bool)
            cluster_mask[labels == i] = True
            cluster_to_keywords[i] = ""
            for visit in visits[cluster_mask]:
                # Trailing space keeps keywords from adjacent visits apart.
                cluster_to_keywords[i] += ' '.join(URLS_TO_KEYWORDS[visit[1]]) + ' '
        # Order documents by cluster id so that matrix row i is cluster i.
        corpus = [cluster_to_keywords[i] for i in sorted(cluster_to_keywords)]
        vectorizer = TfidfVectorizer(min_df=1)
        tf_idf = vectorizer.fit_transform(corpus)
        features = vectorizer.get_feature_names()
        joblib.dump(tf_idf, 'tf_idf_model.pkl')
        joblib.dump(features, 'features.pkl')
        return tf_idf, features
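
# Write the 100 highest-scoring tf-idf keywords of each cluster to
# corpus/cluster_<id>, one file per cluster document.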
def get_top_salient_words_per_document(tf_idf, features):
    if not os.path.exists(PATH_TO_CORPUS):
        os.makedirs(PATH_TO_CORPUS)
    # argsort of the negated matrix ranks features by descending tf-idf.
    indices = np.argsort(-tf_idf.toarray())
    top_n = 100
    for cluster_id, document_indices in enumerate(indices):
        file_path = os.path.join(PATH_TO_CORPUS, 'cluster_' + str(cluster_id))
        if os.path.exists(file_path):
            continue
        keywords_string = " ".join([features[i] for i in document_indices[:top_n]])
        with open(file_path, 'w') as f:
            f.write(keywords_string)
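
# Train a CBOW word2vec model (sg=0) over the cluster-keyword corpus.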
def apply_word2vec():
    sentences = IterableSentences(PATH_TO_CORPUS)
    model = models.Word2Vec(sentences, min_count=10, size=150, sg=0, iter=10, window=10, workers=6)
    print model.most_similar("hotel")  # quick similarity spot-check
    model.save("model")
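
# Pipeline: load history -> build keyword maps -> cluster timestamps ->
# tf-idf per cluster -> dump salient keywords -> train word2vec.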
if __name__ == "__main__":
    history_visits = get_filtered_history_visits()
    generate_keyword_url_mappings(history_visits)
    print "There are " + str(len(history_visits)) + " history visits."
    print "There are " + str(len(URLS_TO_KEYWORDS)) + " distinct urls."
    clustering_model = get_or_generate_model(history_visits)
    tf_idf_model, features = generate_document_salient_keywords(clustering_model, history_visits)
    get_top_salient_words_per_document(tf_idf_model, features)
    apply_word2vec()