Created
July 29, 2016 15:52
-
-
Save emtwo/89ef2f0c5ff96e25daa21794f898bb46 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import sqlite3 | |
import os | |
import unicodedata | |
import numpy as np | |
from gensim import utils | |
from gensim import models | |
from string import digits | |
from sklearn import metrics | |
from pymining import itemmining | |
import matplotlib.pyplot as plt | |
import matplotlib.dates as mdates | |
from sklearn.externals import joblib | |
from sklearn.cluster import DBSCAN, KMeans | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
KEYWORDS_TO_URLS = {} | |
URLS_TO_KEYWORDS = {} | |
PATH_TO_CORPUS = "corpus" | |
class IterableSentences(object):
    """Lazily stream preprocessed token lists from every file in a directory.

    Each iteration pass re-reads the directory, so the corpus can be
    consumed more than once (gensim's Word2Vec iterates it repeatedly).
    """

    def __init__(self, dirname):
        # Directory whose files make up the corpus, one document per file.
        self.dirname = dirname

    def __iter__(self):
        for name in os.listdir(self.dirname):
            path = os.path.join(self.dirname, name)
            with open(path, "r") as handle:
                text = handle.read()
            yield utils.simple_preprocess(text)
def get_filtered_history_visits():
    """Load browsing history from a local Firefox places.sqlite database.

    Joins visit timestamps with their URL/title and filters out a fixed
    set of high-frequency, low-signal domains.

    Returns:
        numpy array of (time, url, title) rows ordered by visit date,
        where `time` is the raw visit_date divided by 1e9 * 60
        (presumably converting microsecond timestamps to a coarse
        time unit -- divisor kept exactly as in the original).
    """
    # fix: SQL string literals belong in single quotes; SQLite treats
    # double-quoted tokens as identifiers first and only falls back to
    # string semantics as a legacy quirk.
    query = (
        "SELECT DISTINCT visit_date, url, title "
        "FROM moz_historyvisits "
        "LEFT JOIN moz_places "
        "ON moz_historyvisits.place_id = moz_places.id "
        "WHERE url NOT LIKE '%facebook%' "
        "AND url NOT LIKE '%google%' "
        "AND url NOT LIKE '%mozilla%' "
        "AND url NOT LIKE '%amazon%' "
        "AND url NOT LIKE '%messenger%' "
        "AND url NOT LIKE '%.toronto.edu%' "
        "AND url NOT LIKE '%127.0.0.1%' "
        "ORDER BY visit_date")
    conn = sqlite3.connect('places.sqlite')
    try:
        rows = conn.execute(query)
        # The comprehension fully consumes the cursor before the
        # connection is closed in the finally block.
        return np.array([(np.array(row[0] / float(1000000000) / float(60)),
                          row[1], row[2]) for row in rows])
    finally:
        conn.close()  # fix: connection was previously never closed
def _tokenize_text(text):
    """ASCII-fold `text`, strip digits, and split it on URL delimiters.

    Returns a list of tokens, possibly containing empty strings.
    NOTE(review): str.translate(None, digits) is Python 2-only; under
    Python 3 the encode() result is bytes and this call needs porting.
    """
    ascii_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').translate(None, digits)
    # Raw string so the regex escapes are explicit (avoids invalid
    # string-escape warnings for \? \. \+ on newer interpreters).
    return re.split(r'//|/|=&|=|&|\?|\.|\+|-| ', ascii_text)


def get_history_visit_keywords(visit):
    """Extract keyword tokens from one (time, url, title) history visit.

    The URL and, when present, the title are both tokenized with the
    same delimiter set (previously duplicated inline twice).
    """
    keywords = _tokenize_text(visit[1])
    if visit[2] is not None:
        keywords += _tokenize_text(visit[2])
    return ' '.join(keywords).split()  # join+split removes empty strings
def map_keywords_to_url(keywords, url):
    """Record `url` in the global KEYWORDS_TO_URLS index under each keyword.

    Each keyword maps to the set of URLs it appeared in, so repeats of
    the same (keyword, url) pair are deduplicated.
    """
    for keyword in keywords:
        # setdefault replaces the previous membership-check-then-insert pair.
        KEYWORDS_TO_URLS.setdefault(keyword, set()).add(url)
def generate_keyword_url_mappings(history):
    """Populate both global keyword<->URL indexes from `history`.

    history: iterable of (time, url, title) rows, as produced by
    get_filtered_history_visits().
    """
    for visit in history:
        url = visit[1]
        keywords = get_history_visit_keywords(visit)
        map_keywords_to_url(keywords, url)
        # Accumulate every keyword occurrence per URL; duplicates are
        # deliberately kept (TF-IDF counts them later).
        URLS_TO_KEYWORDS.setdefault(url, []).extend(keywords)
def get_or_generate_model(visits):
    """Return a KMeans model over visit timestamps, cached on disk.

    Loads 'datetime_clusted_model.pkl' when present; otherwise fits a
    fresh 2000-cluster model on the visit times and persists it.
    """
    try:
        return joblib.load('datetime_clusted_model.pkl')
    except Exception:  # fix: bare except also swallowed SystemExit/KeyboardInterrupt
        times = np.array([np.array(visit[0]) for visit in visits])
        kmeans = KMeans(init='k-means++', n_clusters=2000, n_init=10)
        # Timestamps are a single feature, so reshape to a (n, 1) matrix.
        kmeans.fit(times.reshape(-1, 1))
        joblib.dump(kmeans, 'datetime_clusted_model.pkl')
        return kmeans
def generate_document_salient_keywords(model, visits):
    """Build (or load cached) TF-IDF over per-time-cluster keyword documents.

    Each cluster label in `model` becomes one document: the keywords of
    every URL visited inside that time cluster, joined by spaces.

    Returns:
        (tf_idf, features): TF-IDF matrix with one row per cluster, and
        the corresponding feature-name list.
    """
    try:
        tf_idf = joblib.load('tf_idf_model.pkl')
        features = joblib.load('features.pkl')
        return tf_idf, features
    except Exception:  # fix: bare except also swallowed SystemExit/KeyboardInterrupt
        cluster_to_keywords = {}
        labels = model.labels_
        for i in xrange(len(set(labels))):
            cluster_mask = np.zeros_like(labels, dtype=bool)
            cluster_mask[labels == i] = True
            # Collect per-visit keyword strings in a list and join once:
            # avoids quadratic string += AND fixes a bug where consecutive
            # visits' keywords were fused together with no separator.
            parts = [' '.join(URLS_TO_KEYWORDS[visit[1]])
                     for visit in visits[cluster_mask]]
            cluster_to_keywords[i] = ' '.join(parts)
        corpus = cluster_to_keywords.values()
        vectorizer = TfidfVectorizer(min_df=1)
        tf_idf = vectorizer.fit_transform(corpus)
        features = vectorizer.get_feature_names()
        joblib.dump(tf_idf, 'tf_idf_model.pkl')
        joblib.dump(features, 'features.pkl')
        return tf_idf, features
def get_top_salient_words_per_document(tf_idf, features):
    """Write each cluster's top-100 TF-IDF terms to a corpus file.

    For cluster i the terms are written space-separated, in descending
    TF-IDF order, to <PATH_TO_CORPUS>/cluster_<i>.  Files that already
    exist are skipped so repeated runs are cheap.
    """
    # Column indices of each row, sorted by descending TF-IDF score.
    indices = np.argsort(-tf_idf.toarray())
    top_n = 100
    for cluster_id, document_indices in enumerate(indices):
        # Consistency: use the module-level corpus path instead of a
        # second hard-coded "corpus" literal.
        file_path = os.path.join(PATH_TO_CORPUS, 'cluster_' + str(cluster_id))
        if os.path.exists(file_path):
            continue
        keywords_string = " ".join([features[i] for i in document_indices[:top_n]])
        # fix: `with` guarantees the file is closed even if write() raises.
        with open(file_path, 'w') as f:
            f.write(keywords_string)
def apply_word2vec(): | |
sentences = IterableSentences(PATH_TO_CORPUS) | |
model = models.Word2Vec(sentences, min_count=10, size=150, sg=0, iter=10, window=10, workers=6) | |
print model.most_similar("hotel") | |
model.save("model") | |
if __name__ == "__main__":
    # Pipeline: load history -> build keyword/URL indexes -> cluster
    # visits by time -> TF-IDF per time cluster -> dump top terms to the
    # corpus directory -> train Word2Vec on that corpus.
    history_visits = get_filtered_history_visits()
    generate_keyword_url_mappings(history_visits)
    print "There are " + str(len(history_visits)) + " history visits."
    print "There are " + str(len(URLS_TO_KEYWORDS)) + " distinct urls."
    # Cluster visit timestamps so temporally close visits form one "document".
    clustering_model = get_or_generate_model(history_visits)
    tf_idf_model, features = generate_document_salient_keywords(clustering_model, history_visits)
    get_top_salient_words_per_document(tf_idf_model, features)
    apply_word2vec()
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.