Created
July 20, 2020 01:13
-
-
Save mooreniemi/3091d78b6620bb354c4b68c00305b85b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is a version of 'Learning from Clicks' in 'Programming Collective | |
# Intelligence' p74, a pdf is online at the below url | |
# https://github.com/ShawnLeee/the-book/blob/master/pybooks/Programming%20Collective%20Intelligence.pdf | |
# | |
# This is a fully connected network stored in sqlite, so, yeah, not for Production. :) | |
from math import tanh | |
from timeit import default_timer as timer | |
import pandas as pd | |
import numpy as np | |
import sys | |
import zlib | |
import time | |
import sqlite3 | |
def dtanh(y):
    """Derivative of tanh expressed in terms of the activation y = tanh(x)."""
    return 1.0 - y ** 2
def url_to_id(word):
    """Map a url string to its numeric id.

    Currently just an alias for word_to_id(); kept as a separate entry
    point so the two id spaces could diverge later.
    """
    return word_to_id(word)
def word_to_id(word):
    """Hash a word to a stable 32-bit int id (adler32 of its utf-8 bytes).

    Always returns an int. The previous version caught every Exception,
    printed it, and implicitly returned None, which then produced invalid
    ids (and broken SQL) downstream. Encoding a str can only fail with
    UnicodeEncodeError (e.g. a lone surrogate); for that case we fall back
    to a lossy 'replace' encoding so callers still get a usable id.
    """
    word = str(word)  # force it
    try:
        return zlib.adler32(word.encode("utf-8"))
    except UnicodeEncodeError as e:
        print(f"{word} threw {e}")
        return zlib.adler32(word.encode("utf-8", errors="replace"))
def word_ids_keyer(word_ids):
    """Build a canonical, order-insensitive string key for a set of word ids.

    Ids are stringified and sorted lexicographically, then joined with '_',
    so the same combination of words always yields the same key.
    """
    parts = sorted(str(word_id) for word_id in word_ids)
    return "_".join(parts)
def setup():
    """Create the ctr.db schema if it doesn't exist yet (idempotent).

    Fixes over the original: the connection is closed via try/finally even
    when an execute() raises, and the DDL is committed explicitly (the
    sqlite3 module's implicit-transaction handling around DDL has varied
    across Python versions, so don't rely on it).
    """
    conn = sqlite3.connect("ctr.db")
    try:
        res = conn.execute(
            "SELECT count(*) FROM sqlite_master WHERE type='table' AND name='hidden_node'"
        ).fetchone()
        if res[0] == 0:
            print("creating tables...")
            # for speed... https://blog.devart.com/increasing-sqlite-performance.html
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute("PRAGMA locking_mode=EXCLUSIVE")
            conn.execute("create table hidden_node(create_key)")
            conn.execute("create index hidden_create_keys on hidden_node (create_key)")
            conn.execute("create table word_to_hidden(from_id, to_id, strength)")
            conn.execute("create index w2h_from_to_ids on word_to_hidden (from_id, to_id)")
            conn.execute("create table hidden_to_url(from_id, to_id, strength)")
            conn.execute("create index h2u_from_to_ids on hidden_to_url (from_id, to_id)")
            conn.commit()
        else:
            print("found tables...")
    finally:
        conn.close()
class click_net:
    """Fully connected 'Learning from Clicks' network (PCI p74) persisted
    in sqlite ('ctr.db').

    Layers: query word ids -> hidden nodes -> url ids. Edge weights live
    in the word_to_hidden and hidden_to_url tables created by the
    module-level setup().

    Changes from the original: all queries are parameterized instead of
    f-string interpolated, set_strength validates with ValueError instead
    of assert (asserts are stripped under -O), feed_forward no longer
    shadows the sum() builtin, and __del__ tolerates a half-constructed
    instance.
    """

    def __del__(self):
        # Best-effort cleanup; conn may be absent if __init__ failed early.
        conn = getattr(self, "conn", None)
        if conn is not None:
            conn.close()

    def __init__(self, verbose=False):
        self.verbose = verbose
        self.conn = sqlite3.connect("ctr.db")

    def setup(self, word_ids, url_ids):
        """Pull the sub-network relevant to these word/url ids into memory."""
        self.word_ids = word_ids
        self.url_ids = url_ids
        self.hidden_ids = self.hidden_nodes_for(word_ids, url_ids)
        if self.verbose:
            print(f"hidden_ids: {self.hidden_ids}")
        # activations for the input, hidden and output layers
        self.ai = [1.0] * len(self.word_ids)
        self.ah = [1.0] * len(self.hidden_ids)
        self.ao = [1.0] * len(self.url_ids)
        # weight matrices, loaded cell-by-cell from sqlite
        self.wi = [
            [self.get_strength(word_id, hidden_id, 0) for hidden_id in self.hidden_ids]
            for word_id in self.word_ids
        ]
        self.wo = [
            [self.get_strength(hidden_id, url_id, 1) for url_id in self.url_ids]
            for hidden_id in self.hidden_ids
        ]
        if self.verbose:
            print("word input")
            print(self.wi)
            print("word output")
            print(self.wo)

    def get_strength(self, from_id, to_id, layer):
        """Return the stored weight for one edge, or the book's default when
        the edge is missing: -0.2 for word->hidden (layer 0), 0 otherwise."""
        table = "word_to_hidden" if layer == 0 else "hidden_to_url"
        # table name is chosen from a fixed pair; ids go through placeholders
        res = self.conn.execute(
            f"select strength from {table} where from_id=? and to_id=?",
            (from_id, to_id),
        ).fetchone()
        if not res:
            return -0.2 if layer == 0 else 0
        return res[0]

    def set_strength(self, from_id, to_id, layer, strength):
        """Insert or update a single edge weight and commit immediately.

        Raises:
            ValueError: if either id is None (a None id would silently
                create meaningless rows).
        """
        if from_id is None:
            raise ValueError("from_id was None, this would be meaningless")
        if to_id is None:
            raise ValueError("to_id was None, this would be meaningless")
        table = "word_to_hidden" if layer == 0 else "hidden_to_url"
        # No unique index exists on (from_id, to_id), so sqlite's native
        # UPSERT can't be used; emulate it with select-then-write.
        res = self.conn.execute(
            f"select rowid from {table} where from_id=? and to_id=?",
            (from_id, to_id),
        ).fetchone()
        if not res:
            self.conn.execute(
                f"insert into {table} (from_id,to_id,strength) values (?,?,?)",
                (from_id, to_id, strength),
            )
        else:
            self.conn.execute(
                f"update {table} set strength=? where rowid=?",
                (strength, res[0]),
            )
        self.conn.commit()

    def create_hidden_node(self, word_ids, urls):
        """Create a hidden node keyed by this word-id combination (no-op if
        it already exists) and wire default-strength edges to every input
        word and every candidate url."""
        hidden_id = word_ids_keyer(word_ids)
        res = self.conn.execute(
            "select rowid from hidden_node where create_key=?", (hidden_id,)
        ).fetchone()
        if not res:
            cur = self.conn.execute(
                "insert into hidden_node (create_key) values (?)", (hidden_id,)
            )
            row_id = cur.lastrowid
            for word_id in word_ids:
                self.set_strength(word_id, row_id, 0, 1.0 / len(word_ids))
            for url_id in urls:
                self.set_strength(row_id, url_id, 1, 0.1)
            self.conn.commit()

    def hidden_nodes_for(self, word_ids, url_ids):
        """Return all hidden-node ids connected to any of the given words
        or urls (deduplicated via dict keys, insertion-ordered)."""
        relevant = {}
        for word_id in word_ids:
            cur = self.conn.execute(
                "select to_id from word_to_hidden where from_id=?", (word_id,)
            )
            for row in cur:
                relevant[row[0]] = 1
        for url_id in url_ids:
            cur = self.conn.execute(
                "select from_id from hidden_to_url where to_id=?", (url_id,)
            )
            for row in cur:
                relevant[row[0]] = 1
        return list(relevant.keys())

    def feed_forward(self):
        """Run one forward pass over the loaded sub-network.

        Returns a copy of the output-layer activations (one per url id).
        """
        # query words are inputs
        for i in range(len(self.word_ids)):
            self.ai[i] = 1.0
        if self.verbose:
            print(f"will be {len(self.hidden_ids)} x {len(self.word_ids)}")
        # hidden activations ('total' instead of shadowing the sum builtin)
        for j in range(len(self.hidden_ids)):
            total = 0.0
            for i in range(len(self.word_ids)):
                total = total + self.ai[i] * self.wi[i][j]
            self.ah[j] = tanh(total)
        # outputs
        if self.verbose:
            print(f"will be {len(self.url_ids)} x {len(self.hidden_ids)}")
        for k in range(len(self.url_ids)):
            total = 0.0
            for j in range(len(self.hidden_ids)):
                total = total + self.ah[j] * self.wo[j][k]
            self.ao[k] = tanh(total)
        return self.ao[:]

    def update(self):
        """Persist the in-memory weight matrices back to sqlite."""
        for i in range(len(self.word_ids)):
            for j in range(len(self.hidden_ids)):
                self.set_strength(
                    self.word_ids[i], self.hidden_ids[j], 0, self.wi[i][j]
                )
        for j in range(len(self.hidden_ids)):
            for k in range(len(self.url_ids)):
                self.set_strength(self.hidden_ids[j], self.url_ids[k], 1, self.wo[j][k])
        self.conn.commit()

    def back_prop(self, targets, n=0.5):
        """One backpropagation step toward `targets` with learning rate n.

        NOTE(review): as in the book's code, the value returned is whatever
        'error' held after the hidden-layer loop (the last hidden node's
        accumulated error), not an aggregate over the network. Kept as-is
        for compatibility because train() callers record it; confirm intent
        before changing.
        """
        # calculate errors for output
        output_deltas = [0.0] * len(self.url_ids)
        for k in range(len(self.url_ids)):
            error = targets[k] - self.ao[k]
            output_deltas[k] = dtanh(self.ao[k]) * error
        # calculate errors for hidden layer
        hidden_deltas = [0.0] * len(self.hidden_ids)
        for j in range(len(self.hidden_ids)):
            error = 0.0
            for k in range(len(self.url_ids)):
                error = error + output_deltas[k] * self.wo[j][k]
            hidden_deltas[j] = dtanh(self.ah[j]) * error
        # update output weights
        for j in range(len(self.hidden_ids)):
            for k in range(len(self.url_ids)):
                change = output_deltas[k] * self.ah[j]
                self.wo[j][k] = self.wo[j][k] + n * change
        # update input weights
        for i in range(len(self.word_ids)):
            for j in range(len(self.hidden_ids)):
                change = hidden_deltas[j] * self.ai[i]
                self.wi[i][j] = self.wi[i][j] + n * change
        return error

    def predict(self, word_ids, url_ids):
        """Score each candidate url for this query; returns the output list."""
        start = timer()
        self.setup(word_ids, url_ids)
        end = timer()
        if self.verbose:
            print(f"setup: {end - start}")  # Time in seconds, e.g. 5.38091952400282
        score = self.feed_forward()
        score_end = timer()
        if self.verbose:
            print(f"score: {score_end - end}")  # Time in seconds, e.g. 5.38091952400282
        return score

    def train(self, word_ids, url_ids, clicked_urls):
        """One training example: words shown with url_ids, positions in
        clicked_urls were clicked (target 1.0, others 0.0). Persists the
        updated weights and returns back_prop's error value."""
        self.create_hidden_node(word_ids, url_ids)
        self.setup(word_ids, url_ids)
        self.feed_forward()
        targets = [0.0] * len(url_ids)
        for click in clicked_urls:
            targets[click] = 1.0
        error = self.back_prop(targets)
        if self.verbose:
            print(error)
        self.update()
        return error
if __name__ == "__main__": | |
setup() | |
cn = click_net(verbose=False) | |
# this is the examples directly from the book for double checking | |
if sys.argv[1] == "test": | |
# some example data from the book that gives expected outputs | |
wWorld, wRiver, wBank = [word_to_id(w) for w in ["world", "river", "bank"]] | |
uWorldBank, uRiver, uEarth = [ | |
url_to_id(w) | |
for w in [ | |
"http://www.world-bank.org", | |
"http://wiki.com/river", | |
"http://earth.com", | |
] | |
] | |
w2idx = {"world": 111542825, "river": 109117993, "bank": 66453917} | |
idx2w = {111542825: "world", 109117993: "river", 66453917: "bank"} | |
u2idx = { | |
"http://www.world-bank.org": 2026834259, | |
"http://wiki.com/river": 1415710673, | |
"http://earth.com": 831915482, | |
} | |
idx2u = { | |
2026834259: "http://www.world-bank.org", | |
1415710673: "http://wiki.com/river", | |
831915482: "http://earth.com", | |
} | |
all_urls = [uWorldBank, uRiver, uEarth] | |
cn.create_hidden_node([wWorld, wBank], all_urls) | |
print("word_to_hidden:") | |
for r in cn.conn.execute("select * from word_to_hidden"): | |
print(r) | |
print("hidden_to_url:") | |
for r in cn.conn.execute("select * from hidden_to_url"): | |
print(r) | |
print("predict...") | |
print(cn.predict([wWorld, wBank], all_urls)) | |
for i in range(10): | |
cn.train([wWorld, wBank], all_urls, [0]) | |
cn.train([wRiver, wBank], all_urls, [1]) | |
cn.train([wWorld], all_urls, [2]) | |
def get_url(prediction): | |
print(f"prediction was: {prediction}") | |
return idx2u[all_urls[prediction.index(max(prediction))]] | |
print("world bank...") | |
world_bank = cn.predict([wWorld, wBank], all_urls) | |
print(get_url(world_bank)) | |
print("river bank...") | |
river_bank = cn.predict([wRiver, wBank], all_urls) | |
print(get_url(river_bank)) | |
print("bank...") | |
bank = cn.predict([wBank], all_urls) | |
print(get_url(bank)) | |
# The section below this point is not in the book, but trying to use it on other data... | |
# this is to operate on "real click data" | |
if sys.argv[1] == "train": | |
# parquet because we are parsing columns of lists, which csv can't support | |
# parquet looks like: | |
# str keywords, | |
# list str url, | |
# list float pr_v2_score where pr_v2_score is a BERT score for Passage in page | |
training_data = pd.read_parquet("training_data.parquet") # .head(100) | |
def on_row(keywords, url, pr_v2_score): | |
print(f"on keywords: {keywords}") | |
word_ids = [word_to_id(w) for w in keywords.split(" ")] | |
url_ids = [url_to_id(u) for u in url] | |
# NOTE: click threshold is here... | |
clicks = [idx for idx, score in enumerate(pr_v2_score) if score > 0.05] | |
return cn.train(word_ids, url_ids, clicks) | |
training_data["error"] = np.vectorize(on_row)( | |
training_data["keywords"], | |
training_data["url"], | |
training_data["pr_v2_score"], | |
) | |
print(training_data["error"].describe()) | |
# after having trained with the "click data" this gives us a sandbox | |
if sys.argv[1] == "repl": | |
clicks = pd.read_csv("clicks.csv") | |
def query(words, urls): | |
word_ids = [word_to_id(w) for w in words.split(" ")] | |
url_ids = [url_to_id(u) for u in urls] | |
predicted = cn.predict(word_ids, url_ids) | |
return sorted(list(zip(predicted, urls)), key=lambda x: x[0]) | |
print("cn and clicks (.csv) in scope... try something like:") | |
print("sample = clicks.head(50)") | |
print("that has url column, so you can grab list(sample['url'])") | |
print("query('some words', list_of_urls)") | |
from IPython import embed | |
embed() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment