Skip to content

Instantly share code, notes, and snippets.

@mooreniemi
Created July 20, 2020 01:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mooreniemi/3091d78b6620bb354c4b68c00305b85b to your computer and use it in GitHub Desktop.
# This is a version of 'Learning from Clicks' in 'Programming Collective
# Intelligence' p74, a pdf is online at the below url
# https://github.com/ShawnLeee/the-book/blob/master/pybooks/Programming%20Collective%20Intelligence.pdf
#
# This is a fully connected network stored in sqlite, so, yeah, not for Production. :)
from math import tanh
from timeit import default_timer as timer
import pandas as pd
import numpy as np
import sys
import zlib
import time
import sqlite3
def dtanh(y):
    """Slope of tanh at the point whose activation is ``y``, i.e. 1 - tanh(x)^2.

    Note the argument is the already-computed activation, not the pre-activation.
    """
    squared = y * y
    return 1.0 - squared
def url_to_id(word):
    """Map a URL string to its integer id.

    URLs currently share the same id space as query words, so this just
    delegates to word_to_id; kept as a separate function so the two id
    schemes could diverge later without touching callers.
    """
    return word_to_id(word) # right now the same...
def word_to_id(word):
    """Hash a word to a deterministic integer id via adler32.

    Non-str inputs are coerced to str first. On failure the exception is
    printed and None falls out (best-effort behaviour, kept as-is).
    """
    word = str(word) # force it
    try:
        encoded = word.encode("utf-8")
    except Exception as e:
        print(f"{word} threw {e}")
    else:
        return zlib.adler32(encoded)
def word_ids_keyer(word_ids):
    """Build a stable string key for a collection of word ids.

    Ids are stringified, sorted lexicographically, and joined with '_', so
    the same set of ids always yields the same key regardless of order.
    """
    parts = sorted(str(word_id) for word_id in word_ids)
    return "_".join(parts)
def setup(db_path="ctr.db"):
    """Create the sqlite schema for the click network if it doesn't exist.

    Idempotent: checks sqlite_master for the hidden_node table and only
    creates tables/indexes on first run.

    Args:
        db_path: sqlite file to initialise (defaults to the historical
            hard-coded "ctr.db", so existing callers are unaffected).
    """
    conn = sqlite3.connect(db_path)
    try:
        res = conn.execute(
            "SELECT count(*) FROM sqlite_master WHERE type='table' AND name='hidden_node'"
        ).fetchone()
        if res[0] == 0:
            print("creating tables...")
            # for speed... https://blog.devart.com/increasing-sqlite-performance.html
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute("PRAGMA locking_mode=EXCLUSIVE")
            conn.execute("create table hidden_node(create_key)")
            conn.execute("create index hidden_create_keys on hidden_node (create_key)")
            conn.execute("create table word_to_hidden(from_id, to_id, strength)")
            conn.execute("create index w2h_from_to_ids on word_to_hidden (from_id, to_id)")
            conn.execute("create table hidden_to_url(from_id, to_id, strength)")
            conn.execute("create index h2u_from_to_ids on hidden_to_url (from_id, to_id)")
        else:
            print("found tables...")
    finally:
        # previously the connection leaked if any execute raised
        conn.close()
class click_net:
    """'Learning from Clicks' network (Programming Collective Intelligence,
    ch. 4) persisted in sqlite.

    Layers: query-word ids -> hidden nodes (one per distinct query word
    combination) -> url ids. Edge strengths live in the word_to_hidden and
    hidden_to_url tables; setup() pulls the query-relevant sub-network into
    in-memory activation vectors (ai/ah/ao) and weight matrices (wi/wo).

    All queries now use sqlite parameter binding instead of f-string
    interpolation (the previous form was an injection/quoting hazard).
    """

    def __del__(self):
        # Best-effort close: on a partially constructed instance or during
        # interpreter teardown self.conn may be missing/unusable.
        try:
            self.conn.close()
        except Exception:
            pass

    def __init__(self, verbose=False, db_path="ctr.db"):
        """Open the backing database.

        Args:
            verbose: print timing/debug output.
            db_path: sqlite file (defaults to the historical "ctr.db").
        """
        self.verbose = verbose
        self.conn = sqlite3.connect(db_path)

    def setup(self, word_ids, url_ids):
        """Load the sub-network relevant to this query into memory.

        Builds activations (ai/ah/ao, all 1.0) and the weight matrices
        wi (words x hidden) and wo (hidden x urls) from stored strengths.
        """
        self.word_ids = word_ids
        self.url_ids = url_ids
        self.hidden_ids = self.hidden_nodes_for(word_ids, url_ids)
        if self.verbose:
            print(f"hidden_ids: {self.hidden_ids}")
        self.ai = [1.0] * len(self.word_ids)
        self.ah = [1.0] * len(self.hidden_ids)
        self.ao = [1.0] * len(self.url_ids)
        self.wi = [
            [self.get_strength(word_id, hidden_id, 0) for hidden_id in self.hidden_ids]
            for word_id in self.word_ids
        ]
        self.wo = [
            [self.get_strength(hidden_id, url_id, 1) for url_id in self.url_ids]
            for hidden_id in self.hidden_ids
        ]
        if self.verbose:
            print("word input")
            print(self.wi)
            print("word output")
            print(self.wo)

    def get_strength(self, from_id, to_id, layer):
        """Return one stored edge weight; layer 0 = word->hidden, 1 = hidden->url.

        Missing edges get the book's defaults: -0.2 into the hidden layer
        (slightly inhibitory) and 0 (neutral) out of it.
        """
        # table name comes from a fixed literal; ids are bound as parameters
        table = "word_to_hidden" if layer == 0 else "hidden_to_url"
        res = self.conn.execute(
            f"select strength from {table} where from_id=? and to_id=?",
            (from_id, to_id),
        ).fetchone()
        if not res:
            return -0.2 if layer == 0 else 0
        return res[0]

    def set_strength(self, from_id, to_id, layer, strength):
        """Insert or update one edge weight and commit."""
        assert from_id is not None, "from_id was None, this would be meaningless"
        assert to_id is not None, "to_id was None, this would be meaningless"
        table = "word_to_hidden" if layer == 0 else "hidden_to_url"
        # FIXME: use upsert as it's supported in sqlite 3.24+
        res = self.conn.execute(
            f"select rowid from {table} where from_id=? and to_id=?",
            (from_id, to_id),
        ).fetchone()
        if not res:
            self.conn.execute(
                f"insert into {table} (from_id,to_id,strength) values (?,?,?)",
                (from_id, to_id, strength),
            )
        else:
            self.conn.execute(
                f"update {table} set strength=? where rowid=?",
                (strength, res[0]),
            )
        self.conn.commit()

    def create_hidden_node(self, word_ids, urls):
        """Create a hidden node for this exact word combination (if new) and
        wire default-strength edges from the words and to every candidate url."""
        hidden_id = word_ids_keyer(word_ids)
        res = self.conn.execute(
            "select rowid from hidden_node where create_key=?", (hidden_id,)
        ).fetchone()
        if not res:
            cur = self.conn.execute(
                "insert into hidden_node (create_key) values (?)", (hidden_id,)
            )
            row_id = cur.lastrowid
            # inbound strength is split evenly across the query words
            for word_id in word_ids:
                self.set_strength(word_id, row_id, 0, 1.0 / len(word_ids))
            for url_id in urls:
                self.set_strength(row_id, url_id, 1, 0.1)
            self.conn.commit()

    def hidden_nodes_for(self, word_ids, url_ids):
        """Return ids of hidden nodes connected to any query word or any url
        (dict used as an insertion-ordered set to dedupe)."""
        relevant = {}
        for word_id in word_ids:
            cur = self.conn.execute(
                "select to_id from word_to_hidden where from_id=?", (word_id,)
            )
            for row in cur:
                relevant[row[0]] = 1
        for url_id in url_ids:
            cur = self.conn.execute(
                "select from_id from hidden_to_url where to_id=?", (url_id,)
            )
            for row in cur:
                relevant[row[0]] = 1
        return list(relevant.keys())

    def feed_forward(self):
        """Run one forward pass and return a copy of the output activations."""
        # query words are inputs; word activations are always 1.0
        for i in range(len(self.word_ids)):
            self.ai[i] = 1.0
        if self.verbose:
            print(f"will be {len(self.hidden_ids)} x {len(self.word_ids)}")
        # hidden activations (renamed accumulator: `sum` shadowed the builtin)
        for j in range(len(self.hidden_ids)):
            total = 0.0
            for i in range(len(self.word_ids)):
                total = total + self.ai[i] * self.wi[i][j]
            self.ah[j] = tanh(total)
        # outputs
        if self.verbose:
            print(f"will be {len(self.url_ids)} x {len(self.hidden_ids)}")
        for k in range(len(self.url_ids)):
            total = 0.0
            for j in range(len(self.hidden_ids)):
                total = total + self.ah[j] * self.wo[j][k]
            self.ao[k] = tanh(total)
        return self.ao[:]

    def update(self):
        """Persist the in-memory weight matrices back to sqlite."""
        for i in range(len(self.word_ids)):
            for j in range(len(self.hidden_ids)):
                self.set_strength(
                    self.word_ids[i], self.hidden_ids[j], 0, self.wi[i][j]
                )
        for j in range(len(self.hidden_ids)):
            for k in range(len(self.url_ids)):
                self.set_strength(self.hidden_ids[j], self.url_ids[k], 1, self.wo[j][k])
        self.conn.commit()

    def back_prop(self, targets, n=0.5):
        """One backpropagation step over the in-memory weights.

        Args:
            targets: desired output per url (1.0 for clicked, else 0.0).
            n: learning rate.

        Returns:
            The book's quirky leftover `error` value (the last hidden unit's
            accumulated error, not an aggregate loss) — kept for
            compatibility with existing callers that print it. Now
            initialised so empty layers return 0.0 instead of NameError.
        """
        error = 0.0
        # calculate errors for output
        output_deltas = [0.0] * len(self.url_ids)
        for k in range(len(self.url_ids)):
            error = targets[k] - self.ao[k]
            output_deltas[k] = dtanh(self.ao[k]) * error
        # calculate errors for hidden layer
        hidden_deltas = [0.0] * len(self.hidden_ids)
        for j in range(len(self.hidden_ids)):
            error = 0.0
            for k in range(len(self.url_ids)):
                error = error + output_deltas[k] * self.wo[j][k]
            hidden_deltas[j] = dtanh(self.ah[j]) * error
        # update output weights
        for j in range(len(self.hidden_ids)):
            for k in range(len(self.url_ids)):
                change = output_deltas[k] * self.ah[j]
                self.wo[j][k] = self.wo[j][k] + n * change
        # update input weights
        for i in range(len(self.word_ids)):
            for j in range(len(self.hidden_ids)):
                change = hidden_deltas[j] * self.ai[i]
                self.wi[i][j] = self.wi[i][j] + n * change
        return error

    def predict(self, word_ids, url_ids):
        """Score url_ids for word_ids: load the sub-network, feed forward."""
        start = timer()
        self.setup(word_ids, url_ids)
        end = timer()
        if self.verbose:
            print(f"setup: {end - start}") # Time in seconds, e.g. 5.38091952400282
        score = self.feed_forward()
        score_end = timer()
        if self.verbose:
            print(f"score: {score_end - end}") # Time in seconds, e.g. 5.38091952400282
        return score

    def train(self, word_ids, url_ids, clicked_urls):
        """One online-training step.

        Args:
            clicked_urls: *indexes into url_ids* (not url ids) that were
                clicked; their targets become 1.0.
        """
        self.create_hidden_node(word_ids, url_ids)
        self.setup(word_ids, url_ids)
        self.feed_forward()
        targets = [0.0] * len(url_ids)
        for click in clicked_urls:
            targets[click] = 1.0
        error = self.back_prop(targets)
        if self.verbose:
            print(error)
        self.update()
        return error
if __name__ == "__main__":
    # CLI entry point: modes are selected by sys.argv[1] ("test", "train",
    # "repl"); an IndexError is raised if no argument is given.
    setup()
    cn = click_net(verbose=False)
    # this is the examples directly from the book for double checking
    if sys.argv[1] == "test":
        # some example data from the book that gives expected outputs
        wWorld, wRiver, wBank = [word_to_id(w) for w in ["world", "river", "bank"]]
        uWorldBank, uRiver, uEarth = [
            url_to_id(w)
            for w in [
                "http://www.world-bank.org",
                "http://wiki.com/river",
                "http://earth.com",
            ]
        ]
        # precomputed adler32 ids for the same words/urls, kept as lookup
        # tables so predictions can be mapped back to readable strings
        w2idx = {"world": 111542825, "river": 109117993, "bank": 66453917}
        idx2w = {111542825: "world", 109117993: "river", 66453917: "bank"}
        u2idx = {
            "http://www.world-bank.org": 2026834259,
            "http://wiki.com/river": 1415710673,
            "http://earth.com": 831915482,
        }
        idx2u = {
            2026834259: "http://www.world-bank.org",
            1415710673: "http://wiki.com/river",
            831915482: "http://earth.com",
        }
        all_urls = [uWorldBank, uRiver, uEarth]
        cn.create_hidden_node([wWorld, wBank], all_urls)
        # dump the raw edge tables for eyeballing against the book's figures
        print("word_to_hidden:")
        for r in cn.conn.execute("select * from word_to_hidden"):
            print(r)
        print("hidden_to_url:")
        for r in cn.conn.execute("select * from hidden_to_url"):
            print(r)
        print("predict...")
        print(cn.predict([wWorld, wBank], all_urls))
        # ten rounds of online training on the book's three example queries;
        # third train arg is the *index* of the clicked url in all_urls
        for i in range(10):
            cn.train([wWorld, wBank], all_urls, [0])
            cn.train([wRiver, wBank], all_urls, [1])
            cn.train([wWorld], all_urls, [2])
        def get_url(prediction):
            # map the highest-scoring position back to its url string
            print(f"prediction was: {prediction}")
            return idx2u[all_urls[prediction.index(max(prediction))]]
        # after training, each query should now prefer its clicked url
        print("world bank...")
        world_bank = cn.predict([wWorld, wBank], all_urls)
        print(get_url(world_bank))
        print("river bank...")
        river_bank = cn.predict([wRiver, wBank], all_urls)
        print(get_url(river_bank))
        print("bank...")
        bank = cn.predict([wBank], all_urls)
        print(get_url(bank))
    # The section below this point is not in the book, but trying to use it on other data...
    # this is to operate on "real click data"
    if sys.argv[1] == "train":
        # parquet because we are parsing columns of lists, which csv can't support
        # parquet looks like:
        # str keywords,
        # list str url,
        # list float pr_v2_score where pr_v2_score is a BERT score for Passage in page
        training_data = pd.read_parquet("training_data.parquet")  # .head(100)
        def on_row(keywords, url, pr_v2_score):
            # one training step per row; returns the back_prop error so it
            # can be collected into an "error" column below
            print(f"on keywords: {keywords}")
            word_ids = [word_to_id(w) for w in keywords.split(" ")]
            url_ids = [url_to_id(u) for u in url]
            # NOTE: click threshold is here...
            # NOTE(review): 0.05 treats any passage score above it as a
            # "click" — presumably tuned by hand; verify against the data
            clicks = [idx for idx, score in enumerate(pr_v2_score) if score > 0.05]
            return cn.train(word_ids, url_ids, clicks)
        training_data["error"] = np.vectorize(on_row)(
            training_data["keywords"],
            training_data["url"],
            training_data["pr_v2_score"],
        )
        print(training_data["error"].describe())
    # after having trained with the "click data" this gives us a sandbox
    if sys.argv[1] == "repl":
        clicks = pd.read_csv("clicks.csv")
        def query(words, urls):
            # score the candidate urls for a space-separated query string;
            # returns (score, url) pairs sorted ascending by score
            word_ids = [word_to_id(w) for w in words.split(" ")]
            url_ids = [url_to_id(u) for u in urls]
            predicted = cn.predict(word_ids, url_ids)
            return sorted(list(zip(predicted, urls)), key=lambda x: x[0])
        print("cn and clicks (.csv) in scope... try something like:")
        print("sample = clicks.head(50)")
        print("that has url column, so you can grab list(sample['url'])")
        print("query('some words', list_of_urls)")
        # drop into an interactive shell with cn/clicks/query in scope
        from IPython import embed
        embed()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment