Created
July 20, 2020 01:13
-
-
Save mooreniemi/3091d78b6620bb354c4b68c00305b85b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is a version of 'Learning from Clicks' in 'Programming Collective | |
# Intelligence' p74, a pdf is online at the below url | |
# https://github.com/ShawnLeee/the-book/blob/master/pybooks/Programming%20Collective%20Intelligence.pdf | |
# | |
# This is a fully connected network stored in sqlite, so, yeah, not for Production. :) | |
from math import tanh | |
from timeit import default_timer as timer | |
import pandas as pd | |
import numpy as np | |
import sys | |
import zlib | |
import time | |
import sqlite3 | |
def dtanh(y):
    """Derivative of tanh expressed in terms of the activation y = tanh(x)."""
    return 1.0 - y ** 2
def url_to_id(word):
    """Map a url string to its numeric id.

    Currently just an alias for word_to_id(); kept as a separate entry
    point so the two id spaces could diverge later.
    """
    return word_to_id(word)
def word_to_id(word):
    """Hash a word to a stable 32-bit int id (adler32 of its utf-8 bytes).

    Always returns an int. The previous version caught every Exception,
    printed it, and implicitly returned None, which then produced invalid
    ids (and broken SQL) downstream. Encoding a str can only fail with
    UnicodeEncodeError (e.g. a lone surrogate); for that case we fall back
    to a lossy 'replace' encoding so callers still get a usable id.
    """
    word = str(word)  # force it
    try:
        return zlib.adler32(word.encode("utf-8"))
    except UnicodeEncodeError as e:
        print(f"{word} threw {e}")
        return zlib.adler32(word.encode("utf-8", errors="replace"))
def word_ids_keyer(word_ids):
    """Build a canonical, order-insensitive string key for a set of word ids.

    Ids are stringified and sorted lexicographically, then joined with '_',
    so the same combination of words always yields the same key.
    """
    parts = sorted(str(word_id) for word_id in word_ids)
    return "_".join(parts)
def setup():
    """Create the ctr.db schema if it doesn't exist yet (idempotent).

    Fixes over the original: the connection is closed via try/finally even
    when an execute() raises, and the DDL is committed explicitly (the
    sqlite3 module's implicit-transaction handling around DDL has varied
    across Python versions, so don't rely on it).
    """
    conn = sqlite3.connect("ctr.db")
    try:
        res = conn.execute(
            "SELECT count(*) FROM sqlite_master WHERE type='table' AND name='hidden_node'"
        ).fetchone()
        if res[0] == 0:
            print("creating tables...")
            # for speed... https://blog.devart.com/increasing-sqlite-performance.html
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute("PRAGMA locking_mode=EXCLUSIVE")
            conn.execute("create table hidden_node(create_key)")
            conn.execute("create index hidden_create_keys on hidden_node (create_key)")
            conn.execute("create table word_to_hidden(from_id, to_id, strength)")
            conn.execute("create index w2h_from_to_ids on word_to_hidden (from_id, to_id)")
            conn.execute("create table hidden_to_url(from_id, to_id, strength)")
            conn.execute("create index h2u_from_to_ids on hidden_to_url (from_id, to_id)")
            conn.commit()
        else:
            print("found tables...")
    finally:
        conn.close()
class click_net:
    """Fully connected 'Learning from Clicks' network (PCI p74) persisted
    in sqlite ('ctr.db').

    Layers: query word ids -> hidden nodes -> url ids. Edge weights live
    in the word_to_hidden and hidden_to_url tables created by the
    module-level setup().

    Changes from the original: all queries are parameterized instead of
    f-string interpolated, set_strength validates with ValueError instead
    of assert (asserts are stripped under -O), feed_forward no longer
    shadows the sum() builtin, and __del__ tolerates a half-constructed
    instance.
    """

    def __del__(self):
        # Best-effort cleanup; conn may be absent if __init__ failed early.
        conn = getattr(self, "conn", None)
        if conn is not None:
            conn.close()

    def __init__(self, verbose=False):
        self.verbose = verbose
        self.conn = sqlite3.connect("ctr.db")

    def setup(self, word_ids, url_ids):
        """Pull the sub-network relevant to these word/url ids into memory."""
        self.word_ids = word_ids
        self.url_ids = url_ids
        self.hidden_ids = self.hidden_nodes_for(word_ids, url_ids)
        if self.verbose:
            print(f"hidden_ids: {self.hidden_ids}")
        # activations for the input, hidden and output layers
        self.ai = [1.0] * len(self.word_ids)
        self.ah = [1.0] * len(self.hidden_ids)
        self.ao = [1.0] * len(self.url_ids)
        # weight matrices, loaded cell-by-cell from sqlite
        self.wi = [
            [self.get_strength(word_id, hidden_id, 0) for hidden_id in self.hidden_ids]
            for word_id in self.word_ids
        ]
        self.wo = [
            [self.get_strength(hidden_id, url_id, 1) for url_id in self.url_ids]
            for hidden_id in self.hidden_ids
        ]
        if self.verbose:
            print("word input")
            print(self.wi)
            print("word output")
            print(self.wo)

    def get_strength(self, from_id, to_id, layer):
        """Return the stored weight for one edge, or the book's default when
        the edge is missing: -0.2 for word->hidden (layer 0), 0 otherwise."""
        table = "word_to_hidden" if layer == 0 else "hidden_to_url"
        # table name is chosen from a fixed pair; ids go through placeholders
        res = self.conn.execute(
            f"select strength from {table} where from_id=? and to_id=?",
            (from_id, to_id),
        ).fetchone()
        if not res:
            return -0.2 if layer == 0 else 0
        return res[0]

    def set_strength(self, from_id, to_id, layer, strength):
        """Insert or update a single edge weight and commit immediately.

        Raises:
            ValueError: if either id is None (a None id would silently
                create meaningless rows).
        """
        if from_id is None:
            raise ValueError("from_id was None, this would be meaningless")
        if to_id is None:
            raise ValueError("to_id was None, this would be meaningless")
        table = "word_to_hidden" if layer == 0 else "hidden_to_url"
        # No unique index exists on (from_id, to_id), so sqlite's native
        # UPSERT can't be used; emulate it with select-then-write.
        res = self.conn.execute(
            f"select rowid from {table} where from_id=? and to_id=?",
            (from_id, to_id),
        ).fetchone()
        if not res:
            self.conn.execute(
                f"insert into {table} (from_id,to_id,strength) values (?,?,?)",
                (from_id, to_id, strength),
            )
        else:
            self.conn.execute(
                f"update {table} set strength=? where rowid=?",
                (strength, res[0]),
            )
        self.conn.commit()

    def create_hidden_node(self, word_ids, urls):
        """Create a hidden node keyed by this word-id combination (no-op if
        it already exists) and wire default-strength edges to every input
        word and every candidate url."""
        hidden_id = word_ids_keyer(word_ids)
        res = self.conn.execute(
            "select rowid from hidden_node where create_key=?", (hidden_id,)
        ).fetchone()
        if not res:
            cur = self.conn.execute(
                "insert into hidden_node (create_key) values (?)", (hidden_id,)
            )
            row_id = cur.lastrowid
            for word_id in word_ids:
                self.set_strength(word_id, row_id, 0, 1.0 / len(word_ids))
            for url_id in urls:
                self.set_strength(row_id, url_id, 1, 0.1)
            self.conn.commit()

    def hidden_nodes_for(self, word_ids, url_ids):
        """Return all hidden-node ids connected to any of the given words
        or urls (deduplicated via dict keys, insertion-ordered)."""
        relevant = {}
        for word_id in word_ids:
            cur = self.conn.execute(
                "select to_id from word_to_hidden where from_id=?", (word_id,)
            )
            for row in cur:
                relevant[row[0]] = 1
        for url_id in url_ids:
            cur = self.conn.execute(
                "select from_id from hidden_to_url where to_id=?", (url_id,)
            )
            for row in cur:
                relevant[row[0]] = 1
        return list(relevant.keys())

    def feed_forward(self):
        """Run one forward pass over the loaded sub-network.

        Returns a copy of the output-layer activations (one per url id).
        """
        # query words are inputs
        for i in range(len(self.word_ids)):
            self.ai[i] = 1.0
        if self.verbose:
            print(f"will be {len(self.hidden_ids)} x {len(self.word_ids)}")
        # hidden activations ('total' instead of shadowing the sum builtin)
        for j in range(len(self.hidden_ids)):
            total = 0.0
            for i in range(len(self.word_ids)):
                total = total + self.ai[i] * self.wi[i][j]
            self.ah[j] = tanh(total)
        # outputs
        if self.verbose:
            print(f"will be {len(self.url_ids)} x {len(self.hidden_ids)}")
        for k in range(len(self.url_ids)):
            total = 0.0
            for j in range(len(self.hidden_ids)):
                total = total + self.ah[j] * self.wo[j][k]
            self.ao[k] = tanh(total)
        return self.ao[:]

    def update(self):
        """Persist the in-memory weight matrices back to sqlite."""
        for i in range(len(self.word_ids)):
            for j in range(len(self.hidden_ids)):
                self.set_strength(
                    self.word_ids[i], self.hidden_ids[j], 0, self.wi[i][j]
                )
        for j in range(len(self.hidden_ids)):
            for k in range(len(self.url_ids)):
                self.set_strength(self.hidden_ids[j], self.url_ids[k], 1, self.wo[j][k])
        self.conn.commit()

    def back_prop(self, targets, n=0.5):
        """One backpropagation step toward `targets` with learning rate n.

        NOTE(review): as in the book's code, the value returned is whatever
        'error' held after the hidden-layer loop (the last hidden node's
        accumulated error), not an aggregate over the network. Kept as-is
        for compatibility because train() callers record it; confirm intent
        before changing.
        """
        # calculate errors for output
        output_deltas = [0.0] * len(self.url_ids)
        for k in range(len(self.url_ids)):
            error = targets[k] - self.ao[k]
            output_deltas[k] = dtanh(self.ao[k]) * error
        # calculate errors for hidden layer
        hidden_deltas = [0.0] * len(self.hidden_ids)
        for j in range(len(self.hidden_ids)):
            error = 0.0
            for k in range(len(self.url_ids)):
                error = error + output_deltas[k] * self.wo[j][k]
            hidden_deltas[j] = dtanh(self.ah[j]) * error
        # update output weights
        for j in range(len(self.hidden_ids)):
            for k in range(len(self.url_ids)):
                change = output_deltas[k] * self.ah[j]
                self.wo[j][k] = self.wo[j][k] + n * change
        # update input weights
        for i in range(len(self.word_ids)):
            for j in range(len(self.hidden_ids)):
                change = hidden_deltas[j] * self.ai[i]
                self.wi[i][j] = self.wi[i][j] + n * change
        return error

    def predict(self, word_ids, url_ids):
        """Score each candidate url for this query; returns the output list."""
        start = timer()
        self.setup(word_ids, url_ids)
        end = timer()
        if self.verbose:
            print(f"setup: {end - start}")  # Time in seconds, e.g. 5.38091952400282
        score = self.feed_forward()
        score_end = timer()
        if self.verbose:
            print(f"score: {score_end - end}")  # Time in seconds, e.g. 5.38091952400282
        return score

    def train(self, word_ids, url_ids, clicked_urls):
        """One training example: words shown with url_ids, positions in
        clicked_urls were clicked (target 1.0, others 0.0). Persists the
        updated weights and returns back_prop's error value."""
        self.create_hidden_node(word_ids, url_ids)
        self.setup(word_ids, url_ids)
        self.feed_forward()
        targets = [0.0] * len(url_ids)
        for click in clicked_urls:
            targets[click] = 1.0
        error = self.back_prop(targets)
        if self.verbose:
            print(error)
        self.update()
        return error
if __name__ == "__main__": | |
setup() | |
cn = click_net(verbose=False) | |
# this is the examples directly from the book for double checking | |
if sys.argv[1] == "test": | |
# some example data from the book that gives expected outputs | |
wWorld, wRiver, wBank = [word_to_id(w) for w in ["world", "river", "bank"]] | |
uWorldBank, uRiver, uEarth = [ | |
url_to_id(w) | |
for w in [ | |
"http://www.world-bank.org", | |
"http://wiki.com/river", | |
"http://earth.com", | |
] | |
] | |
w2idx = {"world": 111542825, "river": 109117993, "bank": 66453917} | |
idx2w = {111542825: "world", 109117993: "river", 66453917: "bank"} | |
u2idx = { | |
"http://www.world-bank.org": 2026834259, | |
"http://wiki.com/river": 1415710673, | |
"http://earth.com": 831915482, | |
} | |
idx2u = { | |
2026834259: "http://www.world-bank.org", | |
1415710673: "http://wiki.com/river", | |
831915482: "http://earth.com", | |
} | |
all_urls = [uWorldBank, uRiver, uEarth] | |
cn.create_hidden_node([wWorld, wBank], all_urls) | |
print("word_to_hidden:") | |
for r in cn.conn.execute("select * from word_to_hidden"): | |
print(r) | |
print("hidden_to_url:") | |
for r in cn.conn.execute("select * from hidden_to_url"): | |
print(r) | |
print("predict...") | |
print(cn.predict([wWorld, wBank], all_urls)) | |
for i in range(10): | |
cn.train([wWorld, wBank], all_urls, [0]) | |
cn.train([wRiver, wBank], all_urls, [1]) | |
cn.train([wWorld], all_urls, [2]) | |
def get_url(prediction): | |
print(f"prediction was: {prediction}") | |
return idx2u[all_urls[prediction.index(max(prediction))]] | |
print("world bank...") | |
world_bank = cn.predict([wWorld, wBank], all_urls) | |
print(get_url(world_bank)) | |
print("river bank...") | |
river_bank = cn.predict([wRiver, wBank], all_urls) | |
print(get_url(river_bank)) | |
print("bank...") | |
bank = cn.predict([wBank], all_urls) | |
print(get_url(bank)) | |
# The section below this point is not in the book, but trying to use it on other data... | |
# this is to operate on "real click data" | |
if sys.argv[1] == "train": | |
# parquet because we are parsing columns of lists, which csv can't support | |
# parquet looks like: | |
# str keywords, | |
# list str url, | |
# list float pr_v2_score where pr_v2_score is a BERT score for Passage in page | |
training_data = pd.read_parquet("training_data.parquet") # .head(100) | |
def on_row(keywords, url, pr_v2_score): | |
print(f"on keywords: {keywords}") | |
word_ids = [word_to_id(w) for w in keywords.split(" ")] | |
url_ids = [url_to_id(u) for u in url] | |
# NOTE: click threshold is here... | |
clicks = [idx for idx, score in enumerate(pr_v2_score) if score > 0.05] | |
return cn.train(word_ids, url_ids, clicks) | |
training_data["error"] = np.vectorize(on_row)( | |
training_data["keywords"], | |
training_data["url"], | |
training_data["pr_v2_score"], | |
) | |
print(training_data["error"].describe()) | |
# after having trained with the "click data" this gives us a sandbox | |
if sys.argv[1] == "repl": | |
clicks = pd.read_csv("clicks.csv") | |
def query(words, urls): | |
word_ids = [word_to_id(w) for w in words.split(" ")] | |
url_ids = [url_to_id(u) for u in urls] | |
predicted = cn.predict(word_ids, url_ids) | |
return sorted(list(zip(predicted, urls)), key=lambda x: x[0]) | |
print("cn and clicks (.csv) in scope... try something like:") | |
print("sample = clicks.head(50)") | |
print("that has url column, so you can grab list(sample['url'])") | |
print("query('some words', list_of_urls)") | |
from IPython import embed | |
embed() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment