# Pseudocode gist by @DennisSoemers (last active June 22, 2019)
# "Adapting to Concept Drift in Credit Card Transaction Data Streams Using Contextual Bandits and Decision Trees"
"""
This file contains some very high-level pseudocode for the paper:
Dennis J.N.J. Soemers, Tim Brys, Kurt Driessens, Mark H.M. Winands, and Ann Nowé (2018).
“Adapting to Concept Drift in Credit Card Transaction Data Streams Using Contextual Bandits and Decision Trees”.
In Thirtieth Annual Conference on Innovative Applications of Artificial Intelligence (IAAI-18), pp. 7831-7836. AAAI Press.
Paper link: https://aaai.org/ocs/index.php/AAAI/AAAI18/paper/view/16183
Note that this pseudocode was not published simultaneously with the paper, but written a while later
due to requests for source code / pseudocode.
This pseudocode is written in a .py file primarily to get nice syntax highlighting, but it is
by no means directly runnable source code.
When in doubt, trust what's written in the paper over this (this was written quite a while later,
from memory). If anything looks wrong, or for any other questions, please feel free to contact me:
@author Dennis Soemers ( d<DOT>soemers<AT>gmail<DOT>com )
"""
import numpy as np
'''
Some hyperparameters
'''
# Number of models per node to train on different bootstraps of data (see Algorithm 1 in paper)
J = 100
# dimensionality of our feature vectors
DIM = ...
# number of discrete time steps in the simulation (for simplicity, suppose 1 transaction occurs per time step)
T = 200000
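As a self-contained illustration of how the J bootstrap models per node drive exploration (a minimal sketch with toy sizes and data; `bootstrap_update` and `thompson_select` are hypothetical helper names, not identifiers from the paper):

```python
import numpy as np

rng = np.random.default_rng(0)
J_TOY, DIM_TOY = 10, 4  # small toy values, not the paper's settings

# J models per node, each trained on a different bootstrap of the data
models = [np.zeros(DIM_TOY) for _ in range(J_TOY)]

def bootstrap_update(models, x, y, lr=0.1):
    """Each model takes a logistic-regression SGD step with probability 0.5."""
    for model in models:
        if rng.random() < 0.5:  # online Bernoulli(0.5) bootstrap
            p = 1.0 / (1.0 + np.exp(-model @ x))
            model += lr * (y - p) * x  # in-place gradient step

def thompson_select(models):
    """Bootstrap Thompson Sampling: act greedily w.r.t. one random model."""
    return models[rng.integers(len(models))]

# train on a toy stream of labelled observations
for _ in range(200):
    x = rng.normal(size=DIM_TOY)
    y = float(x[0] + 0.5 * x[1] > 0.0)  # toy "fraud" label
    bootstrap_update(models, x, y)

sampled = thompson_select(models)  # the model used to rank candidates
```

Because each model sees a different random half of the stream, the J models disagree, and sampling one of them uniformly yields Thompson-sampling-style exploration without maintaining an explicit posterior.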
def simulation():
    """
    This would implement a full simulation of a "real-world" system
    """
    # our regression tree starts with just a root node
    root = Node(None)

    for t in range(T):
        transaction = ...  # observe a new transaction
        root.store_transaction(transaction)

        if expert_investigator_available(t):
            # we have an expert investigator available at time t

            # collect all the leaf nodes in the tree under root (just root if it has no children)
            leaves = collect_leaf_nodes(root)

            # every leaf node proposes one of the transactions it contains;
            # this may be done randomly, or in a smarter way (for instance,
            # every leaf node may internally sort its own collection of
            # transactions in some sort of priority queue based on its models' predictions)
            candidate_transactions = [leaf.propose_candidate() for leaf in leaves]

            # every transaction has a feature vector
            xs = [transaction.get_feature_vector() for transaction in candidate_transactions]

            # pick a random model from every leaf node's collection of models;
            # this is where we get exploration in our MAB algorithm; see
            # "Bootstrap Thompson Sampling" in paper
            leaf_models = [randomly_select_one_from(leaf._models) for leaf in leaves]
            # for each selected parameter vector, also add up all the models
            # along the path to the root node; this is the "Tree-Boosting"
            # described in the paper
            #
            # NOTE: from all non-leaf nodes, we're picking the full models rather
            # than also randomly selecting from partially-trained models in the
            # internal nodes. This likely means we have relatively little
            # exploration, and other ideas (like also bootstrapping inside the
            # tree) might work better
            boosted_models = [np.copy(model) for model in leaf_models]
            for i in range(len(leaves)):
                for node in leaves[i].path_up_to_and_including_root():
                    boosted_models[i] += node._full_model

            # we've already done our exploration through Bootstrap Thompson Sampling
            # (randomly selecting one of multiple different models, each trained on
            # different subsets of data), so now we can simply exploit
            predicted_fraud_probs = [1.0 / (1.0 + np.exp(-np.dot(xs[i], boosted_models[i]))) for i in range(len(xs))]
            predicted_rewards = [predicted_fraud_probs[i] * candidate_transactions[i].amount() for i in range(len(xs))]
            selected_idx = np.argmax(predicted_rewards)

            # let the expert investigate the transaction at the selected index
            reward = ...  # transaction amount if expert says it's fraudulent, 0 otherwise
            # here we would take a Stochastic Gradient Descent step to update all
            # models on the path from the root to the selected leaf, based on the
            # observation (fraud or no fraud?)
            #
            # NOTE: for the leaf node, we give each of the models in leaf._models a
            # 50% chance of taking a learning step for this instance, and a 50%
            # chance of not taking one. This is what creates diversity among the
            # different models, and enables us to perform exploration through
            # Bootstrap Thompson Sampling
            #
            # the leaf._full_model is trained on 100% of the observed data, and is
            # used for some other purposes (like Semi-Supervised Learning)

            # here we would also remove the transaction from the tree; it has been
            # investigated, so we will never have to investigate it again

            # the observed reward may also be used by the FIMT-DD algorithm (an
            # incremental regression tree learner) to create a new split. It
            # computes the best and second-best possible splits (in terms of
            # Standard Deviation Reduction on the observed/labelled data so far),
            # and performs the best split if it is judged to truly be the best with
            # a confidence level of (1 - delta) (or if the split is simply quite
            # good, to make sure we don't get stuck when the best and second-best
            # splits are always equally good).
            #
            # For details, see the original FIMT-DD publication(s) cited in our paper
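The split decision described in the comments above can be sketched as follows. This is a generic SDR-plus-Hoeffding-bound check in the spirit of FIMT-DD, with hypothetical function names and default thresholds; it is not the exact procedure from the paper or the FIMT-DD publications:

```python
import math

def sd(values):
    """Population standard deviation."""
    m = sum(values) / len(values)
    return math.sqrt(sum((v - m) ** 2 for v in values) / len(values))

def sd_reduction(values, left, right):
    """Standard Deviation Reduction achieved by a candidate split."""
    n = len(values)
    return sd(values) - (len(left) / n) * sd(left) - (len(right) / n) * sd(right)

def should_split(sdr_best, sdr_second, n, delta=0.05, tie_threshold=0.05):
    """Split if the best candidate beats the second-best with confidence
    (1 - delta), or if the two are so close that waiting longer is pointless."""
    epsilon = math.sqrt(math.log(1.0 / delta) / (2.0 * n))  # Hoeffding bound
    ratio = sdr_second / sdr_best  # close to 1 when the candidates are tied
    return (ratio < 1.0 - epsilon) or (epsilon < tie_threshold)

# a clearly separable toy sample: the split is confidently accepted
values = [1.0, 1.0, 1.0, 5.0, 5.0, 5.0]
sdr = sd_reduction(values, values[:3], values[3:])
print(should_split(sdr_best=sdr, sdr_second=0.1 * sdr, n=1000))  # True
```

With few observations, epsilon is large and `should_split` stays False; as n grows, either the best split separates from the runner-up or the tie-threshold clause fires, which is what prevents the learner from stalling on equally good splits.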
class Node:
    """
    Node class for our regression tree
    """

    def __init__(self, parent):
        """
        Constructor

        :param parent:
            Parent node of this Node (None for root)
        """
        self._parent = parent
        # create models to be trained on bootstraps of the data
        self._models = [np.zeros(DIM) for _ in range(J)]
        # also create a single full model to be trained on all data
        self._full_model = np.zeros(DIM)
        # no children for now
        self._left = None
        self._right = None
        # list in which we'll store uninvestigated transactions
        self._stored_transactions = []
    def child_for_feature_vector(self, feature_vector):
        """
        This should be implemented to return either self._left or self._right,
        depending on the split condition in this node and the given feature vector

        :param feature_vector:
            Feature vector on which to base child selection
        :return:
            Child (self._left or self._right)
        """
        pass

    def is_leaf(self):
        return (self._left is None and self._right is None)
    def store_transaction(self, transaction):
        """
        Sorts the given transaction into the tree rooted in this node
        and stores it.

        :param transaction:
            Transaction to be stored
        """
        if self.is_leaf():
            self._stored_transactions.append(transaction)

            if len(self._stored_transactions) % 50 == 0:
                # we'll try to perform a Semi-Supervised Learning split here,
                # as described in the paper; use our learned model (the
                # self._full_model one) to make predictions, pretend that those
                # predictions are true labels, and try to split based on that.
                #
                # we could also use any other condition here; the condition is
                # fairly arbitrary, we just want to do this "occasionally".
                pass
        else:
            x = transaction.get_feature_vector()
            self.child_for_feature_vector(x).store_transaction(transaction)
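The Semi-Supervised Learning split attempt mentioned in `store_transaction` can be sketched roughly like this. It is a toy sketch under assumptions: `sdr_for_split` is a hypothetical helper, the data is random, and a real implementation would search over many dimensions and thresholds rather than evaluate a single candidate:

```python
import numpy as np

rng = np.random.default_rng(2)
DIM_TOY = 3
full_model = rng.normal(size=DIM_TOY)      # stands in for a node's _full_model
stored = rng.normal(size=(50, DIM_TOY))    # feature vectors of stored transactions

# pseudo-label each stored transaction with the full model's fraud probability,
# pretending these predictions are true labels
pseudo_labels = 1.0 / (1.0 + np.exp(-stored @ full_model))

def sdr_for_split(features, labels, dim, threshold):
    """Standard Deviation Reduction of the (pseudo-)labels for a candidate split."""
    mask = features[:, dim] < threshold
    left, right = labels[mask], labels[~mask]
    if len(left) == 0 or len(right) == 0:
        return 0.0  # degenerate split: everything on one side
    n = len(labels)
    return labels.std() - (len(left) / n) * left.std() - (len(right) / n) * right.std()

# evaluate one candidate split: feature 0 against threshold 0.0
sdr = sdr_for_split(stored, pseudo_labels, dim=0, threshold=0.0)
```

Because the pseudo-labels come from the node's own full model rather than from the expert, this lets the tree grow on the (much larger) unlabelled portion of the stream between investigations.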