tsmcalister/cupy_eval.py

## cupy_eval.py
import cupy as cp # needs to match your cuda toolkit version (See https://docs.cupy.dev/en/stable/install.html#installing-cupy)
import numpy as np
from pandas import DataFrame

class CupyEvaluator:
    def __init__(self, santa_df: DataFrame, sleigh_weight=10):
        raw = santa_df.to_numpy()
        # add 0 padding for indexing (GiftId starts at 1)
        raw = np.vstack((np.zeros(raw.shape[1]), raw))
        # add sleigh weight to northpole position
        raw[0][1] = 90 # northpole lat
        raw[0][2] = 0 # northpole lon
        raw[0][3] = sleigh_weight
        # drop GiftId Columnn (since it is implicit in the ordering)
        self.dataset = raw[:,1:]
        self.cupy_kernel = self._init_cupy_kernel()

    def _init_cupy_kernel(self):
        return cp.RawKernel(r'''
// adjusted from: https://developer.nvidia.com/blog/fast-great-circle-distance-calculation-cuda-c/
extern "C" __global__
void weighted_haversine_distance(const double* lat1, const double* lon1, const double* lat2, const double* lon2, const double* weight, double* result) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    double dlat, dlon, c1, c2, d1, d2, a, c, t, radius;
    radius = 6371.0088;
    c1 = cospi (lat1[tid]/180.0);
    c2 = cospi (lat2[tid]/180.0);
    dlat = lat2[tid] - lat1[tid];
    dlon = lon2[tid] - lon1[tid];
    d1 = sinpi (dlat/360.0);
    d2 = sinpi (dlon/360.0);
    t = d2 * d2 * c1 * c2;
    a = d1 * d1 + t;
    c = 2.0 * asin (fmin(1.0, sqrt (a)));
    result[tid] = radius* c * weight[tid];
}
        ''', 'weighted_haversine_distance')


    def weighted_reindeer_weariness(self, instanced):
        # do some reshaping to allow for better parallelisation
        block_size = 512
        thread_size= int(np.ceil(instanced.shape[0] / 512))
        padding_size = thread_size * 512 - instanced.shape[0]
        padding = np.zeros((padding_size, instanced.shape[1]))
        padded = np.concatenate((instanced, padding)).reshape((block_size, thread_size, -1))

        # send to gpu
        lat1 = cp.asarray(padded[:, :, 0])
        lon1 = cp.asarray(padded[:, :, 1])
        lat2 = cp.asarray(padded[:, :, 2])
        lon2 = cp.asarray(padded[:, :, 3])
        cum_weight = cp.asarray(padded[:, :, 4])
        out = cp.zeros((block_size, thread_size))

        # send through kernel
        self.cupy_kernel((block_size,), (thread_size,), (lat1, lon1, lat2, lon2, cum_weight, out))

        # sum and return to cpu
        return cp.asnumpy(out.sum())

    def eval_trips_gpu(self, trip_list: np.ndarray) -> float:
        """
        Does preprocessing on the CPU.
        Then uses a GPU Kernel to parallelise weighted distance calculations
        :param ordered_gifts: 1d nd array of gift_ids representing a path through the nodes i->i+1 the north_pole is represented by id 0
        :return: sum of weighted reindeer weariness of trips
        """
        zero_idx, = np.nonzero(trip_list==0)
        trip_idx = zero_idx.reshape((-1, 1))
        trip_slices = np.concatenate((trip_idx, np.roll(trip_idx, -1, 0)), axis=1)[:-1]

        permuted = self.dataset[trip_list]
        for trip_slice in trip_slices:
            permuted[trip_slice[0]+1:trip_slice[1]+1, 2] = np.cumsum(permuted[trip_slice[0]+1:trip_slice[1]+1, 2][::-1])[::-1]

        instanced = np.concatenate((np.roll(permuted, 1, 0)[:, :2], permuted), axis=1)

        return self.weighted_reindeer_weariness(instanced[1:])

    @staticmethod
    def submission_to_trip_lists(submission: DataFrame):
        trip = []
        for trip_id in submission['TripId'].unique():
            trip.append(0)
            for gift in submission[submission['TripId'] == trip_id]['GiftId']:
                trip.append(gift)
        trip.append(0)
        return np.array(trip)
	import cupy as cp # needs to match your cuda toolkit version (See https://docs.cupy.dev/en/stable/install.html#installing-cupy)
	import numpy as np
	from pandas import DataFrame

	class CupyEvaluator:
	def __init__(self, santa_df: DataFrame, sleigh_weight=10):
	raw = santa_df.to_numpy()
	# add 0 padding for indexing (GiftId starts at 1)
	raw = np.vstack((np.zeros(raw.shape[1]), raw))
	# add sleigh weight to northpole position
	raw[0][1] = 90 # northpole lat
	raw[0][2] = 0 # northpole lon
	raw[0][3] = sleigh_weight
	# drop GiftId Columnn (since it is implicit in the ordering)
	self.dataset = raw[:,1:]
	self.cupy_kernel = self._init_cupy_kernel()

	def _init_cupy_kernel(self):
	return cp.RawKernel(r'''
	// adjusted from: https://developer.nvidia.com/blog/fast-great-circle-distance-calculation-cuda-c/
	extern "C" __global__
	void weighted_haversine_distance(const double* lat1, const double* lon1, const double* lat2, const double* lon2, const double* weight, double* result) {
	int tid = blockDim.x * blockIdx.x + threadIdx.x;
	double dlat, dlon, c1, c2, d1, d2, a, c, t, radius;
	radius = 6371.0088;
	c1 = cospi (lat1[tid]/180.0);
	c2 = cospi (lat2[tid]/180.0);
	dlat = lat2[tid] - lat1[tid];
	dlon = lon2[tid] - lon1[tid];
	d1 = sinpi (dlat/360.0);
	d2 = sinpi (dlon/360.0);
	t = d2 * d2 * c1 * c2;
	a = d1 * d1 + t;
	c = 2.0 * asin (fmin(1.0, sqrt (a)));
	result[tid] = radius* c * weight[tid];
	}
	''', 'weighted_haversine_distance')


	def weighted_reindeer_weariness(self, instanced):
	# do some reshaping to allow for better parallelisation
	block_size = 512
	thread_size= int(np.ceil(instanced.shape[0] / 512))
	padding_size = thread_size * 512 - instanced.shape[0]
	padding = np.zeros((padding_size, instanced.shape[1]))
	padded = np.concatenate((instanced, padding)).reshape((block_size, thread_size, -1))

	# send to gpu
	lat1 = cp.asarray(padded[:, :, 0])
	lon1 = cp.asarray(padded[:, :, 1])
	lat2 = cp.asarray(padded[:, :, 2])
	lon2 = cp.asarray(padded[:, :, 3])
	cum_weight = cp.asarray(padded[:, :, 4])
	out = cp.zeros((block_size, thread_size))

	# send through kernel
	self.cupy_kernel((block_size,), (thread_size,), (lat1, lon1, lat2, lon2, cum_weight, out))

	# sum and return to cpu
	return cp.asnumpy(out.sum())

	def eval_trips_gpu(self, trip_list: np.ndarray) -> float:
	"""
	Does preprocessing on the CPU.
	Then uses a GPU Kernel to parallelise weighted distance calculations
	:param ordered_gifts: 1d nd array of gift_ids representing a path through the nodes i->i+1 the north_pole is represented by id 0
	:return: sum of weighted reindeer weariness of trips
	"""
	zero_idx, = np.nonzero(trip_list==0)
	trip_idx = zero_idx.reshape((-1, 1))
	trip_slices = np.concatenate((trip_idx, np.roll(trip_idx, -1, 0)), axis=1)[:-1]

	permuted = self.dataset[trip_list]
	for trip_slice in trip_slices:
	permuted[trip_slice[0]+1:trip_slice[1]+1, 2] = np.cumsum(permuted[trip_slice[0]+1:trip_slice[1]+1, 2][::-1])[::-1]

	instanced = np.concatenate((np.roll(permuted, 1, 0)[:, :2], permuted), axis=1)

	return self.weighted_reindeer_weariness(instanced[1:])

	@staticmethod
	def submission_to_trip_lists(submission: DataFrame):
	trip = []
	for trip_id in submission['TripId'].unique():
	trip.append(0)
	for gift in submission[submission['TripId'] == trip_id]['GiftId']:
	trip.append(gift)
	trip.append(0)
	return np.array(trip)