Skip to content

Instantly share code, notes, and snippets.

@tsmcalister
Created December 15, 2020 13:34
Show Gist options
  • Save tsmcalister/e6c552a9b01a9f18a5311ff33000756b to your computer and use it in GitHub Desktop.
Save tsmcalister/e6c552a9b01a9f18a5311ff33000756b to your computer and use it in GitHub Desktop.
Cuda Haversine Distance Calculator for Santa's Stolen Sleigh
import cupy as cp # needs to match your cuda toolkit version (See https://docs.cupy.dev/en/stable/install.html#installing-cupy)
import numpy as np
from pandas import DataFrame
class CupyEvaluator:
def __init__(self, santa_df: DataFrame, sleigh_weight=10):
raw = santa_df.to_numpy()
# add 0 padding for indexing (GiftId starts at 1)
raw = np.vstack((np.zeros(raw.shape[1]), raw))
# add sleigh weight to northpole position
raw[0][1] = 90 # northpole lat
raw[0][2] = 0 # northpole lon
raw[0][3] = sleigh_weight
# drop GiftId Columnn (since it is implicit in the ordering)
self.dataset = raw[:,1:]
self.cupy_kernel = self._init_cupy_kernel()
def _init_cupy_kernel(self):
return cp.RawKernel(r'''
// adjusted from: https://developer.nvidia.com/blog/fast-great-circle-distance-calculation-cuda-c/
extern "C" __global__
void weighted_haversine_distance(const double* lat1, const double* lon1, const double* lat2, const double* lon2, const double* weight, double* result) {
int tid = blockDim.x * blockIdx.x + threadIdx.x;
double dlat, dlon, c1, c2, d1, d2, a, c, t, radius;
radius = 6371.0088;
c1 = cospi (lat1[tid]/180.0);
c2 = cospi (lat2[tid]/180.0);
dlat = lat2[tid] - lat1[tid];
dlon = lon2[tid] - lon1[tid];
d1 = sinpi (dlat/360.0);
d2 = sinpi (dlon/360.0);
t = d2 * d2 * c1 * c2;
a = d1 * d1 + t;
c = 2.0 * asin (fmin(1.0, sqrt (a)));
result[tid] = radius* c * weight[tid];
}
''', 'weighted_haversine_distance')
def weighted_reindeer_weariness(self, instanced):
# do some reshaping to allow for better parallelisation
block_size = 512
thread_size= int(np.ceil(instanced.shape[0] / 512))
padding_size = thread_size * 512 - instanced.shape[0]
padding = np.zeros((padding_size, instanced.shape[1]))
padded = np.concatenate((instanced, padding)).reshape((block_size, thread_size, -1))
# send to gpu
lat1 = cp.asarray(padded[:, :, 0])
lon1 = cp.asarray(padded[:, :, 1])
lat2 = cp.asarray(padded[:, :, 2])
lon2 = cp.asarray(padded[:, :, 3])
cum_weight = cp.asarray(padded[:, :, 4])
out = cp.zeros((block_size, thread_size))
# send through kernel
self.cupy_kernel((block_size,), (thread_size,), (lat1, lon1, lat2, lon2, cum_weight, out))
# sum and return to cpu
return cp.asnumpy(out.sum())
def eval_trips_gpu(self, trip_list: np.ndarray) -> float:
"""
Does preprocessing on the CPU.
Then uses a GPU Kernel to parallelise weighted distance calculations
:param ordered_gifts: 1d nd array of gift_ids representing a path through the nodes i->i+1 the north_pole is represented by id 0
:return: sum of weighted reindeer weariness of trips
"""
zero_idx, = np.nonzero(trip_list==0)
trip_idx = zero_idx.reshape((-1, 1))
trip_slices = np.concatenate((trip_idx, np.roll(trip_idx, -1, 0)), axis=1)[:-1]
permuted = self.dataset[trip_list]
for trip_slice in trip_slices:
permuted[trip_slice[0]+1:trip_slice[1]+1, 2] = np.cumsum(permuted[trip_slice[0]+1:trip_slice[1]+1, 2][::-1])[::-1]
instanced = np.concatenate((np.roll(permuted, 1, 0)[:, :2], permuted), axis=1)
return self.weighted_reindeer_weariness(instanced[1:])
@staticmethod
def submission_to_trip_lists(submission: DataFrame):
trip = []
for trip_id in submission['TripId'].unique():
trip.append(0)
for gift in submission[submission['TripId'] == trip_id]['GiftId']:
trip.append(gift)
trip.append(0)
return np.array(trip)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment