Created
December 15, 2020 13:34
-
-
Save tsmcalister/e6c552a9b01a9f18a5311ff33000756b to your computer and use it in GitHub Desktop.
Cuda Haversine Distance Calculator for Santa's Stolen Sleigh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import cupy as cp # needs to match your cuda toolkit version (See https://docs.cupy.dev/en/stable/install.html#installing-cupy) | |
import numpy as np | |
from pandas import DataFrame | |
class CupyEvaluator: | |
def __init__(self, santa_df: DataFrame, sleigh_weight=10): | |
raw = santa_df.to_numpy() | |
# add 0 padding for indexing (GiftId starts at 1) | |
raw = np.vstack((np.zeros(raw.shape[1]), raw)) | |
# add sleigh weight to northpole position | |
raw[0][1] = 90 # northpole lat | |
raw[0][2] = 0 # northpole lon | |
raw[0][3] = sleigh_weight | |
# drop GiftId Columnn (since it is implicit in the ordering) | |
self.dataset = raw[:,1:] | |
self.cupy_kernel = self._init_cupy_kernel() | |
def _init_cupy_kernel(self): | |
return cp.RawKernel(r''' | |
// adjusted from: https://developer.nvidia.com/blog/fast-great-circle-distance-calculation-cuda-c/ | |
extern "C" __global__ | |
void weighted_haversine_distance(const double* lat1, const double* lon1, const double* lat2, const double* lon2, const double* weight, double* result) { | |
int tid = blockDim.x * blockIdx.x + threadIdx.x; | |
double dlat, dlon, c1, c2, d1, d2, a, c, t, radius; | |
radius = 6371.0088; | |
c1 = cospi (lat1[tid]/180.0); | |
c2 = cospi (lat2[tid]/180.0); | |
dlat = lat2[tid] - lat1[tid]; | |
dlon = lon2[tid] - lon1[tid]; | |
d1 = sinpi (dlat/360.0); | |
d2 = sinpi (dlon/360.0); | |
t = d2 * d2 * c1 * c2; | |
a = d1 * d1 + t; | |
c = 2.0 * asin (fmin(1.0, sqrt (a))); | |
result[tid] = radius* c * weight[tid]; | |
} | |
''', 'weighted_haversine_distance') | |
def weighted_reindeer_weariness(self, instanced): | |
# do some reshaping to allow for better parallelisation | |
block_size = 512 | |
thread_size= int(np.ceil(instanced.shape[0] / 512)) | |
padding_size = thread_size * 512 - instanced.shape[0] | |
padding = np.zeros((padding_size, instanced.shape[1])) | |
padded = np.concatenate((instanced, padding)).reshape((block_size, thread_size, -1)) | |
# send to gpu | |
lat1 = cp.asarray(padded[:, :, 0]) | |
lon1 = cp.asarray(padded[:, :, 1]) | |
lat2 = cp.asarray(padded[:, :, 2]) | |
lon2 = cp.asarray(padded[:, :, 3]) | |
cum_weight = cp.asarray(padded[:, :, 4]) | |
out = cp.zeros((block_size, thread_size)) | |
# send through kernel | |
self.cupy_kernel((block_size,), (thread_size,), (lat1, lon1, lat2, lon2, cum_weight, out)) | |
# sum and return to cpu | |
return cp.asnumpy(out.sum()) | |
def eval_trips_gpu(self, trip_list: np.ndarray) -> float: | |
""" | |
Does preprocessing on the CPU. | |
Then uses a GPU Kernel to parallelise weighted distance calculations | |
:param ordered_gifts: 1d nd array of gift_ids representing a path through the nodes i->i+1 the north_pole is represented by id 0 | |
:return: sum of weighted reindeer weariness of trips | |
""" | |
zero_idx, = np.nonzero(trip_list==0) | |
trip_idx = zero_idx.reshape((-1, 1)) | |
trip_slices = np.concatenate((trip_idx, np.roll(trip_idx, -1, 0)), axis=1)[:-1] | |
permuted = self.dataset[trip_list] | |
for trip_slice in trip_slices: | |
permuted[trip_slice[0]+1:trip_slice[1]+1, 2] = np.cumsum(permuted[trip_slice[0]+1:trip_slice[1]+1, 2][::-1])[::-1] | |
instanced = np.concatenate((np.roll(permuted, 1, 0)[:, :2], permuted), axis=1) | |
return self.weighted_reindeer_weariness(instanced[1:]) | |
@staticmethod | |
def submission_to_trip_lists(submission: DataFrame): | |
trip = [] | |
for trip_id in submission['TripId'].unique(): | |
trip.append(0) | |
for gift in submission[submission['TripId'] == trip_id]['GiftId']: | |
trip.append(gift) | |
trip.append(0) | |
return np.array(trip) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment