@bnyeggen
Created December 16, 2013 04:24
Geospatial recommendation kernel
/*
OpenCL code for a geospatial recommender. The underlying algorithm is:
- Each user has a set of XY points
- Similarity between user A and user B is defined as the inverse mean minimum distance to each of user A's points, from any of user B's points. This is not symmetric. E.g., if user A has points at (0,0) and (10,10), and user B has a point at (4,4), similarity of B to A will be 1 / ((sqrt(4^2 + 4^2) + sqrt(6^2 + 6^2)) / 2), and similarity of A to B will be 1 / (sqrt(4^2 + 4^2)). It is possible to define other similarity measures along the same lines (e.g. inverse mean minimum squared distance) that will give different results. We're all about the heuristics.
- For evaluating the predicted strength at a new XY location for a given main user:
  - Calculate an inverse-square dropoff from every point (as if one were calculating gravity in an N-body simulation). Again, one could use some other exponent with better empirical foundation. This can be optimized by excluding points beyond a certain distance.
  - Sum these "forces" in proportion to the associated users' similarities to the main user in question. This can be optimized by excluding points associated with users below a certain threshold similarity.
Beyond "looks good" heuristics, one can use a quasi-EM algorithm to derive the exponents by looking at what gives the smallest deviation between predictions and reality.
Similarity calculation is the intensive part; for evaluating predicted recommendation strength a simple geospatial index for finding the right points & pre-calculated similarities gives you great performance. Fortunately, the similarity calculation is essentially round-robin & parallelizable, albeit with some data-dependent loop sizes, so we use OpenCL. Each thread processes a single user, and a work-group consists of a bundle of users. Work-groups / work-items are 1D.
*/
__kernel void calculate_similarities(
        //Number of points for the main user
        const int n_main_user_points,
        //Location of the main user's XY coordinate pairs in constant memory
        __constant const float2* main_user_data,
        //Number of total users (not users' points)
        const int n_total_users,
        //Location of users' POI slice info - the first component is an offset, and the second is a length, into poi_data
        __global const uint2* slice_info,
        //All users' XY coordinate pairs
        __global const float2* poi_data,
        //Output - will store the inverse mean minimum distance between a particular user & the main user, in the same order as slice_info
        __global float* similarities,
        //Cache for minimum distance to each of the main user's points, later averaged. Arranged
        //[user0/main_pt0 user1/main_pt0 user2/main_pt0 ...padding... user0/main_pt1 user1/main_pt1 user2/main_pt1 ... ]
        __local float* distance_cache){
    int lid = get_local_id(0);
    //Round up to nearest multiple of 32 to avoid bank conflicts in shared memory.
    int loc_size_32 = (get_local_size(0) + 31) / 32 * 32;
    //Initialize the local cache to the max finite 32-bit floating point value.
    for(int i=0; i<n_main_user_points; i++) distance_cache[lid + i * loc_size_32] = 0x1.fffffeP+127f;
    uint2 this_user_data = slice_info[get_global_id(0)];
    //Data-dependent loop here, unfortunately.
    //But later iterations in threads with short data should be no-ops, not true divergence.
    //One could try to get minimum variation in loop sizes by shuffling the users so each work-group contains a random sample.
    //Each iteration (ie, for each 2 of this user's points) is 1 coalesced memory access.
    for(uint j=0; j<this_user_data.s1; j+=2){
        //Grab 4 floats total of data to fill a 128-bit memory request
        float2 p1 = poi_data[this_user_data.s0 + j];
        float2 p2 = (j+1<this_user_data.s1) ? poi_data[this_user_data.s0 + j + 1] : (float2)(0.0f, 0.0f);
        //Per main user point, this does 4 differences, 4 mults, 2 adds, 2 sqrts, 2 mins
        for(int i=0; i<n_main_user_points; i++){
            int dist_cache_loc = lid + i * loc_size_32;
            float d1 = distance(main_user_data[i], p1);
            //The dummy point p2 gets a max-float distance so it never wins the min
            float d2 = (j+1<this_user_data.s1) ? distance(main_user_data[i], p2) : 0x1.fffffeP+127f;
            float new_min = fmin(distance_cache[dist_cache_loc], d1);
            distance_cache[dist_cache_loc] = fmin(new_min, d2);
        }
    }
    //Sync to let other threads catch up - not necessary to avoid memory conflicts, but avoids true divergence
    barrier(CLK_LOCAL_MEM_FENCE);
    //Sum this thread's per-main-point minimum distances from the local cache
    float mean = distance_cache[lid];
    for(int i=1; i<n_main_user_points; i++) mean += distance_cache[lid + i * loc_size_32];
    //Write the inverse mean as output - these writes should be coalesced.
    //mean holds the sum at this point, so n / mean = 1 / (mean / n), the inverse mean.
    similarities[get_global_id(0)] = n_main_user_points / mean;
}