PWhiddy/stamps.cpp

## stamps.cpp
// original function, generates single trajectory
RawImage KBMOSearch::stackedStamps(trajectory t, int radius, std::vector<RawImage*> imgs)
{
	if (radius<0) throw std::runtime_error("stamp radius must be at least 0");
	int dim = radius*2+1;
	RawImage stamp(dim, dim);
	std::vector<float> times = stack.getTimes();
	for (int i=0; i<imgs.size(); ++i)
	{
		for (int x=0; x<dim; ++x)
		{
			for (int y=0; y<dim; ++y)
			{
				float pixVal = imgs[i]->getPixelInterp(
						t.x + times[i] * t.xVel + static_cast<float>(x-radius),
						t.y + times[i] * t.yVel + static_cast<float>(y-radius));
				if (pixVal == NO_DATA) pixVal = 0.0;
				stamp.addToPixel(static_cast<int>(x),static_cast<int>(y), pixVal);
			}
		}
	}
	return stamp;
}

// batch (this is just the general idea jotted down, imagine it as untested psuedo-code. Not tested at all!)
std::vector<RawImage> KBMOSearch::stackedStampsBatch(std::vector<trajectory> t, int radius, std::vector<RawImage*> imgs)
{
	if (radius<0) throw std::runtime_error("stamp radius must be at least 0");
	int dim = radius*2+1;
  std::vector<RawImage> stamps;
  stamps.reserve(t.size());
  std::vector<float> times = stack.getTimes();

  #pragma omp parallel for
  for (int s=0; s<t.size(); s++) {
    stamps[i] = RawImage(dim,dim);
    for (int i=0; i<imgs.size(); ++i)
    {
      for (int x=0; x<dim; ++x)
      {
        for (int y=0; y<dim; ++y)
        {
          float pixVal = imgs[i]->getPixelInterp(
              t.x + times[i] * t.xVel + static_cast<float>(x-radius),
              t.y + times[i] * t.yVel + static_cast<float>(y-radius));
          if (pixVal == NO_DATA) pixVal = 0.0;
          stamps[i].addToPixel(static_cast<int>(x),static_cast<int>(y), pixVal);
        }
      }
    }
  }
	return stamps;
}

// gpu batch (also treat this only as a rough guideline)
std::vector<RawImage> KBMOSearch::stackedStampsBatchGPU(std::vector<trajectory> t, int radius, std::vector<RawImage*> imgs)
{
	if (radius<0) throw std::runtime_error("stamp radius must be at least 0");
	int dim = radius*2+1;
  std::vector<RawImage> stamps;
  stamps.reserve(t.size());
  std::vector<float> times = stack.getTimes();
  // 1. allocate GPU memory for trajctories, images, and stamps (cudaMalloc)
  // 2. Then copy images and trajectories into that gpu memory (cudaMemcpy)
  // 3. create thread blocks ( if the stamps are larger than 8*8,
  // making each block have the dimensions of the stamp would probably work nice)
  // 4. make a 1D grid that just has one threadblock for each stamp
  // 5. launch a kernel that stacks together a single pixel (a block of these together makes one stamp),
  // and writes the result into the gpu stamp memory allocated earlier
  // stackStampKernel<<<grid,threads>>>(gpuTrajs, gpuImages, gpuStamps, radius, imageDims... imageCount...)
  // 6. copy the stamps back to cpu
  // 7. free gpu memory
  // 8. return results
  // the memory copying could be simpler using an api that handles it automatically, but I haven't used those features before

	return stamps;
}
	// original function, generates single trajectory
	RawImage KBMOSearch::stackedStamps(trajectory t, int radius, std::vector<RawImage*> imgs)
	{
	if (radius<0) throw std::runtime_error("stamp radius must be at least 0");
	int dim = radius*2+1;
	RawImage stamp(dim, dim);
	std::vector<float> times = stack.getTimes();
	for (int i=0; i<imgs.size(); ++i)
	{
	for (int x=0; x<dim; ++x)
	{
	for (int y=0; y<dim; ++y)
	{
	float pixVal = imgs[i]->getPixelInterp(
	t.x + times[i] * t.xVel + static_cast<float>(x-radius),
	t.y + times[i] * t.yVel + static_cast<float>(y-radius));
	if (pixVal == NO_DATA) pixVal = 0.0;
	stamp.addToPixel(static_cast<int>(x),static_cast<int>(y), pixVal);
	}
	}
	}
	return stamp;
	}

	// batch (this is just the general idea jotted down, imagine it as untested psuedo-code. Not tested at all!)
	std::vector<RawImage> KBMOSearch::stackedStampsBatch(std::vector<trajectory> t, int radius, std::vector<RawImage*> imgs)
	{
	if (radius<0) throw std::runtime_error("stamp radius must be at least 0");
	int dim = radius*2+1;
	std::vector<RawImage> stamps;
	stamps.reserve(t.size());
	std::vector<float> times = stack.getTimes();

	#pragma omp parallel for
	for (int s=0; s<t.size(); s++) {
	stamps[i] = RawImage(dim,dim);
	for (int i=0; i<imgs.size(); ++i)
	{
	for (int x=0; x<dim; ++x)
	{
	for (int y=0; y<dim; ++y)
	{
	float pixVal = imgs[i]->getPixelInterp(
	t.x + times[i] * t.xVel + static_cast<float>(x-radius),
	t.y + times[i] * t.yVel + static_cast<float>(y-radius));
	if (pixVal == NO_DATA) pixVal = 0.0;
	stamps[i].addToPixel(static_cast<int>(x),static_cast<int>(y), pixVal);
	}
	}
	}
	}
	return stamps;
	}

	// gpu batch (also treat this only as a rough guideline)
	std::vector<RawImage> KBMOSearch::stackedStampsBatchGPU(std::vector<trajectory> t, int radius, std::vector<RawImage*> imgs)
	{
	if (radius<0) throw std::runtime_error("stamp radius must be at least 0");
	int dim = radius*2+1;
	std::vector<RawImage> stamps;
	stamps.reserve(t.size());
	std::vector<float> times = stack.getTimes();
	// 1. allocate GPU memory for trajctories, images, and stamps (cudaMalloc)
	// 2. Then copy images and trajectories into that gpu memory (cudaMemcpy)
	// 3. create thread blocks ( if the stamps are larger than 8*8,
	// making each block have the dimensions of the stamp would probably work nice)
	// 4. make a 1D grid that just has one threadblock for each stamp
	// 5. launch a kernel that stacks together a single pixel (a block of these together makes one stamp),
	// and writes the result into the gpu stamp memory allocated earlier
	// stackStampKernel<<<grid,threads>>>(gpuTrajs, gpuImages, gpuStamps, radius, imageDims... imageCount...)
	// 6. copy the stamps back to cpu
	// 7. free gpu memory
	// 8. return results
	// the memory copying could be simpler using an api that handles it automatically, but I haven't used those features before

	return stamps;
	}