Skip to content

Instantly share code, notes, and snippets.

@ian-ross
Last active August 29, 2015 14:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ian-ross/6284e75dab0a9b28bd6d to your computer and use it in GitHub Desktop.
Save ian-ross/6284e75dab0a9b28bd6d to your computer and use it in GitHub Desktop.
Haskell data analysis: CUDA speedups for KDE
module Main where
import Numeric
import Control.Applicative ((<$>))
import System.FilePath
import Data.NetCDF
import Data.NetCDF.HMatrix
import Data.NetCDF.Vector
import Foreign.C
import qualified Data.Map as M
import qualified Data.Vector.Storable as SV
import qualified Data.Vector.Storable.Mutable as SVM
import Numeric.LinearAlgebra.HMatrix
import qualified Data.ByteString.Char8 as B
import qualified Foreign.CUDA.Driver as CUDA
type HMatrixRet a = IO (Either NcError (HMatrix a))
basedir, workdir :: FilePath
basedir = "/big/project-storage/haskell-data-analysis/atmos-non-diffusive"
workdir = basedir </> "spherical-pdf"
type V = SV.Vector Double
-- Use a regular 2.5 deg. x 2.5 deg. theta/phi grid on unit sphere.
ntheta, nphi :: Int
nphi = 2 * 360 `div` 5 ; ntheta = 2 * 180 `div` 5
-- Integration steps.
dtheta, dphi :: Double
dphi = 2.0 * pi / fromIntegral nphi ; dtheta = pi / fromIntegral ntheta
main :: IO ()
main = do
-- CUDA initialisation.
CUDA.initialise []
dev <- CUDA.device 0
ctx <- CUDA.create dev []
ptx <- B.readFile "make_pdf.ptx"
CUDA.JITResult time log mdl <- CUDA.loadDataEx ptx []
fun <- CUDA.getFun mdl "make_pdf_kernel"
-- Open projected points data file for input.
Right innc <- openFile $ workdir </> "sphere-proj.nc"
let Just ntime = ncDimLength <$> ncDim innc "time"
let (Just projvar) = ncVar innc "proj"
Right (HMatrix projsin) <-
getA innc projvar [0, 0] [ntime, 2] :: HMatrixRet CDouble
-- Convert projections to 3-D points, flatten to vector and allocate
-- on device.
let nData = rows projsin
dataPts = SV.concat $ map projToPt $ toRows $ cmap realToFrac projsin
dDataPts <- CUDA.mallocArray $ 3 * nData
SV.unsafeWith dataPts $ \p -> CUDA.pokeArray (3 * nData) p dDataPts
-- Calculate kernel values for each grid point/data point
-- combination and accumulate into grid.
dPdf <- CUDA.mallocArray $ ntheta * nphi :: IO (CUDA.DevicePtr Double)
let tx = 32 ; ty = 16
blockSize = (tx, ty, 1)
gridSize = ((nphi - 1) `div` tx + 1, (ntheta - 1) `div` ty + 1, 1)
CUDA.launchKernel fun gridSize blockSize 0 Nothing
[CUDA.IArg (fromIntegral nData), CUDA.VArg dDataPts, CUDA.VArg dPdf]
CUDA.sync
res <- SVM.new (ntheta * nphi)
SVM.unsafeWith res $ \p -> CUDA.peekArray (ntheta * nphi) dPdf p
unnormv <- SV.unsafeFreeze res
let unnorm = reshape nphi unnormv
-- Normalise.
let ths = [(0.5 + fromIntegral i) * dtheta | i <- [0..ntheta-1] ]
phs = [fromIntegral i * dphi | i <- [0..nphi-1] ]
sinths = map sin ths
doone x v = sumElements $ cmap (x *) v
int = dtheta * dphi * sum (zipWith doone sinths (toRows unnorm))
norm = cmap (realToFrac . (/ int)) unnorm :: Matrix CDouble
norm1 = cmap ((/ int)) unnorm
tst = dtheta * dphi * sum (zipWith doone sinths (toRows norm1))
-- Set up output file.
let outthdim = NcDim "theta" ntheta False
outthvar = NcVar "theta" NcDouble [outthdim] M.empty
outphdim = NcDim "phi" nphi False
outphvar = NcVar "phi" NcDouble [outphdim] M.empty
outpdfvar = NcVar "pdf" NcDouble [outthdim, outphdim] M.empty
outncinfo =
emptyNcInfo (workdir </> "sphere-pdf.nc") #
addNcDim outthdim # addNcDim outphdim #
addNcVar outthvar # addNcVar outphvar #
addNcVar outpdfvar
flip (withCreateFile outncinfo) (putStrLn . ("ERROR: " ++) . show) $
\outnc -> do
-- Write coordinate variable values.
put outnc outthvar $
(SV.fromList $ map realToFrac ths :: SV.Vector CDouble)
put outnc outphvar $
(SV.fromList $ map realToFrac phs :: SV.Vector CDouble)
put outnc outpdfvar $ HMatrix norm
return ()
-- Convert (theta, phi) coordinates to 3-D unit vector.
projToPt :: Vector Double -> V
projToPt p = sphPt (p ! 0) (p ! 1)
-- Convert (theta, phi) coordinates to 3-D unit vector.
sphPt :: Double -> Double -> V
sphPt th ph = SV.fromList [sin th * cos ph, sin th * sin ph, cos th]
#include <cuda.h>
#include <cmath>
// Use a regular 2.5 deg. x 2.5 deg. theta/phi grid on unit sphere.
const int Nphi = 2 * 360 / 5, Ntheta = 2 * 180 / 5;
// Integration steps.
const double dphi = 2.0 * M_PI / Nphi, dtheta = M_PI / Ntheta;
// Density kernel bandwidth.
const double bandwidth = M_PI / 6.0;
extern "C"
__global__ void make_pdf_kernel
(unsigned int D, const double * __restrict__ d_data, double *d_pdf)
{
unsigned int c = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int r = blockIdx.y * blockDim.y + threadIdx.y;
double th = (0.5 + r) * dtheta, ph = c * dphi;
double gx = sin(th) * cos(ph), gy = sin(th) * sin(ph), gz = cos(th);
if (r > Ntheta || c > Nphi) return;
double sum = 0.0;
for (unsigned int i = 0; i < D; ++i) {
double dx = d_data[3 * i], dy = d_data[3 * i + 1], dz = d_data[3 * i + 2];
double u = acos(dx * gx + dy * gy + dz * gz) / bandwidth;
if (u < 1) sum += 1 - u * u;
}
d_pdf[r * Nphi + c] = sum;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment