Last active
August 29, 2015 14:14
-
-
Save ian-ross/6284e75dab0a9b28bd6d to your computer and use it in GitHub Desktop.
Haskell data analysis: CUDA speedups for KDE
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Main where | |
import Numeric | |
import Control.Applicative ((<$>)) | |
import System.FilePath | |
import Data.NetCDF | |
import Data.NetCDF.HMatrix | |
import Data.NetCDF.Vector | |
import Foreign.C | |
import qualified Data.Map as M | |
import qualified Data.Vector.Storable as SV | |
import qualified Data.Vector.Storable.Mutable as SVM | |
import Numeric.LinearAlgebra.HMatrix | |
import qualified Data.ByteString.Char8 as B | |
import qualified Foreign.CUDA.Driver as CUDA | |
type HMatrixRet a = IO (Either NcError (HMatrix a)) | |
basedir, workdir :: FilePath | |
basedir = "/big/project-storage/haskell-data-analysis/atmos-non-diffusive" | |
workdir = basedir </> "spherical-pdf" | |
type V = SV.Vector Double | |
-- Use a regular 2.5 deg. x 2.5 deg. theta/phi grid on unit sphere. | |
ntheta, nphi :: Int | |
nphi = 2 * 360 `div` 5 ; ntheta = 2 * 180 `div` 5 | |
-- Integration steps. | |
dtheta, dphi :: Double | |
dphi = 2.0 * pi / fromIntegral nphi ; dtheta = pi / fromIntegral ntheta | |
main :: IO () | |
main = do | |
-- CUDA initialisation. | |
CUDA.initialise [] | |
dev <- CUDA.device 0 | |
ctx <- CUDA.create dev [] | |
ptx <- B.readFile "make_pdf.ptx" | |
CUDA.JITResult time log mdl <- CUDA.loadDataEx ptx [] | |
fun <- CUDA.getFun mdl "make_pdf_kernel" | |
-- Open projected points data file for input. | |
Right innc <- openFile $ workdir </> "sphere-proj.nc" | |
let Just ntime = ncDimLength <$> ncDim innc "time" | |
let (Just projvar) = ncVar innc "proj" | |
Right (HMatrix projsin) <- | |
getA innc projvar [0, 0] [ntime, 2] :: HMatrixRet CDouble | |
-- Convert projections to 3-D points, flatten to vector and allocate | |
-- on device. | |
let nData = rows projsin | |
dataPts = SV.concat $ map projToPt $ toRows $ cmap realToFrac projsin | |
dDataPts <- CUDA.mallocArray $ 3 * nData | |
SV.unsafeWith dataPts $ \p -> CUDA.pokeArray (3 * nData) p dDataPts | |
-- Calculate kernel values for each grid point/data point | |
-- combination and accumulate into grid. | |
dPdf <- CUDA.mallocArray $ ntheta * nphi :: IO (CUDA.DevicePtr Double) | |
let tx = 32 ; ty = 16 | |
blockSize = (tx, ty, 1) | |
gridSize = ((nphi - 1) `div` tx + 1, (ntheta - 1) `div` ty + 1, 1) | |
CUDA.launchKernel fun gridSize blockSize 0 Nothing | |
[CUDA.IArg (fromIntegral nData), CUDA.VArg dDataPts, CUDA.VArg dPdf] | |
CUDA.sync | |
res <- SVM.new (ntheta * nphi) | |
SVM.unsafeWith res $ \p -> CUDA.peekArray (ntheta * nphi) dPdf p | |
unnormv <- SV.unsafeFreeze res | |
let unnorm = reshape nphi unnormv | |
-- Normalise. | |
let ths = [(0.5 + fromIntegral i) * dtheta | i <- [0..ntheta-1] ] | |
phs = [fromIntegral i * dphi | i <- [0..nphi-1] ] | |
sinths = map sin ths | |
doone x v = sumElements $ cmap (x *) v | |
int = dtheta * dphi * sum (zipWith doone sinths (toRows unnorm)) | |
norm = cmap (realToFrac . (/ int)) unnorm :: Matrix CDouble | |
norm1 = cmap ((/ int)) unnorm | |
tst = dtheta * dphi * sum (zipWith doone sinths (toRows norm1)) | |
-- Set up output file. | |
let outthdim = NcDim "theta" ntheta False | |
outthvar = NcVar "theta" NcDouble [outthdim] M.empty | |
outphdim = NcDim "phi" nphi False | |
outphvar = NcVar "phi" NcDouble [outphdim] M.empty | |
outpdfvar = NcVar "pdf" NcDouble [outthdim, outphdim] M.empty | |
outncinfo = | |
emptyNcInfo (workdir </> "sphere-pdf.nc") # | |
addNcDim outthdim # addNcDim outphdim # | |
addNcVar outthvar # addNcVar outphvar # | |
addNcVar outpdfvar | |
flip (withCreateFile outncinfo) (putStrLn . ("ERROR: " ++) . show) $ | |
\outnc -> do | |
-- Write coordinate variable values. | |
put outnc outthvar $ | |
(SV.fromList $ map realToFrac ths :: SV.Vector CDouble) | |
put outnc outphvar $ | |
(SV.fromList $ map realToFrac phs :: SV.Vector CDouble) | |
put outnc outpdfvar $ HMatrix norm | |
return () | |
-- Convert (theta, phi) coordinates to 3-D unit vector. | |
projToPt :: Vector Double -> V | |
projToPt p = sphPt (p ! 0) (p ! 1) | |
-- Convert (theta, phi) coordinates to 3-D unit vector. | |
sphPt :: Double -> Double -> V | |
sphPt th ph = SV.fromList [sin th * cos ph, sin th * sin ph, cos th] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cuda.h> | |
#include <cmath> | |
// Use a regular 2.5 deg. x 2.5 deg. theta/phi grid on unit sphere. | |
const int Nphi = 2 * 360 / 5, Ntheta = 2 * 180 / 5; | |
// Integration steps. | |
const double dphi = 2.0 * M_PI / Nphi, dtheta = M_PI / Ntheta; | |
// Density kernel bandwidth. | |
const double bandwidth = M_PI / 6.0; | |
extern "C" | |
__global__ void make_pdf_kernel | |
(unsigned int D, const double * __restrict__ d_data, double *d_pdf) | |
{ | |
unsigned int c = blockIdx.x * blockDim.x + threadIdx.x; | |
unsigned int r = blockIdx.y * blockDim.y + threadIdx.y; | |
double th = (0.5 + r) * dtheta, ph = c * dphi; | |
double gx = sin(th) * cos(ph), gy = sin(th) * sin(ph), gz = cos(th); | |
if (r > Ntheta || c > Nphi) return; | |
double sum = 0.0; | |
for (unsigned int i = 0; i < D; ++i) { | |
double dx = d_data[3 * i], dy = d_data[3 * i + 1], dz = d_data[3 * i + 2]; | |
double u = acos(dx * gx + dy * gy + dz * gz) / bandwidth; | |
if (u < 1) sum += 1 - u * u; | |
} | |
d_pdf[r * Nphi + c] = sum; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment