Skip to content

Instantly share code, notes, and snippets.

@piotrMocz
Created August 4, 2014 19:22
Show Gist options
  • Save piotrMocz/a4c2a1c117f504840070 to your computer and use it in GitHub Desktop.
Save piotrMocz/a4c2a1c117f504840070 to your computer and use it in GitHub Desktop.
This is more or less what despair looks like in code.
/**
 * Householder bidiagonalization of an m x n matrix A (requires m >= n):
 * produces (Q, B, P) such that B is (numerically) bidiagonal and Q, P are
 * accumulated products of Householder reflectors.
 *
 * This is the debugging variant: the per-step Householder computation is
 * deliberately inlined twice (once for column elimination, once for row
 * elimination) instead of calling custom kernels, and every CUDA/cuBLAS call
 * is followed by a device sync plus a status print so failures surface
 * immediately.
 *
 * @param A      input matrix on the device; must satisfy A.rows >= A.cols
 * @param handle cuBLAS handle used for all BLAS calls
 * @return (Q, B, P); when A.rows < A.cols no work is done and
 *         (null, A, null) is returned instead
 */
def bidiagDoubleInline(A: CuMatrix[Double])(implicit handle: cublasHandle): (CuMatrix[Double], CuMatrix[Double], CuMatrix[Double]) = {
  if (A.rows < A.cols) {
    println("Number of rows of matrix A cannot be smaller than the number of columns.")
    return (null, A, null)
  }

  JCublas2.setExceptionsEnabled(true)

  val m = A.rows
  val n = A.cols

  // working copy of A plus the accumulated orthogonal factors
  var d_A = CuMatrix.create[Double](m, n); d_A := A
  var d_Q = CuMatrix.eye[Double](m)
  var d_P = CuMatrix.eye[Double](n)

  // identities used to reset the per-step reflector matrices
  val eyeDoubleM = CuMatrix.eye[Double](m)
  // FIX: d_P1 is n x n, so it must be reset from an n x n identity. The
  // original code reused the m x m identity here, which is wrong whenever
  // m != n.
  val eyeDoubleN = CuMatrix.eye[Double](n)

  // host-side scalars for cuBLAS calls (host pointer mode)
  val onePtr = Pointer.pointerToDouble(1.0)
  val one = onePtr.toCuPointer
  val minusTwoPtr = Pointer.pointerToDouble(-2.0)
  val minusTwo = minusTwoPtr.toCuPointer

  val normArr = Pointer.pointerToDouble(0.0) // receives vector norms from cublasDnrm2
  val v0Arr = Pointer.pointerToDouble(0.0)   // receives the leading element of the reflector

  val d_v = CuMatrix.create[Double](m, 1)  // here we will store the householder vectors
  val d_Q1 = CuMatrix.create[Double](m, m) // left reflector for the current step
  val d_P1 = CuMatrix.create[Double](n, n) // right reflector for the current step

  // a square matrix needs one elimination step fewer than a tall one
  val steps = if (m == n) n - 1 else n
  var err = 0
  JCuda.cudaDeviceSynchronize()

  cfor(0)(_ < steps, _ + 1) { i => {
    // ---------- eliminate column i below the diagonal --------------------
    var len = m
    var stride = 1 // column elements are contiguous (column-major storage)
    var offset = i
    println("(((" + i + ", " + i + ")))")

    // d_v(offset..) := d_A(i.., i)
    err = JCublas2.cublasDcopy(handle, len - offset,
      d_A.offsetPointer.withByteOffset(d_A.linearIndex(i, i) * d_A.elemSize), stride,
      d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize), 1)
    JCuda.cudaDeviceSynchronize()
    if (err != 0) println("[ERROR] householderMat -- copy")
    println("Copied: " + d_v)

    // normArr(0) = |d_v|
    err = JCublas2.cublasDnrm2(handle, len - offset,
      d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize), 1,
      normArr.toCuPointer)
    JCuda.cudaDeviceSynchronize()
    if (err != 0) println("[ERROR] householderMat -- nrm")
    println("Norm: " + normArr(0))

    // v0Arr(0) = d_v(offset)  -- single-element device-to-host copy
    err = JCuda.cudaMemcpy2D(v0Arr.toCuPointer, d_v.majorStride * d_v.elemSize,
      d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize),
      d_v.majorStride * d_v.elemSize, d_v.elemSize, 1, cudaMemcpyKind.cudaMemcpyDeviceToHost)
    JCuda.cudaDeviceSynchronize()
    if (err != 0) println("[ERROR] householderMat -- memcpy")
    println("v_0: " + v0Arr(0))

    // v = x - |x|*e1, applied to the leading element only.
    // NOTE(review): the fixed minus sign can cancel catastrophically when
    // x(0) > 0; the stable form uses x(0) + sign(x(0))*|x| -- confirm intent.
    v0Arr(0) -= normArr(0)
    println("v_0 --> new: " + v0Arr(0))

    // write the updated leading element back into d_v on the device
    err = JCuda.cudaMemcpy2D(d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize),
      d_v.majorStride * d_v.elemSize, v0Arr.toCuPointer,
      d_v.majorStride * d_v.elemSize, d_v.elemSize, 1, cudaMemcpyKind.cudaMemcpyHostToDevice)
    JCuda.cudaDeviceSynchronize()
    if (err != 0) println("[ERROR] householderMat -- memcpy")
    println("d_v after memcpy: " + d_v)

    // norm of the updated vector v
    err = JCublas2.cublasDnrm2(handle, len - offset,
      d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize), 1,
      normArr.toCuPointer)
    JCuda.cudaDeviceSynchronize()
    if (err != 0) println("[ERROR] householderMat -- nrm2")
    println("New norm: " + normArr(0))

    // NOTE(review): divides by zero (-> NaN/Inf) if the subcolumn is already
    // zero; a guard would change numerics, so it is only flagged here.
    normArr(0) = 1.0 / normArr(0)
    println("Newer norm: " + normArr(0))

    // d_v = d_v / |d_v|
    err = JCublas2.cublasDscal(handle, len - offset, normArr.toCuPointer,
      d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize), 1)
    JCuda.cudaDeviceSynchronize()
    if (err != 0) println("[ERROR] householderMat -- scal")
    println("d_v after scal: " + d_v)

    // reset d_Q1 to the identity, then d_Q1 = I - 2*d_v*d_v'
    d_Q1 := eyeDoubleM
    err = DgemmNT(len - offset, len - offset, 1, minusTwo, d_v, offset, 0, d_v, offset, 0,
      one, d_Q1, offset, offset)
    JCuda.cudaDeviceSynchronize()
    if (err != 0) println("[ERROR] householderMat -- dgemm (I - 2*d_v*d_v.t)")
    println("d_Q1: " + d_Q1)
    println()

    // apply the reflector from the left and accumulate it into Q.
    // (FIX: the stale `if (err != 0)` checks that followed these operator
    // multiplications were removed -- the `*` calls never assign `err`, so
    // the checks only re-reported the previous DgemmNT status.)
    d_A = d_Q1 * d_A
    JCuda.cudaDeviceSynchronize()
    println("After elimination (col):\n" + d_A)

    d_Q = d_Q * d_Q1
    JCuda.cudaDeviceSynchronize()

    if (i < n - 2) {
      // ---------- eliminate row i to the right of the superdiagonal ------
      len = n
      stride = d_A.majorStride // row elements are majorStride apart (column-major)
      offset = i + 1; println("Offset: " + offset)
      println("(((" + i + ", " + (i + 1).toString + ")))")

      // d_v(offset..) := d_A(i, i+1..)
      err = JCublas2.cublasDcopy(handle, len - offset,
        d_A.offsetPointer.withByteOffset(d_A.linearIndex(i, i + 1) * d_A.elemSize), stride,
        d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize), 1)
      JCuda.cudaDeviceSynchronize()
      if (err != 0) println("[ERROR] householderMat -- copy")
      println("Copied: " + d_v)

      // normArr(0) = |d_v|
      err = JCublas2.cublasDnrm2(handle, len - offset,
        d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize), 1,
        normArr.toCuPointer)
      JCuda.cudaDeviceSynchronize()
      if (err != 0) println("[ERROR] householderMat -- nrm")
      println("Norm: " + normArr(0))

      // v0Arr(0) = d_v(offset)
      err = JCuda.cudaMemcpy2D(v0Arr.toCuPointer, d_v.majorStride * d_v.elemSize,
        d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize),
        d_v.majorStride * d_v.elemSize, d_v.elemSize, 1, cudaMemcpyKind.cudaMemcpyDeviceToHost)
      JCuda.cudaDeviceSynchronize()
      if (err != 0) println("[ERROR] householderMat -- memcpy")
      println("v_0: " + v0Arr(0))

      // v = x - |x|*e1 (same caveat as in the column step)
      v0Arr(0) -= normArr(0)
      println("v_0 --> new: " + v0Arr(0))

      // write the updated leading element back to the device
      err = JCuda.cudaMemcpy2D(d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize),
        d_v.majorStride * d_v.elemSize, v0Arr.toCuPointer,
        d_v.majorStride * d_v.elemSize, d_v.elemSize, 1, cudaMemcpyKind.cudaMemcpyHostToDevice)
      JCuda.cudaDeviceSynchronize()
      if (err != 0) println("[ERROR] householderMat -- memcpy")
      println("d_v after memcpy: " + d_v)

      // norm of the updated vector v
      err = JCublas2.cublasDnrm2(handle, len - offset,
        d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize), 1,
        normArr.toCuPointer)
      JCuda.cudaDeviceSynchronize()
      if (err != 0) println("[ERROR] householderMat -- nrm2")
      println("New norm: " + normArr(0))

      normArr(0) = 1.0 / normArr(0)
      println("Newer norm: " + normArr(0))

      // d_v = d_v / |d_v|
      err = JCublas2.cublasDscal(handle, len - offset, normArr.toCuPointer,
        d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize), 1)
      JCuda.cudaDeviceSynchronize()
      if (err != 0) println("[ERROR] householderMat -- scal")
      println("d_v after scal: " + d_v)

      // reset d_P1 to the (n x n) identity, then d_P1 = I - 2*d_v*d_v'
      // (FIX: was `d_P1 := eyeDoubleM`, an m x m identity)
      d_P1 := eyeDoubleN
      err = DgemmNT(len - offset, len - offset, 1, minusTwo, d_v, offset, 0, d_v, offset, 0,
        one, d_P1, offset, offset)
      JCuda.cudaDeviceSynchronize()
      if (err != 0) println("[ERROR] householderMat -- dgemm (I - 2*d_v*d_v.t)")
      println("d_P1: " + d_P1)
      println()

      // apply the reflector from the right and accumulate it into P
      d_A = d_A * d_P1
      JCuda.cudaDeviceSynchronize()
      println("After elimination (row):\n" + d_A)

      d_P = d_P1 * d_P
      JCuda.cudaDeviceSynchronize()
    }
  }}

  // TODO(review): d_A still carries roundoff noise off the two diagonals;
  // the custom zero-out kernels that cleaned it up are disabled in this
  // variant, so callers see a numerically-but-not-exactly bidiagonal matrix.
  (d_Q, d_A, d_P)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment