Created
August 4, 2014 19:22
-
-
Save piotrMocz/a4c2a1c117f504840070 to your computer and use it in GitHub Desktop.
This is more or less what despair looks like in code.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Bidiagonalization with removed calls to my own kernels, inlined functions for
 * calculating the householder vectors and a few other minor changes.
 *
 * Reduces A (m x n, with m >= n) toward bidiagonal form B via alternating
 * Householder reflections: one per iteration eliminates below-diagonal entries
 * of a column (applied from the left, accumulated into Q) and, while rows
 * remain, one eliminates right-of-superdiagonal entries of a row (applied from
 * the right, accumulated into P). All heavy work runs on the GPU via cuBLAS.
 *
 * @param A      input matrix on the device; must satisfy A.rows >= A.cols
 * @param handle cuBLAS handle used for every BLAS call
 * @return (Q, B, P) where B = the transformed copy of A (approximately
 *         bidiagonal — roundoff leaves near-zero residue, see note at the end);
 *         on invalid input returns (null, A, null) instead of throwing
 */
def bidiagDoubleInline(A: CuMatrix[Double])(implicit handle: cublasHandle): (CuMatrix[Double], CuMatrix[Double], CuMatrix[Double]) = {
  if (A.rows < A.cols) {
    // Precondition violated: report and bail out with nulls (callers must check).
    println("Number of rows of matrix A cannot be smaller than the number of columns.")
    return (null, A, null)
  }

  JCublas2.setExceptionsEnabled(true)
  //JCuda.cudaDeviceReset()
  //JCuda.cudaSetDeviceFlags(JCuda.cudaDeviceScheduleBlockingSync | JCuda.cudaDeviceMapHost)

  val m = A.rows
  val n = A.cols

  // Work on a device copy so the caller's A is left untouched.
  var d_A = CuMatrix.create[Double](m, n); d_A := A
  //val d_Q = CuMatrix.create[Double](m, m); eyeizeDouble(d_Q)
  var d_Q = CuMatrix.eye[Double](m)
  //val d_P = CuMatrix.create[Double](n, n); eyeizeDouble(d_P)
  var d_P = CuMatrix.eye[Double](n)
  // Kept around so d_Q1 can be reset to the identity each iteration without re-allocating.
  val eyeDoubleM = CuMatrix.eye[Double](m)

  // val oneArr = new Array[Double](1); oneArr(0) = 1.0 //Array(1.0)
  // val one = jcuda.Pointer.to(oneArr)
  // val zeroArr = new Array[Double](1); zeroArr(0) = 0.0 //Array(0.0)
  // val zero = jcuda.Pointer.to(zeroArr)
  // val minusOneArr = new Array[Double](1); minusOneArr(0) = -1.0 //Array(-1.0)
  // val minusOne = jcuda.Pointer.to(minusOneArr)

  // Host-side scalar constants wrapped as pointers for the cuBLAS alpha/beta parameters.
  val onePtr = Pointer.pointerToDouble(1.0)
  val one = onePtr.toCuPointer
  val zeroPtr = Pointer.pointerToDouble(0.0)
  val zero = zeroPtr.toCuPointer
  val minusOnePtr = Pointer.pointerToDouble(-1.0)
  val minusOne = minusOnePtr.toCuPointer
  val minusTwoPtr = Pointer.pointerToDouble(-2.0)
  val minusTwo = minusTwoPtr.toCuPointer

  // Host scratch scalars: the current vector norm and the leading element v(0).
  val normArr = Pointer.pointerToDouble(0.0) //Array(0.0)
  val v0Arr = Pointer.pointerToDouble(0.0) //Array(0.0)

  // Sized m x 1 so it can hold both column vectors (length <= m) and,
  // since m >= n, row vectors (length <= n) as well.
  val d_v = CuMatrix.create[Double](m, 1) // here we will store the householder vectors
  val d_Q1 = CuMatrix.create[Double](m, m)
  val d_P1 = CuMatrix.create[Double](n, n)

  // For a square matrix the last column needs no elimination.
  val steps = if (m == n) n-1 else n
  var err = 0
  JCuda.cudaDeviceSynchronize()

  cfor(0)(_ < steps, _ + 1) { i => {
    // eliminate a column:
    //householderMatDouble(d_A, i, i, d_v, d_Q1, eyeDoubleM, col = true)
    // ----------- calculate the householder matrix for the column ---------------------------------------
    var len = m
    var stride = 1          // column elements are contiguous in (column-major) storage
    var offset = i          // skip the already-bidiagonalized top-left part
    println("(((" + i + ", " + i + ")))")

    // copy the vector x = A(i:m, i) from the matrix into d_v:
    err = JCublas2.cublasDcopy(handle, len-offset, d_A.offsetPointer.withByteOffset(d_A.linearIndex(i, i) * d_A.elemSize),
      stride, d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize), 1)
    JCuda.cudaDeviceSynchronize()
    if (err != 0) println("[ERROR] householderMat -- copy")
    println("Copied: " + d_v)

    // normArr(0) = |d_v|  (host pointer-mode result)
    err = JCublas2.cublasDnrm2(handle, len-offset, d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize),
      1, normArr.toCuPointer)
    JCuda.cudaDeviceSynchronize()
    if (err != 0) println("[ERROR] householderMat -- nrm")
    println("Norm: " + normArr(0))

    // v0Arr(0) = d_v(0): fetch the leading element to the host (single-element 2D copy)
    err = JCuda.cudaMemcpy2D(v0Arr.toCuPointer, d_v.majorStride * d_v.elemSize,
      d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize),
      d_v.majorStride * d_v.elemSize, d_v.elemSize, 1, cudaMemcpyKind.cudaMemcpyDeviceToHost)
    JCuda.cudaDeviceSynchronize()
    if (err != 0) println("[ERROR] householderMat -- memcpy")
    println("v_0: " + v0Arr(0))

    // v = x - |x| * e1, done by adjusting only the leading element on the host.
    // NOTE(review): this always subtracts +|x| regardless of sign(v0); the usual
    // stable choice is v0 + sign(v0)*|x| to avoid cancellation — confirm intended.
    v0Arr(0) -= normArr(0)
    println("v_0 --> new: " + v0Arr(0))

    // put d_v(0) back in the vector
    err = JCuda.cudaMemcpy2D(d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize),
      d_v.majorStride * d_v.elemSize, v0Arr.toCuPointer,
      d_v.majorStride * d_v.elemSize, d_v.elemSize, 1, cudaMemcpyKind.cudaMemcpyHostToDevice)
    JCuda.cudaDeviceSynchronize()
    if (err != 0) println("[ERROR] householderMat -- memcpy")
    println("d_v after memcpy: " + d_v)

    // calculate the norm of the updated vector v:
    err = JCublas2.cublasDnrm2(handle, len-offset, d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize),
      1, normArr.toCuPointer)
    JCuda.cudaDeviceSynchronize()
    if (err != 0) println("[ERROR] householderMat -- nrm2")
    println("New norm: " + normArr(0))

    // invert on the host so Dscal can multiply by 1/|v|
    // NOTE(review): divides by zero if the column is already eliminated (|v| == 0) — confirm inputs exclude this.
    normArr(0) = 1.0 / normArr(0)
    println("Newer norm: " + normArr(0))

    // d_v = d_v / |d_v|   (normalize the reflector)
    err = JCublas2.cublasDscal(handle, len-offset, normArr.toCuPointer,
      d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize), 1)
    JCuda.cudaDeviceSynchronize()
    if (err != 0) println("[ERROR] householderMat -- scal")
    println("d_v after scal: " + d_v)

    // make sure d_Q1 is an identity matrix:
    d_Q1 := eyeDoubleM //eyeizeDouble(d_Q1)

    // d_Q1 = I - 2*d_v*d_v'  — rank-1 update (alpha = -2, beta = 1) of the
    // trailing (len-offset) x (len-offset) block; the leading part stays identity.
    err = DgemmNT(len-offset, len-offset, 1, minusTwo, d_v, offset, 0, d_v, offset, 0,
      one, d_Q1, offset, offset)
    JCuda.cudaDeviceSynchronize()
    if (err != 0) println("[ERROR] householderMat -- dgemm (I - 2*d_v*d_v.t)")
    println("d_Q1: " + d_Q1)
    println()
    // ----------------------------------------------------------------------------------

    //d_A = d_Q1 * d_A
    //err = DgemmNN(m-i, n-i, m-i, one, d_Q1, i, i, d_A, i, i, zero, d_A, i, i)
    //err = DgemmNN(m, n, m, one, d_Q1, 0, 0, d_A, 0, 0, zero, d_A, 0, 0)
    // Apply the reflector from the left.
    // NOTE(review): the `*` operator does not assign `err`, so this and the
    // following `err != 0` checks test the status of the *previous* call.
    d_A = d_Q1 * d_A
    JCuda.cudaDeviceSynchronize()
    if (err != 0) println("[ERROR] DgemmNN (d_Q1 * d_A)")
    println("After elimination (col):\n" + d_A)

    // d_Q = d_Q * d_Q1  (accumulate the left transform; Q1 is symmetric so no transpose needed)
    //err = DgemmNN(m, m, m, one, d_Q, 0, 0, d_Q1, 0, 0, zero, d_Q, 0, 0)
    d_Q = d_Q * d_Q1
    JCuda.cudaDeviceSynchronize()
    if (err != 0) println("[ERROR] DgemmNN (d_Q * d_Q1)")

    if (i < n - 2) {
      // eliminate a row:
      //householderMatDouble(d_A, i, i + 1, d_v, d_P1, eyeDoubleM, col = false)
      // ----------- calculate the householder matrix for the row ---------------------------------------
      len = n
      stride = d_A.majorStride   // row elements are one column-stride apart
      offset = i+1; println("Offset: " + offset)  // start right of the superdiagonal
      err = 0
      println("(((" + i + ", " + (i+1).toString + ")))")

      // copy the vector x = A(i, i+1:n) from the matrix into d_v:
      err = JCublas2.cublasDcopy(handle, len-offset, d_A.offsetPointer.withByteOffset(d_A.linearIndex(i, i+1) * d_A.elemSize),
        stride, d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize), 1)
      JCuda.cudaDeviceSynchronize()
      if (err != 0) println("[ERROR] householderMat -- copy")
      println("Copied: " + d_v)

      // normArr(0) = |d_v|
      err = JCublas2.cublasDnrm2(handle, len-offset, d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize),
        1, normArr.toCuPointer)
      JCuda.cudaDeviceSynchronize()
      if (err != 0) println("[ERROR] householderMat -- nrm")
      println("Norm: " + normArr(0))

      // v0Arr(0) = d_v(0)
      err = JCuda.cudaMemcpy2D(v0Arr.toCuPointer, d_v.majorStride * d_v.elemSize,
        d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize),
        d_v.majorStride * d_v.elemSize, d_v.elemSize, 1, cudaMemcpyKind.cudaMemcpyDeviceToHost)
      JCuda.cudaDeviceSynchronize()
      if (err != 0) println("[ERROR] householderMat -- memcpy")
      println("v_0: " + v0Arr(0))

      // v = x - |x| * e1 (same unsigned-norm caveat as in the column branch)
      v0Arr(0) -= normArr(0)
      println("v_0 --> new: " + v0Arr(0))

      // put d_v(0) back in the vector
      err = JCuda.cudaMemcpy2D(d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize),
        d_v.majorStride * d_v.elemSize, v0Arr.toCuPointer,
        d_v.majorStride * d_v.elemSize, d_v.elemSize, 1, cudaMemcpyKind.cudaMemcpyHostToDevice)
      JCuda.cudaDeviceSynchronize()
      if (err != 0) println("[ERROR] householderMat -- memcpy")
      println("d_v after memcpy: " + d_v)

      // calculate the norm of the updated vector v:
      err = JCublas2.cublasDnrm2(handle, len-offset, d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize),
        1, normArr.toCuPointer)
      JCuda.cudaDeviceSynchronize()
      if (err != 0) println("[ERROR] householderMat -- nrm2")
      println("New norm: " + normArr(0))

      normArr(0) = 1.0 / normArr(0)
      println("Newer norm: " + normArr(0))

      // d_v = d_v / |d_v|
      err = JCublas2.cublasDscal(handle, len-offset, normArr.toCuPointer,
        d_v.offsetPointer.withByteOffset(d_v.linearIndex(offset, 0) * d_v.elemSize), 1)
      JCuda.cudaDeviceSynchronize()
      if (err != 0) println("[ERROR] householderMat -- scal")
      println("d_v after scal: " + d_v)

      // make sure d_P1 is an identity matrix:
      // NOTE(review): eyeDoubleM is m x m but d_P1 is n x n — confirm `:=`
      // tolerates the size mismatch when m != n, or that an n x n identity was intended.
      d_P1 := eyeDoubleM //eyeizeDouble(d_Q1)

      // d_P1 = I - 2*d_v*d_v'  (rank-1 update of the trailing block, as above)
      err = DgemmNT(len-offset, len-offset, 1, minusTwo, d_v, offset, 0, d_v, offset, 0,
        one, d_P1, offset, offset)
      JCuda.cudaDeviceSynchronize()
      if (err != 0) println("[ERROR] householderMat -- dgemm (I - 2*d_v*d_v.t)")
      println("d_P1: " + d_P1)
      println()
      // ----------------------------------------------------------------------------------

      // d_A = d_A * d_P1  (apply the reflector from the right)
      //err = DgemmNN(m, n, n, one, d_A, 0, 0, d_P1, 0, 0, zero, d_A, 0, 0)
      //err = DgemmNN(m-i, n-i-1, n-i-1, one, d_A, i, i+1, d_P1, i+1, i+1, zero, d_A, i, i+1)
      // NOTE(review): as in the column branch, `err` is not set by `*` — the check below is stale.
      d_A = d_A * d_P1
      JCuda.cudaDeviceSynchronize()
      if (err != 0) println("[ERROR] DgemmNN (d_A * d_P1)")
      println("After elimination (row):\n" + d_A)

      // d_P = d_P1 * d_P  (accumulate the right transform; note order differs from the Q update)
      //err = DgemmNN(n, n, n, one, d_P1, 0, 0, d_P, 0, 0, zero, d_P, 0, 0)
      d_P = d_P1 * d_P
      JCuda.cudaDeviceSynchronize()
      if (err != 0) println("[ERROR] DgemmNN (d_P1 * d_P)")
    }
  }}

  // make d_A explicitly bidiagonal (there're some close-to-zero values due to roundoff errors)
  //zeroOutDoubleOffset(d_A, 0, 1, 'U')
  //zeroOutDouble(d_A, 'L')

  (d_Q, d_A, d_P)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment