Skip to content

Instantly share code, notes, and snippets.

@jasonbaldridge
Last active August 29, 2015 14:00
Show Gist options
  • Save jasonbaldridge/11089963 to your computer and use it in GitHub Desktop.
Save jasonbaldridge/11089963 to your computer and use it in GitHub Desktop.
/**
* R functions to create and manipulate Breeze matrices. This should go into
* Breeze or a sub-project of Breeze eventually.
*/
object RFunc {
import breeze.linalg._
import breeze.stats.distributions._
import breeze.stats.DescriptiveStats._
/**
* Produces sample quantiles corresponding to the given probabilities.
*/
def quantile(
values: TraversableOnce[Double],
probs: Seq[Double] = Seq(0.0, 0.25,.5,.75,1.0)
) = {
// Not efficient, but maximally reuses existing code and is a fine
// start.
probs.map(p=>percentile(values,p))
}
/**
* Create a matrix of Doubles given a Seq of Double Seqs.
*/
def matrix(values: Seq[Seq[Double]]): DenseMatrix[Double] =
matrix[Double](values.flatten.toArray, values.length, values.head.length)
/**
* Create a matrix of Strings given a Seq of String Seqs. This should
* be one method with the above for Doubles, but there was some type
* compile error and I couldn't be bothered to sort it out. I.e., we
* want matrix[T](values: Seq[Seq[T]]): DenseMatrix[T].
*/
def matrixString(values: Seq[Seq[String]]): DenseMatrix[String] =
matrix[String](values.flatten.toArray, values.length, values.head.length)
/**
* Create a Breeze matrix of the given proportions.
*/
def matrix[T](values: Array[T], nrow: Int, ncol: Int): DenseMatrix[T] =
new DenseMatrix[T](nrow, ncol, values,0,ncol,true)
/**
* Create a Breeze matrix initialized with the given value for all positions.
*/
def matrix(value: Double, nrow: Int, ncol: Int): DenseMatrix[Double] =
if (value == 0)
DenseMatrix.zeros[Double](nrow,ncol)
else
matrix(Array.fill(nrow*ncol)(value), nrow, ncol)
/**
* Create a vector repeating the given value.
*/
def rep(value: Double, length: Int) =
DenseVector.fill(length)(value)
/**
* Show the dimensions of the matrix.
*/
def dim[T](mat: DenseMatrix[T]) = (mat.rows, mat.cols)
/**
* This is an impoverished implementation of R's model.matrix function.
* It's also not particularly efficient.
*/
def modelMatrix(columnNames: Seq[String], data: DenseMatrix[String]) = {
val info = (for (j <- 0 until data.cols) yield {
val name = columnNames(j)
val values = data(::,j).toDenseVector.toArray.toSet
values.toSeq.map(v=>name+"_"+v)
}).flatten.toIndexedSeq
val infoMap = info.zipWithIndex.toMap
val infoLength = info.length
val binarizedMatrix = (for (i <- 0 until data.rows) yield {
val demoBinary = Array.fill(info.length)(0.0)
for (j <- 0 until data.cols) {
val jstring = columnNames(j)+"_"+data(i,j)
demoBinary(infoMap(jstring)) = 1.0
}
demoBinary.toSeq
}).toSeq
(info,matrix(binarizedMatrix))
}
/**
* Draw values from a Gaussian with given mean and standard deviation.
*/
def rnorm(size: Int, mean: Double = 0.0, stdev: Double = 1.0) =
new Gaussian(mean,stdev).sample(size).toArray
/**
* Get the max of the given value and the value at each position in the
* given vector.
*/
def pmax(value: Double, vec: DenseVector[Double]) =
vec.map(x => math.max(value,x))
/**
* Get the absolute value of every value in the given vector.
*/
def abs(vec: DenseVector[Double]) =
vec.map(math.abs)
}
// This is code I did a while ago that goes further in allowing R operators and functions to be used with
// Breeze, but it was for Breeze version over one year ago (when broadcasting wasn't supported, which would
// have simplified some of the implementations). Would be nice to fold some of this in to the RFunc object above,
// and have tests and such.
object BreezeMatrixUtil {
import breeze.linalg.{DenseMatrix,DenseVector,LinearAlgebra}
implicit def breezeMatrixToRMatrix(mat: DenseMatrix[Double]) = new RMatrix(mat)
class RMatrix (mat: DenseMatrix[Double]) {
def %*% (that: DenseMatrix[Double]) = mat * that
def * (multiplier: Double) = mat.map(_*multiplier)
//def * (that: DenseMatrix[Double]) = {
// val result = DenseMatrix.zeros[Double](mat.rows,mat.cols)
// for (r <- 0 until mat.rows; c <- 0 until mat.cols)
// result(r,c) = mat(r,c) * that(r,c)
// result
//}
def multiplyEachColumnByVector (that: DenseVector[Double]) = {
val result = DenseMatrix.zeros[Double](mat.rows,mat.cols)
for (r <- 0 until mat.rows; c <- 0 until mat.cols)
result(r,c) = mat(r,c) * that(r)
result
}
def divideElements (divider: Double) = mat.map(_/divider)
def col(id: Int) = mat(::, id)
def row(id: Int): DenseVector[Double] = mat.t.apply(::, id)
def set(row: Int, col: Int, value: Double) { mat(row,col) = value }
}
implicit def breezeVectorToRVector(vec: DenseVector[Double]) = new RVector(vec)
class RVector (vec: DenseVector[Double]) {
def / (value: Double) = vec.map(_/value)
}
implicit def doubleToRDouble(x: Double) = new RDouble(x)
class RDouble (x: Double) {
def * (mat: DenseMatrix[Double]) = mat * x
def * (vec: DenseVector[Double]) = vec * x
def / (mat: DenseMatrix[Double]) = mat.map(v => if (v == 0.0) 0.0 else x/v)
def / (vec: DenseVector[Double]) = vec.map(v => if (v == 0.0) 0.0 else x/v)
def + (vec: DenseVector[Double]) = vec.map(v => x+v)
}
def multVecMat(vec: DenseVector[Double], mat: DenseMatrix[Double]) = {
val result = DenseMatrix.zeros[Double](mat.rows,mat.cols)
for (r <- 0 until mat.rows; c <- 0 until mat.cols)
result(r,c) = mat(r,c) * vec(r)
result
}
def solve(mat: DenseMatrix[Double]) = LinearAlgebra.inv(mat)
def t(mat: DenseMatrix[Double]) = mat.t
def cbind(mat: DenseMatrix[Double], newColumnValue: Double) =
DenseMatrix.horzcat(mat, DenseMatrix.fill[Double](mat.rows,1)(newColumnValue))
def cbind(mat: DenseMatrix[Double], vec: DenseVector[Double]) =
DenseMatrix.horzcat(mat, vec.t.t)
def rbind(mat: DenseMatrix[Double], newRowValue: Double) =
DenseMatrix.vertcat(mat, DenseMatrix.fill[Double](1,mat.cols)(newRowValue))
def outer(vec1: DenseVector[Double], vec2: DenseMatrix[Double]) = {
assert(vec2.rows == 1)
val result = DenseMatrix.zeros[Double](vec1.length,vec2.cols)
for (r <- 0 until vec1.length;
v1val = vec1(r);
c <- 0 until vec2.cols)
result(r,c) = v1val * vec2(0,c)
result
}
def columnAbsSums (mat: DenseMatrix[Double], columnAddValue: Double = 0.0) = {
val sums = (0 until mat.cols).map { j =>
mat(::,j).map(math.abs).sum + 2
}.toArray
DenseVector(sums)
}
def scan(filename: String) = {
val values = io.Source.fromFile(filename).getLines.map(_.toDouble)
new DenseVector(values.toArray)
}
def readTableAsMatrix(filename: String) = {
val values = io.Source.fromFile(filename).getLines.map { line =>
line.split(" ").map(_.toDouble).toArray
}.toArray
createMatrixFromArray(values)
}
def readTableAsMatrix(lines: Iterator[String], blockSize: Int) = {
val values = lines.take(blockSize).map { line =>
line.split(" ").map(_.toDouble).toArray
}.toArray
createMatrixFromArray(values)
}
def createMatrixFromArray(values: Array[Array[Double]]) =
DenseMatrix.create[Double](values.head.length, values.length, values.flatten).t
def array(value: Double, numRows: Int, numCols: Int, numRepetitions: Int) =
(1 to numRepetitions).map(i => matrix(value, numRows, numCols)).toArray
def array(matrix: DenseMatrix[Double],
numRows: Int, numCols: Int, numRepetitions: Int) =
(1 to numRepetitions).map(i => matrix).toArray
def matrix(numRows: Int, numCols: Int): DenseMatrix[Double] =
DenseMatrix.zeros[Double](numRows, numCols)
def matrix(numRows: Int, numCols: Int, values: Array[Double]): DenseMatrix[Double] =
DenseMatrix.create[Double](numRows, numCols, values)
def matrix(value: Double, numRows: Int, numCols: Int): DenseMatrix[Double] =
if (value == 0.0) DenseMatrix.zeros[Double](numRows, numCols)
else DenseMatrix.fill[Double](numRows,numCols)(value)
def matrix(values: Seq[Double], numRows: Int, numCols: Int): DenseMatrix[Double] =
DenseMatrix.create[Double](numRows,numCols, values.toArray)
def rep(value: Double, size: Int) = DenseVector.fill(size)(value)
def diag(value: Double, size: Int): DenseMatrix[Double] = diag(rep(value, size))
def diag(vec: DenseVector[Double]) = breeze.linalg.diag(vec)
// WARNING: this modifies the input matrix
def setDiagonal(value: Double, mat: DenseMatrix[Double]) = {
(0 until mat.cols).foreach(index => mat(index, index) = value)
mat
}
// do this slowly for now
def tcrossprod(mat: DenseMatrix[Double]): DenseMatrix[Double] = tcrossprod(mat, mat)
def tcrossprod(matA: DenseMatrix[Double], matB: DenseMatrix[Double]): DenseMatrix[Double] =
matA * matB.t
def tcrossprod(vec: DenseVector[Double]) = vec * vec.t
def mean(matrices: Array[DenseMatrix[Double]]) =
matrices.reduce(_ :+ _).map(_/matrices.length)
def weightedMean(matrices: Array[DenseMatrix[Double]], weights: Array[Double]) =
matrices.zip(weights)
.map { case(mat,mult) => mat*mult }
.reduce(_ :+ _)
.map(_/weights.sum)
def submatrix(mat: DenseMatrix[Double], rows: Seq[Int], cols: Seq[Int]) = {
val data = for (i <- rows; j <- cols) yield mat(i,j)
new DenseMatrix(rows.length, data.toArray)
}
}
// Finally, here is code to allow R functions to be used to create and manipulate Colt Matrices. There
// is more that could be brought into RFunc (though substituting in Breeze objects), and perhaps it
// would be useful for anyone who wants to work with Colt instead of Breeze.
object ColtMatrixUtil {
import tempest.data._
import cern.colt.matrix.tdouble.{DoubleMatrix1D=>DoubleVector,DoubleMatrix2D=>DoubleMatrix}
import cern.colt.matrix.tdouble.impl.{DenseDoubleMatrix1D,DenseDoubleMatrix2D}
import cern.colt.matrix.tdouble.algo._
import cern.colt.function.tdouble._
lazy val linalg = new DenseDoubleAlgebra
lazy val util1D = cern.colt.matrix.tdouble.DoubleFactory1D.dense
lazy val util2D = cern.colt.matrix.tdouble.DoubleFactory2D.dense
implicit def coltMatrixToRMatrix(mat: DoubleMatrix) = new RMatrix(mat)
implicit def coltVectorToRVector(vec: DoubleVector) = new RVector(vec)
implicit def doubleToRDouble(x: Double) = new RDouble(x)
implicit def colt1Dto2D(vec: DoubleVector) = util2D.make(vec.toArray, vec.size.toInt)
//implicit def arrayTo1D(a: Array[Double]) = util1D.make(a)
type ConcatType = ::.type
//implicit def concatToEmptyRange(x: ::.type) = 0 to -1
//implicit def intToRange(x: Int) = x to x
class MultFnc(multiplier: Double) extends IntIntDoubleFunction {
override def apply(r: Int, c: Int, d: Double) = d*multiplier
}
class VecMultFnc(multiplier: Double) extends DoubleFunction {
override def apply(d: Double) = d*multiplier
}
class DivideByElementsFnc(numerator: Double) extends IntIntDoubleFunction {
override def apply(r: Int, c: Int, d: Double) = numerator/d
}
object PairwiseDivideFnc extends DoubleDoubleFunction {
override def apply(x: Double, y: Double) = x/y
}
class VecDivideByElementsFnc(numerator: Double) extends DoubleFunction {
override def apply(d: Double) = numerator/d
}
class AddFnc(value: Double) extends DoubleFunction {
override def apply(d: Double) = d + value
}
object SqrtFnc extends DoubleFunction {
override def apply(d: Double) = math.sqrt(d)
}
object TanhFnc extends DoubleFunction {
override def apply(d: Double) = math.tanh(d)
}
class MatrixAddFnc extends DoubleDoubleFunction {
override def apply(x: Double, y: Double) = x+y
}
class RMatrix (mat: DoubleMatrix) {
def %*% (that: DoubleMatrix) = mat.zMult(that, null)
def %*% (that: DoubleVector) = that %*% t(mat)
def * (value: Double) = mat.copy.forEachNonZero(new MultFnc(value))
def * (vec: DoubleVector) = mat.copy.multiplyEachColumnByVector(vec)
def / (value: Double) = mat.copy.forEachNonZero(new MultFnc(1/value))
def + (that: DoubleMatrix) = {
val result = mat.like
for (r <- 0 until mat.rows; c <- 0 until mat.columns)
result.setQuick(r,c, mat.getQuick(r,c) + that.getQuick(r,c))
result
}
def - (that: DoubleMatrix) = {
val result = mat.like
for (r <- 0 until mat.rows; c <- 0 until mat.columns)
result.setQuick(r,c, mat.getQuick(r,c) - that.getQuick(r,c))
result
}
def cols = mat.columns
def multiplyEachColumnByVector (that: DoubleVector) = {
val result = mat.like
for (r <- 0 until mat.rows; c <- 0 until mat.columns)
result.setQuick(r,c, mat.getQuick(r,c) * that.getQuick(r))
result
}
def divideElements (divider: Double) = mat.copy.forEachNonZero(new MultFnc(1/divider))
def getRow(i: Int) = mat.viewRow(i).copy
def getRows(rowRange: Range) =
mat.viewSelection(rowRange.toArray, (0 until mat.columns).toArray).copy
def getCol(i: Int) = mat.viewColumn(i).copy
def getCols(colRange: Range) =
mat.viewSelection((0 until mat.rows).toArray, colRange.toArray).copy
def apply(rowR: Range, colR: Range) =
mat.viewSelection(rowR.toArray, colR.toArray).copy
def update(r: Int, x: ConcatType, vec: DoubleVector) =
for (c <- 0 until mat.columns) mat.setQuick(r, c, vec.getQuick(c))
def update(x: ConcatType, c: Int, vec: DoubleVector) =
for (r <- 0 until mat.rows) mat.setQuick(r, c, vec.getQuick(r))
}
class RVector (vec: DoubleVector) {
def %*% (that: DoubleVector) = linalg.mult(vec,that)
def %*% (that: DoubleMatrix) =
util1D.make((0 until that.columns).map(c => linalg.mult(vec,that.viewColumn(c))).toArray)
def * (value: Double) = vec.copy.assign(new VecMultFnc(value))
def * (that: DoubleVector) =
util1D.make(vec.toArray.zip(that.toArray).map(x=>x._1*x._2))
def / (value: Double) = vec.copy.assign(new VecMultFnc(1/value))
def / (that: DoubleVector) = vec.copy.assign(that, PairwiseDivideFnc)
def + (that: DoubleVector) = {
val result = vec.like
for (r <- 0 until vec.size.toInt)
result.setQuick(r, vec.getQuick(r) + that.getQuick(r))
result
}
def - (value: Double) = vec.copy.assign(new AddFnc(-value))
def - (that: DoubleVector) = {
val result = vec.like
for (r <- 0 until vec.size.toInt)
result.setQuick(r, vec.getQuick(r) - that.getQuick(r))
result
}
}
class RDouble (x: Double) {
def * (mat: DoubleMatrix) = mat * x
def * (vec: DoubleVector) = vec * x
def / (mat: DoubleMatrix) = mat.copy.forEachNonZero(new DivideByElementsFnc(x))
def / (vec: DoubleVector) = vec.copy.assign(new VecDivideByElementsFnc(x))
def + (vec: DoubleVector) = vec.copy.assign(new AddFnc(x))
}
def multVecMat(vec: DoubleVector, mat: DoubleMatrix) = mat.multiplyEachColumnByVector(vec)
def solve(mat: DoubleMatrix) = linalg.inverse(mat)
def t(mat: DoubleMatrix) = linalg.transpose(mat)
def cbind(mat: DoubleMatrix, newColumnValue: Double) =
util2D.appendColumn(mat, util1D.make(Array.fill(mat.rows)(newColumnValue)))
def cbind(mat: DoubleMatrix, vec: DoubleVector) =
util2D.appendColumn(mat, vec)
def rbind(mat: DoubleMatrix, newRowValue: Double) =
util2D.appendRow(mat, util1D.make(Array.fill(mat.columns)(newRowValue)))
def rbind(upper: DoubleMatrix, lower: DoubleMatrix) =
util2D.appendRows(upper, lower)
def outer(vec1: DoubleVector, vec2: DoubleVector) =
linalg.xmultOuter(vec1,vec2)
def scan(filename: String) = {
val values = io.Source.fromFile(filename).getLines.map(_.toDouble)
util1D.make(values.toArray)
}
def readTableAsMatrix(filename: String) = {
val values = io.Source.fromFile(filename).getLines.map { line =>
line.split(" ").map(_.toDouble).toArray
}.toArray
createMatrixFromArray(values)
}
def readTableAsMatrix(lines: Iterator[String], blockSize: Int) = {
val values = lines.take(blockSize).map { line =>
line.split(" ").map(_.toDouble).toArray
}.toArray
createMatrixFromArray(values)
}
def readTableAsMatrix(lines: DocumentSource, blockSize: Int) =
createMatrixFromArray(lines.take(blockSize).toArray)
def createMatrixFromArray(values: Array[Array[Double]]) = util2D.make(values)
def array(value: Double, numRows: Int, numCols: Int, numRepetitions: Int) = {
val mat = util2D.make(numRows, numCols,value)
(1 to numRepetitions).map(i => mat.copy).toArray
}
def array(mat: DoubleMatrix, numRows: Int, numCols: Int, numRepetitions: Int) =
(1 to numRepetitions).map(i => mat.copy).toArray
def matrix(numRows: Int, numCols: Int): DoubleMatrix =
util2D.make(numRows, numCols)
def matrix(numRows: Int, numCols: Int, values: Array[Double]): DoubleMatrix =
util2D.make(values, numRows)
def matrix(value: Double, numRows: Int, numCols: Int): DoubleMatrix =
util2D.make(numRows, numCols, value)
def matrix(values: Seq[Double], numRows: Int, numCols: Int): DoubleMatrix =
util2D.make(values.toArray, numRows)
def vector(values: Array[Double]) = util1D.make(values)
def rep(value: Double, size: Int) = util1D.make(size, value)
def diag(value: Double, size: Int): DoubleMatrix = diag(rep(value, size))
def diag(vec: DoubleVector) = util2D.diagonal(vec)
def diag(mat: DoubleMatrix) = util2D.diagonal(mat)
// do this slowly for now
def tcrossprod(mat: DoubleMatrix): DoubleMatrix = tcrossprod(mat, mat)
def tcrossprod(matA: DoubleMatrix, matB: DoubleMatrix): DoubleMatrix =
linalg.mult(matA, t(matB))
def tcrossprod(vec: DoubleVector) = linalg.xmultOuter(vec,vec)
def mean(matrices: Array[DoubleMatrix]) = {
val zeros = matrix(matrices.head.rows,matrices.head.columns)
val added =
matrices.foldLeft(zeros)((accum, next) => accum.assign(next, new MatrixAddFnc))
added / matrices.length
}
def weightedMean(matrices: Array[DoubleMatrix], weights: Array[Double]) = {
val zeros = matrix(matrices.head.rows,matrices.head.columns)
val added =
matrices.zip(weights)
.map { case(mat,mult) => mat*mult }
.foldLeft(zeros)((accum, next) => accum.assign(next, new MatrixAddFnc))
added / weights.sum
}
def submatrix(mat: DoubleMatrix, rows: Seq[Int], cols: Seq[Int]) = {
val data = for (i <- rows; j <- cols) yield mat.getQuick(i,j)
util2D.make(data.toArray,rows.length)
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment