Create a gist now

Instantly share code, notes, and snippets.

@kmader / Secret
Last active Aug 29, 2015

Hyperspectral Analysis in Spark

Hyperspectral Data

The data for these experiments are taken from a Raman Spectral Imaging project conducted with Dr. Thomas Huser at the Center for Biophotonics Science and Technology at UC Davis.

// Directory holding the raw hyperspectral CSV exports.
val datapath = "/Volumes/WORKDISK/WorkData/Hyperspectral/raw/"
// Load every CSV file in that directory as lines of text.
val rawdata = sc.textFile(s"${datapath}*.csv")
// get the first line
// parse the data into columns
// convert to values
// make a nice case class
/**
 * One spectral sample: a spatial position plus a single (wavenumber, intensity) reading.
 *
 * @param x          x coordinate of the sample
 * @param y          y coordinate of the sample
 * @param z          z coordinate of the sample
 * @param wavenumber wavenumber of the reading
 * @param intensity  intensity of the reading
 */
case class spec_point(x: Double, y:Double, z:Double, wavenumber: Double, intensity: Double) {
  /** The position of this sample as an (x, y, z) tuple. */
  def getPos(): (Double, Double, Double) = (x, y, z)

  /** The reading of this sample as a (wavenumber, intensity) tuple. */
  def getVal(): (Double, Double) = (wavenumber, intensity)
} // close the class body so the statements that follow are not parsed as class members
// wrap the data in the case class
// NOTE(review): everything between `=` and `=>` is missing on the next line, so it does not
// parse as written. The visible tail builds a spec_point from the first five elements of `pt`;
// presumably the lost part mapped the parsed rows with `pt` as the lambda parameter - TODO restore.
val specdata = => spec_point(pt(0),pt(1),pt(2),pt(3),pt(4)))
// NOTE(review): the receiver and lambda parameter before `=>` are missing here as well. The
// visible tail pairs each `spt` with its getPos result as the key - TODO restore the source collection.
val ptspec = => (spt.getPos,spt))
// Now for the spark sql code
import org.apache.spark.sql.SQLContext
// create a SQLContext from the existing context `sc`
val sqlContext = new SQLContext(sc)
// import the context's members; presumably this is what makes the unqualified `sql(...)` call below resolve
import sqlContext._
// register the data
// NOTE(review): no registration statement is visible here, yet the query below reads from a
// table named Spectra - TODO confirm where that table is registered.
// run a simple sql command
// computes the average of the intensity column over the Spectra table
val avgInt = sql("SELECT AVG(intensity) FROM Spectra")
// save as parquet
// starting from the parquet file directly
import org.apache.spark.sql.SQLContext
val sqlContext = new SQLContext(sc)
import sqlContext._
// load the spectra back from the parquet file
// NOTE(review): the query below reads a table named PQSpectra; no statement registering
// `pqspec` under that name is visible here - TODO confirm where it is registered.
val pqspec = sqlContext.parquetFile("spec.pqt")
// get an average image with SparkSQL
// Group on the coordinate columns themselves: every non-aggregated column in the SELECT list
// must appear in GROUP BY, and the previous packed key ((z+y*1000)*1000+x) could merge distinct
// positions whenever a coordinate is >= 1000, negative, or non-integral.
val avgimg = sql("SELECT x,y,z,AVG(intensity) FROM PQSpectra GROUP BY x,y,z")
// bring the averaged rows (x, y, z, mean intensity) back to the driver
val imgarr = avgimg.collect()
// get an average image normally
// NOTE(review): the start of the next expression (everything before `[Double]))`) is missing, so
// it does not parse as written. The visible tail maps each row `spt` to a pair keyed by its first
// three elements with elements 3 and 4 as the value - TODO restore the source collection.
val ptspec =[Double])).map( spt => ((spt(0),spt(1),spt(2)),(spt(3),spt(4))) )
// organize it as an image
// collect all values that share the same key
val imgData = ptspec.groupByKey
// NOTE(review): the function body after `cspec =>` is missing - TODO restore the averaging logic.
val nAvgImg = imgData.mapValues(cspec =>
// sorting command for spectra
import scala.util.Sorting.stableSort
// organize it as an image
// collect all values that share the same key
val imgData = ptspec.groupByKey
// NOTE(review): the comparator below types the elements as spec_point, so this presumably relies
// on the `ptspec` whose values are spec_point instances - TODO confirm which definition is in scope.
val imgSpec = imgData.mapValues {
rawspec =>
// copy the grouped values into an array so they can be sorted
val outspec = rawspec.toArray
// order the samples by ascending wavenumber
stableSort(outspec,(e1: spec_point,e2: spec_point) => e1.wavenumber<e2.wavenumber)
// NOTE(review): the rest of this block (its result expression and closing brace) is not visible
// here - TODO restore.
* Using MLlib
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
// turn each keyed entry of imgSpec into an MLlib dense vector
// NOTE(review): Vectors.dense takes numeric values - confirm the values of imgSpec are Array[Double].
val imgSV = imgSpec.mapValues{ ptarr => Vectors.dense(ptarr) }
// drop the keys: KMeans.train (below) and RowMatrix operate on the bare vectors
val kvData = imgSV.map(_._2)
// train the model with 4 groups and 1000 iterations
val kModel = KMeans.train(kvData,4,1000)
// create an image from the clusters
// each key keeps its position and gets the index of the cluster its vector is assigned to
val imgClusters = imgSV.mapValues{ cvec => kModel.predict(cvec)}
* Calculate Principal Components
import org.apache.spark.mllib.linalg.{Matrix, Matrices}
import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, RowMatrix}
// build a distributed row matrix from the vectors in kvData
val rm = new RowMatrix(kvData)
// pca for first 3 components
val prinComp = rm.computePrincipalComponents(3)
// calculate projections: multiply every row by the principal-component matrix computed above
// (the original referenced an undefined name `pc` here)
val projs: RowMatrix = rm.multiply(prinComp)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment