Last active
January 21, 2017 06:43
-
-
Save iha2/cfa7352754cd64b1bcd6d0f8fce0694c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Parses the lines returned by `importFile(file)` into a `DataContainer`.
 *
 * The first line is treated as the column-header row; every following line is a data row.
 * A data row is kept only when it splits into exactly as many cells as the header and
 * `validateCells` returns a defined value for every cell. All other rows are dropped and
 * counted as failures.
 *
 * The failure rate is the fraction of data rows that were dropped (failed / total, so a
 * value between 0.0 and 1.0; 0.0 when there are no data rows at all).
 * NOTE(review): `acceptableFailurePercentage` is compared directly against that fraction —
 * confirm callers express it as a fraction rather than on a 0-100 scale.
 *
 * NOTE(review): an input with no lines still fails on `lines.head`, as before.
 *
 * @return a `DataContainer` built from the headers and the clean rows, in file order
 * @throws Exception if the failure rate exceeds `acceptableFailurePercentage`
 */
def processFile = {
  val lines = importFile(file)

  // NOTE(review): this pattern means "one whitespace character followed by any number of
  // spaces", so two consecutive tabs produce an empty cell. Confirm that is intended
  // (as opposed to "\\s+") before changing it.
  val separator = "\\s *"

  val columnHeaders = lines.head.split(separator).toVector
  val vectorLength = columnHeaders.length

  // One entry per data row: Some(cells) for a clean row, None for a rejected one.
  // Using Option instead of an empty-vector sentinel keeps the failure count recoverable.
  val parsedRows = lines.tail.zipWithIndex.map { case (line, i) =>
    val cells = line.split(separator)
    if (cells.length != vectorLength) {
      println(s"There is missing data in row ${i}. Skipping..")
      None
    } else {
      // Validate every cell (not just up to the first bad one) so that validateCells
      // is invoked for the whole row, exactly as before.
      val result = cells.zipWithIndex.map { case (cell, j) => validateCells(cell, i, j) }
      if (result.forall(_.isDefined)) Some(result.toVector) else None
    }
  }.toVector

  // flatten keeps the clean rows in their original file order.
  val cleanRows = parsedRows.flatten
  val totalRows = parsedRows.length
  val failedRows = totalRows - cleanRows.length

  // Guard the empty-data case so the rate is 0.0 instead of NaN (0 / 0).
  val failureRate = if (totalRows == 0) 0.0 else failedRows.toDouble / totalRows

  if (failureRate > acceptableFailurePercentage) {
    throw new Exception(s"The failure rate was $failureRate. It does not pass the success")
  } else {
    println(s"There were ${cleanRows.length} clean lines out of ${totalRows} rows of data. Error Rate: ${failureRate}")
    new DataContainer(columnHeaders, cleanRows, acceptableFailurePercentage)
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment