Last active
August 29, 2015 14:03
-
-
Save mjhopkins/dce6e8d9d29a18f9c139 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scala.util.matching.Regex | |
import scalaz._ | |
import Scalaz._ | |
import java.io.File | |
/*
Process a file containing <key, string, float> columns and print some statistics about it.
The columns are assumed to be separated by tab characters.
*/
/**
 * Computes summary statistics for a text file whose lines hold three
 * tab-separated columns: a key, a free-text string and a float.
 */
object FileStatistics {

  /** Regex fragments for the three mandatory columns: each is a non-empty run of non-tab characters. */
  val threeColumns: List[String] = List.fill(3)("""([^\t]+)""")

  /** Matches a whole line: three tab-separated columns followed by anything (the remainder is ignored). */
  val lineRegex: Regex = new Regex(threeColumns.mkString("\t") + ".*")

  /**
   * Splits one line into its key, string and float columns.
   *
   * @param line a single line of the input file
   * @return the `(key, string, float)` triple
   * @throws scala.MatchError if the line does not have three non-empty tab-separated columns
   * @throws java.lang.NumberFormatException if the third column is not a valid float
   */
  def parseColumns(line: String): (String, String, Float) = line match {
    // A `match` (rather than a refutable `val` pattern) still fails with MatchError on bad input.
    case lineRegex(key, string, float) => (key, string, float.toFloat)
  }

  /** Splits text on whitespace runs, dropping the empty token produced by leading whitespace. */
  private def splitWords(text: String): Array[String] =
    text.split("\\s+").filter(_.nonEmpty)

  /** Running totals accumulated over the lines read so far. */
  private final case class Totals(
      chars: Int,
      words: Int,
      sum: Float,
      lines: Int,
      wordCounts: Map[String, Int]) {

    /** Folds one parsed line (its string column and numeric value) into the totals. */
    def add(text: String, value: Float): Totals = {
      val tokens = splitWords(text)
      val counts = tokens.foldLeft(wordCounts) { (acc, w) =>
        acc.updated(w, acc.getOrElse(w, 0) + 1)
      }
      Totals(chars + text.length, words + tokens.length, sum + value, lines + 1, counts)
    }
  }

  /**
   * Reads `file` line by line and renders a multi-line report with word, character
   * and numeric statistics. The file is read with `scala.io.Source`'s default codec
   * and is always closed, even when a line fails to parse.
   *
   * An empty file yields a report of zeros rather than an exception or NaN values.
   *
   * @param file the tab-separated input file
   * @return the formatted report
   * @throws scala.MatchError if a line is malformed (see [[parseColumns]])
   */
  def fileStatistics(file: File): String = {
    val source = scala.io.Source.fromFile(file)
    val totals =
      try {
        // Fold while the source is still open so the whole file never has to sit in memory.
        source.getLines().map(parseColumns).foldLeft(Totals(0, 0, 0f, 0, Map.empty[String, Int])) {
          case (acc, (_, text, value)) => acc.add(text, value)
        }
      } finally source.close()

    val uniqueWords = totals.wordCounts.keys.toSeq
    val greatestLength = if (uniqueWords.isEmpty) 0 else uniqueWords.map(_.length).max
    // Sorted so that ties are reported in a stable order instead of hash order.
    val longestWords = uniqueWords.filter(_.length == greatestLength).sorted
    val totalWordChars = totals.wordCounts.iterator.map { case (w, c) => w.length * c }.sum

    // Guarded division: an empty input reports 0.0 instead of NaN.
    def ratio(numerator: Double, denominator: Int): Double =
      if (denominator == 0) 0.0 else numerator / denominator

    val averageValue = if (totals.lines == 0) 0f else totals.sum / totals.lines

    s"""
       |Total words ${totals.words}
       |Total chars ${totals.chars}
       |Total unique words ${uniqueWords.size}
       |Average words per line ${ratio(totals.words.toDouble, totals.lines)}
       |Average chars per word ${ratio(totalWordChars.toDouble, totals.words)}
       |Longest word(s) ${longestWords mkString ","} ($greatestLength chars)
       |Total numeric value ${totals.sum}
       |Average numeric value $averageValue
       |""".stripMargin
  }

  /**
   * Prints the statistics report for the file named by the first argument,
   * falling back to `test.txt` in the working directory when no argument is given.
   */
  def main(args: Array[String]): Unit = {
    val path = args.headOption.getOrElse("test.txt")
    println(fileStatistics(new File(path)))
  }
}
/*
Running on a test file containing (columns are separated by tab characters)
key1 we all live in a yellow submarine 10.0
key2 red orange yellow green blue indigo violet 20.0
key3 live to eat don't eat to live 30.0
yields
Total words 21
Total chars 104
Total unique words 16
Average words per line 7.0
Average chars per word 4.095238095238095
Longest word(s) submarine (9 chars)
Total numeric value 60.0
Average numeric value 20.0
*/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment