Skip to content

Instantly share code, notes, and snippets.

@mjhopkins
Last active August 29, 2015 14:03
Show Gist options
  • Save mjhopkins/dce6e8d9d29a18f9c139 to your computer and use it in GitHub Desktop.
Save mjhopkins/dce6e8d9d29a18f9c139 to your computer and use it in GitHub Desktop.
import scala.util.matching.Regex
import scalaz._
import Scalaz._
import java.io.File
/*
Process a file containing <key, string, float> columns and print some stats on it.
Let's assume the columns are separated by tab characters.
*/
/**
 * Computes summary statistics for a tab-separated text file.
 *
 * Every line must hold at least three tab-separated columns: a key, a free-text
 * string and a floating point number. Anything after the third column is ignored.
 */
object FileStatistics {

  /** Regex fragments for the three mandatory columns; each captures a non-empty run of non-tab characters. */
  val threeColumns: List[String] = List.fill(3)("""([^\t]+)""")

  /** Matches a whole line: three captured columns separated by tabs, followed by anything. */
  val lineRegex: Regex = new Regex(threeColumns.mkString("\t") + ".*")

  /** Running totals collected while scanning the file line by line. */
  private final case class Totals(
      chars: Int,
      words: Int,
      wordChars: Int,
      sum: Float,
      lines: Int,
      wordCounts: Map[String, Int]) {

    /** Folds one parsed line (its text column and its numeric column) into the totals. */
    def add(text: String, value: Float): Totals = {
      // Trim first and drop empty tokens: splitting text that starts with
      // whitespace would otherwise produce a spurious empty "word".
      val tokens = text.trim.split("\\s+").toList.filterNot(_.isEmpty)
      val counts = tokens.foldLeft(wordCounts) { (acc, word) =>
        acc.updated(word, acc.getOrElse(word, 0) + 1)
      }
      Totals(
        chars = chars + text.length, // the character total deliberately includes whitespace
        words = words + tokens.size,
        wordChars = tokens.foldLeft(wordChars)(_ + _.length),
        sum = sum + value,
        lines = lines + 1,
        wordCounts = counts)
    }
  }

  /** Totals for a file without any lines. */
  private val noData: Totals = Totals(0, 0, 0, 0.0f, 0, Map.empty[String, Int])

  /**
   * Splits one line into its key, text and numeric columns.
   *
   * @param line a single line of the input file, without its line terminator
   * @return the key, the text column and the parsed numeric column
   * @throws MatchError if the line does not contain three non-empty tab-separated columns
   * @throws NumberFormatException if the third column is not a valid float
   */
  def parseColumns(line: String): (String, String, Float) = line match {
    case lineRegex(key, string, float) => (key, string, float.toFloat)
  }

  /**
   * Reads `file` and renders a multi-line report about its text and numeric columns.
   *
   * The file is scanned once and is always closed, even when a line fails to parse.
   * For an empty file every count and average is reported as zero and the list of
   * longest words is empty.
   *
   * @param file tab-separated input file, read with the platform default encoding
   * @return the report; it starts and ends with a newline
   */
  def fileStatistics(file: File): String = {
    val source = scala.io.Source.fromFile(file)
    val totals =
      try {
        source.getLines().foldLeft(noData) { (acc, line) =>
          val (_, text, value) = parseColumns(line)
          acc.add(text, value)
        }
      } finally source.close()
    render(totals)
  }

  /** Formats the collected totals as the textual report. */
  private def render(totals: Totals): String = {
    val uniqueWords = totals.wordCounts.keySet.toSeq
    val greatestLength = if (uniqueWords.isEmpty) 0 else uniqueWords.map(_.length).max
    // Sorted so that ties are listed in a stable order rather than in hash-map order.
    val longestWords = uniqueWords.filter(_.length == greatestLength).sorted

    // Averages over nothing are reported as zero instead of NaN.
    def average(total: Double, count: Int): Double = if (count == 0) 0.0 else total / count
    val averageValue = if (totals.lines == 0) 0.0f else totals.sum / totals.lines

    List(
      s"Total words ${totals.words}",
      s"Total chars ${totals.chars}",
      s"Total unique words ${uniqueWords.size}",
      s"Average words per line ${average(totals.words, totals.lines)}",
      s"Average chars per word ${average(totals.wordChars, totals.words)}",
      s"Longest word(s) ${longestWords mkString ","} ($greatestLength chars)",
      s"Total numeric value ${totals.sum}",
      s"Average numeric value $averageValue"
    ).mkString("\n", "\n", "\n")
  }

  /**
   * Prints the statistics of the file named by the first argument,
   * or of `test.txt` in the working directory when no argument is given.
   */
  def main(args: Array[String]): Unit = {
    val path = args.headOption.getOrElse("test.txt")
    println(fileStatistics(new File(path)))
  }
}
/*
Running on a test file containing the lines below (columns are tab-separated in the file; they are shown here with spaces)
key1 we all live in a yellow submarine 10.0
key2 red orange yellow green blue indigo violet 20.0
key3 live to eat don't eat to live 30.0
yields
Total words 21
Total chars 104
Total unique words 16
Average words per line 7.0
Average chars per word 4.095238095238095
Longest word(s) submarine (9 chars)
Total numeric value 60.0
Average numeric value 20.0
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment