Skip to content

Instantly share code, notes, and snippets.

@jkinkead
Last active August 29, 2015 14:09
Show Gist options
  • Save jkinkead/2a3987bc64daddb46701 to your computer and use it in GitHub Desktop.
Save jkinkead/2a3987bc64daddb46701 to your computer and use it in GitHub Desktop.
import org.allenai.common.Config._
/** Reads per-paper classification values from tab-separated files whose paths are looked up in
  * a config object.
  */
object ClassificationReader {
  /** Reads classifications from disk, using the given config to find the classification file(s).
    * This requires that the config have a key {classificationType}.file for each classification
    * type.
    *
    * Every file contributes one value per paper ID; values read for the same paper ID from
    * different files are merged into a single PaperClassification.
    *
    * @param config the config holding a `{classificationType}.file` path for every
    * classification type
    * @return one PaperClassification per distinct paper ID, in no guaranteed order (the result is
    * built from a `groupBy`)
    * @throws IllegalArgumentException if any of the required keys are missing, or if any file
    * contains a malformed line
    * @throws NumberFormatException if any value in any file isn't a Double
    */
  def readClassifications(config: Config): Seq[PaperClassification] = {
    // Read a classification per file. This will generate PaperClassification instances with a
    // single value.
    val paperClassifications = for {
      classification <- ClassificationTypes.Values
      filePath = config.get[String](s"$classification.file") getOrElse {
        throw new IllegalArgumentException(s"config is missing key $classification.file")
      }
      // The config value is a path string, while readFromFile expects a File.
      (paperId, value) <- readFromFile(new File(filePath))
    } yield PaperClassification(paperId, Map(classification -> value))

    // Collapse duplicate entries into a single sequence. Iterating over the grouped Map yields
    // an Iterable, so convert explicitly to honor the declared Seq return type.
    val merged = for {
      (paperId, classifications) <- paperClassifications.groupBy(_.paperId)
      allClassifications = (classifications flatMap { _.classifications }).toMap
    } yield PaperClassification(paperId, allClassifications)
    merged.toSeq
  }

  /** Reads a classification from a file. Each line must hold a paper ID and a numeric value
    * separated by a single tab. If a paper ID occurs on more than one line, the value from the
    * last such line is kept.
    *
    * @param file the tab-separated file to read
    * @return the mapping of paper ID to classification value
    * @throws IllegalArgumentException if any line doesn't contain two tab-separated values
    * @throws NumberFormatException if the second value on any line isn't a Double
    */
  def readFromFile(file: File): Map[String, Double] = {
    Resource.using(Source.fromFile(file)) { source =>
      val entries = for {
        line <- source.getLines()
      } yield line.split('\t') match {
        case Array(paperId, value) => (paperId -> value.toDouble)
        case _ => throw new IllegalArgumentException("Malformed line " + line)
      }
      // getLines() is a lazy iterator: build the map here, while the source is still open,
      // instead of after the using block has returned.
      entries.toMap
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment