Last active
August 29, 2015 14:09
-
-
Save jkinkead/2a3987bc64daddb46701 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.allenai.common.Config._ | |
/** Utilities for loading per-paper classification values from tab-separated files. */
object ClassificationReader {
  /** Reads classifications from disk, using the given config to find the classification file(s).
    * This requires that the config have a key {classificationType}.file for each classification
    * type.
    * @param config the config holding one `{classificationType}.file` path per classification type
    * @return one PaperClassification per distinct paper ID, holding every classification value
    * that was read for that paper
    * @throws IllegalArgumentException if any of the required keys are missing
    */
  def readClassifications(config: Config): Seq[PaperClassification] = {
    // Read a classification per file. This will generate PaperClassification instances with a
    // single value.
    val paperClassifications = for {
      classification <- ClassificationTypes.Values
      filePath = config.get[String](s"$classification.file") getOrElse {
        throw new IllegalArgumentException(s"config is missing key $classification.file")
      }
      // readFromFile takes a File, so wrap the configured path string.
      (paperId, value) <- readFromFile(new File(filePath))
    } yield PaperClassification(paperId, Map(classification -> value))

    // Collapse duplicate entries into a single instance per paper ID.
    val mergedClassifications = for {
      (paperId, classifications) <- paperClassifications.groupBy(_.paperId)
      allClassifications = (classifications flatMap { _.classifications }).toMap
    } yield PaperClassification(paperId, allClassifications)

    // Iterating over the grouped Map yields an Iterable; convert to the declared Seq.
    mergedClassifications.toSeq
  }

  /** Reads a classification from a file.
    * @param file the file to read; each line must hold a paper ID and a value separated by a tab
    * @return the mapping of paper ID to classification value
    * @throws IllegalArgumentException if any line doesn't contain two tab-separated values
    * @throws NumberFormatException if the second value on any line isn't a Double
    */
  def readFromFile(file: File): Map[String, Double] = {
    Resource.using(Source.fromFile(file)) { source =>
      val entries = for {
        line <- source.getLines()
        values = line.split('\t')
      } yield values match {
        case Array(paperId, value) => paperId -> value.toDouble
        case _ => throw new IllegalArgumentException(s"Malformed line $line")
      }
      // getLines() is lazy: materialize the map here, while the source is still open.
      entries.toMap
    }
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment