// Source: GitHub gist anuragkh/e0a5d40d760c4fb939c8 (last active January 19, 2016 17:16).
import edu.berkeley.cs.succinct._

// "Succinct"-ify the text: read the file line by line as byte arrays and
// convert it with `.succinct` (presumably an implicit provided by the import
// above — confirm against the Succinct Spark API).
val text = sc.textFile("/tmp/tmp.txt").map(_.getBytes)
val succinctText = text.succinct

// Read the annotations and keep only the lines that contain "sentence".
val annot = sc.textFile("/tmp/tmp.annot")
val sentenceAnnot = annot.filter(_.contains("sentence"))

// Split each annotation line on '^' and keep (field 0, (field 3, field 4)),
// with fields 3 and 4 parsed as the Long bounds of the sentence range.
// NOTE(review): field 0 is presumably an annotation/document id — confirm
// against the annotation file format.
val sentenceRanges = sentenceAnnot.map(_.split('^')).map(f => (f(0), (f(3).toLong, f(4).toLong)))

// Perform the regex searches, union them, and collect the (begin, end) pairs
// of every match to the driver as a local array.
val res1 = succinctText.regexSearch("(remains|is|still) (unknown|unclear|uncertain)")
val res2 = succinctText.regexSearch("(further|more|additional) (research|study|investigation)")
val res = res1.union(res2).map(r => (r.begin(), r.end())).collect
// Checks whether range r1 lies "within" range r2.
// Both ranges are (start, end) pairs; the comparison is inclusive on both
// sides, so a range is considered to be within itself.
// Returns true iff r1 starts at or after r2's start and ends at or before r2's end.
def within(r1: (Long, Long), r2: (Long, Long)): Boolean =
  r1._1 >= r2._1 && r1._2 <= r2._2
// Checks whether at least one range in the collection `r` lies within `r2`
// (as defined by `within`). Returns false for an empty collection.
// `exists` stops at the first hit instead of counting every match.
def withinAny(r: Array[(Long, Long)], r2: (Long, Long)): Boolean =
  r.exists(r1 => within(r1, r2))
// Find the sentence ranges which contain at least one regex match range.
// Note: This can also be replaced by a join between two RDDs
//       (corresponding to sentence ranges and regex matches) if
//       the number of results for both is large.
val openProbRanges = sentenceRanges.filter(sr => withinAny(res, sr._2))

// Do the same for Breast Cancer mentions: keep only the open-problem sentences
// that also contain a breast cancer match.
// The matches are collected to a local array first: withinAny takes an Array,
// and an RDD cannot be referenced from inside another RDD's transformation.
val bcEntries = succinctText.regexSearch("(breast cancer)|DCIS|(carcinoma of the breast)").map(e => (e.begin(), e.end())).collect
// Each element of openProbRanges is a (key, range) pair; compare against the range only.
val bcRanges = openProbRanges.filter(opr => withinAny(bcEntries, opr._2))
// (GitHub page footer — sign-up / sign-in prompt for commenting; not part of the script.)