Last active
August 29, 2015 14:08
-
-
Save rahulsom/89551dc768425a4558ab to your computer and use it in GitHub Desktop.
CCD Analyzer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * GPars (pronounced "Jeepers") is a Groovy library that offers multiple concurrency models: executors, parallel
 * collections, actors, message passing, etc.
 * You can choose whichever one fits your needs.
 *
 * This script happens to use parallel collections.
 */
@Grab("org.codehaus.gpars:gpars:1.2.1") | |
import groovyx.gpars.GParsPool | |
// Shared accumulators, keyed by section code. They are mutated from the parallel workers below, which guard
// every access with `synchronized (sectionFrequency)`.
def sectionFrequency = [:]   // section code -> number of sections seen with that code
def sectionNames = [:]       // section code -> title of the first section seen with that code

def ccdDirectory = new File("/Users/rahul/data/certify-ccds")
def arrayOfFiles = ccdDirectory.listFiles()
// File.listFiles() returns null (not an empty array) when the path does not exist, is not a directory, or
// cannot be read. Fail here with a clear message instead of an obscure error inside the parallel loop.
if (arrayOfFiles == null) {
    throw new FileNotFoundException("CCD directory is missing or not readable: " + ccdDirectory)
}
/*
 * The closure passed here gets the benefits of added methods for GPars concurrency models based on executor pools
 * (for example `eachParallel` on collections and arrays).
 */
GParsPool.withPool {
    arrayOfFiles.eachParallel { File file ->
        /*
         * You could do `file.text` and that would load the file into a String and then parse it. It ends up being
         * a tax on the Garbage Collector. So we prefer using streams.
         *
         * `withInputStream` closes the stream once parsing is done, even if the parser throws; a bare
         * `newInputStream()` would leave one open file handle behind per document.
         *
         * When you have an element in an xml, you typically just use the element name as is (unless it is a
         * keyword). If you're referring to an attribute, you prefix it with the `@` sign.
         */
        def xml = file.withInputStream { stream -> new XmlSlurper().parse(stream) }
        def sections = xml.component.structuredBody.component.section
        sections.each { section ->
            def sectionName = section.code.@code.text()
            /*
             * This is a questionable choice here, i.e. using `synchronized`. However it makes sense given we have
             * only 2 or 4 core processors, and we **know** for a fact that updating the hashmap is a very small
             * percent of the time in each document, and a majority of time goes into reading and parsing the file.
             *
             * If we had a 64-core server grade hardware, and/or documents were smaller than they are, our choice
             * would differ significantly.
             */
            synchronized (sectionFrequency) {
                if (sectionFrequency.containsKey(sectionName)) {
                    sectionFrequency[sectionName]++
                } else {
                    // First time this code is seen: start the count and remember this section's title.
                    sectionFrequency[sectionName] = 1
                    sectionNames[sectionName] = section.title.text()
                }
            }
        }
    }
}
// Report one line per section code: "<code> - <count> - <title>".
for (entry in sectionFrequency) {
    def code = entry.key
    def occurrences = entry.value
    println "$code - $occurrences - ${sectionNames[code]}"
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * We removed GPars because Java 8's native streams and lambdas can be used directly on collections.
 * It makes the code slightly more functional.
 *
 * What's fun is that Groovy implicitly coerces closures to the functional interfaces that Java lambdas target,
 * so a closure can be passed wherever the stream API expects a lambda.
 */
def ccdDirectory = new File("/Users/rahul/data/certify-ccds")
def ccdFiles = ccdDirectory.listFiles()
// File.listFiles() returns null (not an empty array) when the path does not exist, is not a directory, or
// cannot be read. Fail with a clear message rather than a NullPointerException on `.toList()`.
if (ccdFiles == null) {
    throw new FileNotFoundException("CCD directory is missing or not readable: " + ccdDirectory)
}
// A List is needed (rather than the raw array) so that `parallelStream()` is available.
def arrayOfFiles = ccdFiles.toList()
/**
 * Merges {@code b} into {@code accum}, collecting the values of each key into a list.
 *
 * Neither argument is modified; a new map is returned. Keys present only in {@code accum} are carried over
 * unchanged. For each key of {@code b}, its value is appended to the list already held by {@code accum}
 * (or to a new list when the key is absent).
 *
 * A value of {@code b} may be either a single item or a collection of items. The collection case arises when
 * this method is used as the accumulator of a parallel {@code Stream.reduce}, where it is also called to merge
 * two partial results, both of which already hold lists.
 *
 * @param accum values gathered so far, keyed by K
 * @param b     map to fold in; each value is a single item or a collection of items
 * @return a new map holding, for every key, the combined list of values
 */
def <K, V> Map<K, List<V>> combineMaps(Map<K, List<V>> accum, Map<K, ?> b) {
    def merged = b.collectEntries { key, value ->
        def existing = accum[key] ?: []
        // Concatenate when given a collection (partial result); otherwise append the single item.
        def additions = (value instanceof Collection) ? (value as List) : [value]
        [key, existing + additions]
    }
    accum + merged
}
/*
 * Pipeline: parse every file in parallel into a map of `section code -> section title`, fold those maps into
 * `section code -> list of titles`, then turn each list into a count plus a representative title.
 *
 * NOTE(review): `collectEntries` keeps one entry per section code within a document, so the count is the number
 * of documents containing a code, not the number of sections. Confirm that is the intended statistic.
 */
def result = arrayOfFiles.parallelStream().map { File file ->
    /*
     * You could do `file.text` and that would load the file into a String and then parse it. It ends up being
     * a tax on the Garbage Collector. So we prefer using streams.
     *
     * `withInputStream` closes the stream once parsing is done, even if the parser throws; a bare
     * `newInputStream()` would leave one open file handle behind per document.
     *
     * When you have an element in an xml, you typically just use the element name as is (unless it is a keyword).
     * If you're referring to an attribute, you prefix it with the `@` sign.
     */
    def xml = file.withInputStream { stream -> new XmlSlurper().parse(stream) }
    def sections = xml.component.structuredBody.component.section
    sections.collectEntries { section ->
        [section.code.@code.text(), section.title.text()]
    }
}.reduce([:]) { accum, newMap ->
    // On a parallel stream this closure also merges two partial results, so `newMap` may hold either single
    // titles or lists of titles. combineMaps copes with both because Groovy's `+` on a List appends a single
    // item and concatenates another list.
    combineMaps(accum, newMap)
}.collect { k, v ->
    [k, [count: v.size(), title: v.head()]]
}
// `result` is a list of [code, details] pairs; Groovy spreads each pair over the two closure parameters.
result.each { k, v -> println "$k - ${v.count} - ${v.title}" }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* This script uses static typing on our CCDs. This makes sure we don't mistype something, and also lets the IDE give us | |
* code completion. | |
*/ | |
@Grab('com.github.rahulsom:ihe-iti:0.8') | |
import com.github.rahulsom.cda.POCDMT000040ClinicalDocument | |
import javax.xml.bind.JAXBContext | |
def ccdDirectory = new File("/Users/rahul/data/certify-ccds")
def ccdFiles = ccdDirectory.listFiles()
// File.listFiles() returns null (not an empty array) when the path does not exist, is not a directory, or
// cannot be read. Fail with a clear message rather than a NullPointerException on `.toList()`.
if (ccdFiles == null) {
    throw new FileNotFoundException("CCD directory is missing or not readable: " + ccdDirectory)
}
// A List is needed (rather than the raw array) so that `parallelStream()` is available.
def arrayOfFiles = ccdFiles.toList()
/**
 * Merges {@code b} into {@code accum}, collecting the values of each key into a list.
 *
 * Neither argument is modified; a new map is returned. Keys present only in {@code accum} are carried over
 * unchanged. For each key of {@code b}, its value is appended to the list already held by {@code accum}
 * (or to a new list when the key is absent).
 *
 * A value of {@code b} may be either a single item or a collection of items. The collection case arises when
 * this method is used as the accumulator of a parallel {@code Stream.reduce}, where it is also called to merge
 * two partial results, both of which already hold lists.
 *
 * @param accum values gathered so far, keyed by K
 * @param b     map to fold in; each value is a single item or a collection of items
 * @return a new map holding, for every key, the combined list of values
 */
def <K, V> Map<K, List<V>> combineMaps(Map<K, List<V>> accum, Map<K, ?> b) {
    def merged = b.collectEntries { key, value ->
        def existing = accum[key] ?: []
        // Concatenate when given a collection (partial result); otherwise append the single item.
        def additions = (value instanceof Collection) ? (value as List) : [value]
        [key, existing + additions]
    }
    accum + merged
}
// One JAXBContext is built up front and shared by all worker threads; each file gets its own Unmarshaller.
// NOTE(review): this relies on JAXBContext being safe for concurrent use, as JAXB implementations document.
def context = JAXBContext.newInstance(POCDMT000040ClinicalDocument)

/*
 * Pipeline: unmarshal every file in parallel into a map of `section code -> section title`, fold those maps into
 * `section code -> list of titles`, then turn each list into a count plus a representative title.
 */
def result = arrayOfFiles.parallelStream().map { File file ->
    POCDMT000040ClinicalDocument doc = context.createUnmarshaller().unmarshal(file).value
    // Null-safe navigation: a document without a structured body contributes no sections instead of
    // aborting the whole run with a NullPointerException.
    def sections = doc.component?.structuredBody?.component?.section ?: []
    sections.collectEntries { section ->
        // A section with no title (or an empty title) maps to '', which is what the XmlSlurper-based
        // variants of this script produce via `section.title.text()`.
        def titleParts = section.title?.content
        [section.code?.code, titleParts ? titleParts.head() : '']
    }
}.reduce([:]) { accum, newMap ->
    // On a parallel stream this closure also merges two partial results, so `newMap` may hold either single
    // titles or lists of titles. combineMaps copes with both because Groovy's `+` on a List appends a single
    // item and concatenates another list.
    combineMaps(accum, newMap)
}.collect { k, v ->
    [k,
     [count: v.size(),
      title: v.head()]
    ]
}
// `result` is a list of [code, details] pairs; Groovy spreads each pair over the two closure parameters.
result.each { k, v -> println "$k - ${v.count} - ${v.title}" }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment