Skip to content

Instantly share code, notes, and snippets.

@rahulsom
Last active August 29, 2015 14:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rahulsom/89551dc768425a4558ab to your computer and use it in GitHub Desktop.
Save rahulsom/89551dc768425a4558ab to your computer and use it in GitHub Desktop.
CCD Analyzer
/*
* GPars (pronounced Jeepers) is a Groovy library that offers multiple concurrency models - executors, parallel
* collections, actors, message passing, etc.
* So you can choose one depending on what you need.
*
* We just happen to use parallel collections
*/
@Grab("org.codehaus.gpars:gpars:1.2.1")
import groovyx.gpars.GParsPool
// Number of times each section code has been seen across all documents.
def sectionFrequency = [:]
// Title recorded for each section code the first time that code is seen.
def sectionNames = [:]
def arrayOfFiles = new File("/Users/rahul/data/certify-ccds").listFiles()
/*
 * The closure passed here gets the benefits of added methods for GPars concurrency models based on executor pools.
 */
GParsPool.withPool {
    arrayOfFiles.eachParallel { File file ->
        /*
         * You could do `file.text` and that would load the file into a String and then parse it. It ends up being
         * a tax on the Garbage Collector. So we prefer using streams.
         * `withInputStream` closes the stream as soon as the closure returns or throws, instead of leaving the
         * open file handle to the parser or the garbage collector.
         * When you have an element in an xml, you typically just use the element name as is (unless it is a keyword).
         * If you're referring to an attribute, you prefix it with the `@` sign.
         */
        def xml = file.withInputStream { stream -> new XmlSlurper().parse(stream) }
        def sections = xml.component.structuredBody.component.section
        sections.each { section ->
            def sectionName = section.code.@code.text()
            /*
             * This is a questionable choice here, i.e. using `synchronized`. However it makes sense given we have
             * only 2 or 4 core processors, and we **know** for a fact that updating the hashmap is a very small
             * percent of the time in each document, and a majority of time goes into reading and parsing the file.
             *
             * If we had a 64-core server grade hardware, and/or documents were smaller than they are, our choice
             * would differ significantly.
             */
            synchronized (sectionFrequency) {
                if (sectionFrequency.containsKey(sectionName)) {
                    sectionFrequency[sectionName]++
                } else {
                    // First sighting of this code: start the count and remember this section's title.
                    sectionFrequency[sectionName] = 1
                    sectionNames[sectionName] = section.title.text()
                }
            }
        }
    }
}
// Report one line per section code: code - occurrence count - recorded title.
sectionFrequency.each { k, v ->
    println "$k - $v - ${sectionNames[k]}"
}
/*
* We removed GPars because we can use Java 8's native streams on collections and lambdas.
* It makes the code slightly more functional.
*
* What's fun is that Groovy implicitly coerces Closures to the functional interfaces (SAM types) that Java lambdas implement
*/
// Directory holding the CCD documents; its entries are copied into a List so they can be streamed.
def ccdDirectory = new File("/Users/rahul/data/certify-ccds")
def arrayOfFiles = ccdDirectory.listFiles().toList()
/**
 * Merges a map of values into an accumulator of per-key value lists, returning a new map and leaving
 * both arguments unmodified.
 *
 * For every key in {@code b}, its value is added to the list already held for that key in {@code accum}
 * (or to a new list when the key is not present yet). Groovy's {@code List + x} appends {@code x} when it
 * is a single value and concatenates when {@code x} is itself a list, so {@code b} may be either a
 * single-valued map or another accumulator of lists.
 *
 * @param accum keys mapped to the lists of values collected so far
 * @param b     keys mapped to a single value, or to a list of values, to add
 * @return a new map containing every key of {@code accum} and {@code b}, each mapped to its combined list
 */
def <K, V> Map<K, List<V>> combineMaps(Map<K, List<V>> accum, Map<K, ?> b) {
    accum + b.collectEntries { k, v -> [k, (accum[k] ?: []) + v] }
}
/*
 * Each document is mapped to a [sectionCode: title] map, the per-document maps are folded into
 * [sectionCode: [titles...]] by combineMaps, and the folded map is summarized as
 * [sectionCode, [count: ..., title: ...]] pairs.
 * Because each document is first collapsed into a map keyed by section code, a code that repeats inside
 * one document contributes a single entry for that document.
 */
def result = arrayOfFiles.parallelStream().map { File file ->
    /*
     * You could do `file.text` and that would load the file into a String and then parse it. It ends up being
     * a tax on the Garbage Collector. So we prefer using streams.
     * `withInputStream` closes the stream as soon as the closure returns or throws, instead of leaving the
     * open file handle to the parser or the garbage collector.
     * When you have an element in an xml, you typically just use the element name as is (unless it is a keyword).
     * If you're referring to an attribute, you prefix it with the `@` sign.
     */
    def xml = file.withInputStream { stream -> new XmlSlurper().parse(stream) }
    def sections = xml.component.structuredBody.component.section
    sections.collectEntries { section ->
        [section.code.@code.text(), section.title.text()]
    }
}.reduce([:]) {accum, newMap -> combineMaps(accum, newMap) }.collect {k, v ->
    // v is the list of titles gathered for code k; report how many there are and the first one.
    [k, [count: v.size(), title: v.head()] ]
}
result.each {k, v -> println "$k - ${v.count} - ${v.title}" }
/*
* This script uses static typing on our CCDs. This makes sure we don't mistype something, and also lets the IDE give us
* code completion.
*/
@Grab('com.github.rahulsom:ihe-iti:0.8')
import com.github.rahulsom.cda.POCDMT000040ClinicalDocument
import javax.xml.bind.JAXBContext
// Directory holding the CCD documents; its entries are copied into a List so they can be streamed.
def ccdDirectory = new File("/Users/rahul/data/certify-ccds")
def arrayOfFiles = ccdDirectory.listFiles().toList()
/**
 * Merges a map of values into an accumulator of per-key value lists, returning a new map and leaving
 * both arguments unmodified.
 *
 * For every key in {@code b}, its value is added to the list already held for that key in {@code accum}
 * (or to a new list when the key is not present yet). Groovy's {@code List + x} appends {@code x} when it
 * is a single value and concatenates when {@code x} is itself a list, so {@code b} may be either a
 * single-valued map or another accumulator of lists.
 *
 * @param accum keys mapped to the lists of values collected so far
 * @param b     keys mapped to a single value, or to a list of values, to add
 * @return a new map containing every key of {@code accum} and {@code b}, each mapped to its combined list
 */
def <K, V> Map<K, List<V>> combineMaps(Map<K, List<V>> accum, Map<K, ?> b) {
    accum + b.collectEntries { k, v -> [k, (accum[k] ?: []) + v] }
}
// JAXB context for the CDA document class; created once and referenced by every mapping closure below.
def context = JAXBContext.newInstance(POCDMT000040ClinicalDocument)
/*
 * Each document is mapped to a [sectionCode: title] map, the per-document maps are folded into
 * [sectionCode: [titles...]] by combineMaps, and the folded map is summarized as
 * [sectionCode, [count: ..., title: ...]] pairs.
 */
def result = arrayOfFiles.parallelStream().map { File file ->
    // A new unmarshaller is created for each file; `.value` unwraps the object returned by unmarshal.
    POCDMT000040ClinicalDocument doc = context.createUnmarshaller().unmarshal(file).value
    def sections = doc.component.structuredBody.component.section
    sections.collectEntries { section ->
        // A section without a title (or with an empty one) is recorded with an empty title instead of
        // throwing from a null dereference or from `head()` on an empty list.
        def titleContent = section.title?.content
        [section.code.code, titleContent ? titleContent.head() : '']
    }
}.reduce([:]) { accum, newMap -> combineMaps(accum, newMap) }.collect { k, v ->
    // v is the list of titles gathered for code k; report how many there are and the first one.
    [k,
     [count: v.size(),
      title: v.head()]
    ]
}
result.each { k, v -> println "$k - ${v.count} - ${v.title}" }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment