rahulsom/FirstPass.groovy

## FirstPass.groovy
/*
 * GPars (pronounced Jeepers) is a groovy library that offers multiple concurrency models - executors, parallel
 * collections, actors, message passing, etc.
 * So you can choose one depending on what you need.
 *
 * We just happen to use parallel collections
 */
@Grab("org.codehaus.gpars:gpars:1.2.1")
import groovyx.gpars.GParsPool

// Shared tallies filled in by the parallel loop: occurrences per section code, and the title
// recorded the first time each code is seen.
Map sectionFrequency = [:]
Map sectionNames = [:]
// Every entry of the data directory is handed to the parser.
File[] arrayOfFiles = new File("/Users/rahul/data/certify-ccds").listFiles()

/*
 * The closure passed here gets the benefits of added methods for GPars concurrency models based on executor pools.
 */
GParsPool.withPool {
    // Files are processed through GPars' parallel `each`; only the map update further down is serialized.
    arrayOfFiles.eachParallel { File file ->
        /*
         * You could do `file.text` and that would load the file into a String and then parse it. It ends up being
         * a tax on the Garbage Collector. So we prefer using streams.
         * XmlSlurper.parse(InputStream) leaves closing the stream to the caller, so the parse runs inside
         * `withInputStream`, which closes the stream whether parsing succeeds or throws.
         * When you have an element in an xml, you typically just use the element name as is (unless it is a keyword).
         * If you're referring to an attribute, you prefix it with the `@` sign.
         */
        def xml = file.withInputStream { stream -> new XmlSlurper().parse(stream) }
        def sections = xml.component.structuredBody.component.section

        sections.each { section ->
            def sectionName = section.code.@code.text()
            /*
             * This is a questionable choice here, i.e. using `synchronized`. However it makes sense given we have
             * only 2 or 4 core processors, and we **know** for a fact that updating the hashmap is a very small
             * percent of the time in each document, and a majority of time goes into reading and parsing the file.
             *
             * If we had a 64-core server grade hardware, and/or documents were smaller than they are, our choice
             * would differ significantly.
             */
            synchronized (sectionFrequency) {
                if (sectionFrequency.containsKey(sectionName)) {
                    sectionFrequency[sectionName]++
                } else {
                    // First sighting of this code: start the count and remember its title.
                    sectionFrequency[sectionName] = 1
                    sectionNames[sectionName] = section.title.text()
                }
            }
        }
    }
}

// One output line per section code: code, number of occurrences, recorded title.
for (entry in sectionFrequency) {
    def (k, v) = [entry.key, entry.value]
    println "$k - $v - ${sectionNames[k]}"
}

## SecondPass.groovy
/*
 * We removed GPars because we can use Java 8's native streams on collections and lambdas.
 * It makes the code slightly more functional.
 *
 * What's fun is that Groovy implicitly coerces Closures to the functional interfaces that Java lambdas target
 */

// List the data directory, then copy the array into a List so it can be streamed.
File[] ccdFiles = new File("/Users/rahul/data/certify-ccds").listFiles()
def arrayOfFiles = ccdFiles.toList()

/**
 * Folds one map into an accumulator that keeps, per key, the list of values collected so far.
 *
 * Neither argument is modified: the result is a new map, and every touched list is a new list.
 * Keys of {@code accum} that do not occur in {@code b} are carried over unchanged.
 *
 * @param accum values collected so far, one list per key
 * @param b     entries to fold in; each value is added to its key's list with Groovy's {@code List + value}
 * @return a new map holding the entries of {@code accum} with the lists for the keys of {@code b} extended
 */
def <K, V> Map<K, List<V>> combineMaps(Map<K, List<V>> accum, Map<K, V> b) {
    // A key that is missing from accum starts from an empty list.
    accum + b.collectEntries { key, value -> [key, (accum[key] ?: []) + value] }
}

// Pipeline: each file -> map of section code to title; all maps -> code to list of titles;
// finally one [code, [count, title]] pair per code.
def result = arrayOfFiles.parallelStream().map { File file ->
    /*
     * You could do `file.text` and that would load the file into a String and then parse it. It ends up being
     * a tax on the Garbage Collector. So we prefer using streams.
     * XmlSlurper.parse(InputStream) leaves closing the stream to the caller, so the parse runs inside
     * `withInputStream`, which closes the stream whether parsing succeeds or throws.
     * When you have an element in an xml, you typically just use the element name as is (unless it is a keyword).
     * If you're referring to an attribute, you prefix it with the `@` sign.
     */
    def xml = file.withInputStream { stream -> new XmlSlurper().parse(stream) }
    def sections = xml.component.structuredBody.component.section

    sections.collectEntries { section ->
        [section.code.@code.text(), section.title.text()]
    }
}.reduce([:]) {accum, newMap -> combineMaps(accum, newMap) }.collect {k, v ->
    // v is the list of titles gathered for code k; its size is the count, its first entry the title shown.
    [k, [count: v.size(), title: v.head()] ]
}

// Each element of result is a two-item list, which Groovy spreads over the (k, v) parameters.
result.each {k, v -> println "$k - ${v.count} - ${v.title}" }

## ThirdPass.groovy
/*
 * This script uses static typing on our CCDs. This makes sure we don't mistype something, and also lets the IDE give us
 * code completion.
 */
@Grab('com.github.rahulsom:ihe-iti:0.8')
import com.github.rahulsom.cda.POCDMT000040ClinicalDocument
import javax.xml.bind.JAXBContext

// List the data directory, then copy the array into a List so it can be streamed.
File[] ccdFiles = new File("/Users/rahul/data/certify-ccds").listFiles()
def arrayOfFiles = ccdFiles.toList()

/**
 * Folds one map into an accumulator that keeps, per key, the list of values collected so far.
 *
 * Neither argument is modified: the result is a new map, and every touched list is a new list.
 * Keys of {@code accum} that do not occur in {@code b} are carried over unchanged.
 *
 * @param accum values collected so far, one list per key
 * @param b     entries to fold in; each value is added to its key's list with Groovy's {@code List + value}
 * @return a new map holding the entries of {@code accum} with the lists for the keys of {@code b} extended
 */
def <K, V> Map<K, List<V>> combineMaps(Map<K, List<V>> accum, Map<K, V> b) {
    def extended = b.collectEntries { key, value ->
        // A key that is missing from accum starts from an empty list.
        def soFar = accum[key] ?: []
        [key, soFar + value]
    }
    accum + extended
}
// The JAXB context is built once up front; every file gets its own unmarshaller from it.
def context = JAXBContext.newInstance(POCDMT000040ClinicalDocument)
def result = arrayOfFiles.parallelStream().map { File file ->
    def unmarshaller = context.createUnmarshaller()
    POCDMT000040ClinicalDocument doc = unmarshaller.unmarshal(file).value

    // Per document: section code -> first content item of that section's title.
    doc.component.structuredBody.component.section.collectEntries { section ->
        [(section.code.code): section.title.content.head()]
    }
}.reduce([:]) { accum, newMap ->
    combineMaps(accum, newMap)
}.collect { entry ->
    // entry.value is the list of titles gathered for this code.
    def titles = entry.value
    [entry.key, [count: titles.size(), title: titles.head()]]
}

// One output line per section code: code, count, title.
for (row in result) {
    def (k, v) = row
    println "$k - ${v.count} - ${v.title}"
}
	/*
	* GPars (pronounced Jeepers) is a groovy library that offers multiple concurrency models - executors, parallel
	* collections, actors, message passing, etc.
	* So you can choose one depending on what you need.
	*
	* We just happen to use parallel collections
	*/
	@Grab("org.codehaus.gpars:gpars:1.2.1")
	import groovyx.gpars.GParsPool

	// Shared tallies filled in by the parallel loop: occurrences per section code, and the title
	// recorded the first time each code is seen.
	Map sectionFrequency = [:]
	Map sectionNames = [:]
	// Every entry of the data directory is handed to the parser.
	File[] arrayOfFiles = new File("/Users/rahul/data/certify-ccds").listFiles()

	/*
	* The closure passed here gets the benefits of added methods for GPars concurrency models based on executor pools.
	*/
	GParsPool.withPool {
		// Files are processed through GPars' parallel `each`; only the map update further down is serialized.
		arrayOfFiles.eachParallel { File file ->
			/*
			 * You could do `file.text` and that would load the file into a String and then parse it. It ends up being
			 * a tax on the Garbage Collector. So we prefer using streams.
			 * XmlSlurper.parse(InputStream) leaves closing the stream to the caller, so the parse runs inside
			 * `withInputStream`, which closes the stream whether parsing succeeds or throws.
			 * When you have an element in an xml, you typically just use the element name as is (unless it is a keyword).
			 * If you're referring to an attribute, you prefix it with the `@` sign.
			 */
			def xml = file.withInputStream { stream -> new XmlSlurper().parse(stream) }
			def sections = xml.component.structuredBody.component.section

			sections.each { section ->
				def sectionName = section.code.@code.text()
				/*
				 * This is a questionable choice here, i.e. using `synchronized`. However it makes sense given we have
				 * only 2 or 4 core processors, and we know for a fact that updating the hashmap is a very small
				 * percent of the time in each document, and a majority of time goes into reading and parsing the file.
				 *
				 * If we had a 64-core server grade hardware, and/or documents were smaller than they are, our choice
				 * would differ significantly.
				 */
				synchronized (sectionFrequency) {
					if (sectionFrequency.containsKey(sectionName)) {
						sectionFrequency[sectionName]++
					} else {
						// First sighting of this code: start the count and remember its title.
						sectionFrequency[sectionName] = 1
						sectionNames[sectionName] = section.title.text()
					}
				}
			}
		}
	}

	// One output line per section code: code, number of occurrences, recorded title.
	for (entry in sectionFrequency) {
		def (k, v) = [entry.key, entry.value]
		println "$k - $v - ${sectionNames[k]}"
	}
	/*
	* We removed GPars because we can use Java 8's native streams on collections and lambdas.
	* It makes the code slightly more functional.
	*
	* What's fun is that Groovy implicitly coerces Closures to the functional interfaces that Java lambdas target
	*/

	// List the data directory, then copy the array into a List so it can be streamed.
	File[] ccdFiles = new File("/Users/rahul/data/certify-ccds").listFiles()
	def arrayOfFiles = ccdFiles.toList()

	/**
	 * Folds one map into an accumulator that keeps, per key, the list of values collected so far.
	 *
	 * Neither argument is modified: the result is a new map, and every touched list is a new list.
	 * Keys of {@code accum} that do not occur in {@code b} are carried over unchanged.
	 *
	 * @param accum values collected so far, one list per key
	 * @param b     entries to fold in; each value is added to its key's list with Groovy's {@code List + value}
	 * @return a new map holding the entries of {@code accum} with the lists for the keys of {@code b} extended
	 */
	def <K, V> Map<K, List<V>> combineMaps(Map<K, List<V>> accum, Map<K, V> b) {
		// A key that is missing from accum starts from an empty list.
		accum + b.collectEntries { key, value -> [key, (accum[key] ?: []) + value] }
	}

	// Pipeline: each file -> map of section code to title; all maps -> code to list of titles;
	// finally one [code, [count, title]] pair per code.
	def result = arrayOfFiles.parallelStream().map { File file ->
		/*
		 * You could do `file.text` and that would load the file into a String and then parse it. It ends up being
		 * a tax on the Garbage Collector. So we prefer using streams.
		 * XmlSlurper.parse(InputStream) leaves closing the stream to the caller, so the parse runs inside
		 * `withInputStream`, which closes the stream whether parsing succeeds or throws.
		 * When you have an element in an xml, you typically just use the element name as is (unless it is a keyword).
		 * If you're referring to an attribute, you prefix it with the `@` sign.
		 */
		def xml = file.withInputStream { stream -> new XmlSlurper().parse(stream) }
		def sections = xml.component.structuredBody.component.section

		sections.collectEntries { section ->
			[section.code.@code.text(), section.title.text()]
		}
	}.reduce([:]) {accum, newMap -> combineMaps(accum, newMap) }.collect {k, v ->
		// v is the list of titles gathered for code k; its size is the count, its first entry the title shown.
		[k, [count: v.size(), title: v.head()] ]
	}

	// Each element of result is a two-item list, which Groovy spreads over the (k, v) parameters.
	result.each {k, v -> println "$k - ${v.count} - ${v.title}" }
	/*
	* This script uses static typing on our CCDs. This makes sure we don't mistype something, and also lets the IDE give us
	* code completion.
	*/
	@Grab('com.github.rahulsom:ihe-iti:0.8')
	import com.github.rahulsom.cda.POCDMT000040ClinicalDocument
	import javax.xml.bind.JAXBContext

	// List the data directory, then copy the array into a List so it can be streamed.
	File[] ccdFiles = new File("/Users/rahul/data/certify-ccds").listFiles()
	def arrayOfFiles = ccdFiles.toList()

	/**
	 * Folds one map into an accumulator that keeps, per key, the list of values collected so far.
	 *
	 * Neither argument is modified: the result is a new map, and every touched list is a new list.
	 * Keys of {@code accum} that do not occur in {@code b} are carried over unchanged.
	 *
	 * @param accum values collected so far, one list per key
	 * @param b     entries to fold in; each value is added to its key's list with Groovy's {@code List + value}
	 * @return a new map holding the entries of {@code accum} with the lists for the keys of {@code b} extended
	 */
	def <K, V> Map<K, List<V>> combineMaps(Map<K, List<V>> accum, Map<K, V> b) {
		def extended = b.collectEntries { key, value ->
			// A key that is missing from accum starts from an empty list.
			def soFar = accum[key] ?: []
			[key, soFar + value]
		}
		accum + extended
	}
	// The JAXB context is built once up front; every file gets its own unmarshaller from it.
	def context = JAXBContext.newInstance(POCDMT000040ClinicalDocument)
	def result = arrayOfFiles.parallelStream().map { File file ->
		def unmarshaller = context.createUnmarshaller()
		POCDMT000040ClinicalDocument doc = unmarshaller.unmarshal(file).value

		// Per document: section code -> first content item of that section's title.
		doc.component.structuredBody.component.section.collectEntries { section ->
			[(section.code.code): section.title.content.head()]
		}
	}.reduce([:]) { accum, newMap ->
		combineMaps(accum, newMap)
	}.collect { entry ->
		// entry.value is the list of titles gathered for this code.
		def titles = entry.value
		[entry.key, [count: titles.size(), title: titles.head()]]
	}

	// One output line per section code: code, count, title.
	for (row in result) {
		def (k, v) = row
		println "$k - ${v.count} - ${v.title}"
	}