Skip to content

Instantly share code, notes, and snippets.

@reality
Created January 16, 2020 22:23
Show Gist options
  • Save reality/56c3e7e8f4c862ce4d40e7e10ce716e8 to your computer and use it in GitHub Desktop.
Save reality/56c3e7e8f4c862ce4d40e7e10ce716e8 to your computer and use it in GitHub Desktop.
@Grab('com.xlson.groovycsv:groovycsv:1.3')
@Grab(group='org.apache.commons', module='commons-lang3', version='3.4')
@Grab(group='edu.stanford.nlp', module='stanford-corenlp', version='3.7.0')
@Grab(group='edu.stanford.nlp', module='stanford-corenlp', version='3.7.0', classifier='models')
@Grab(group='edu.stanford.nlp', module='stanford-parser', version='3.7.0')
import static com.xlson.groovycsv.CsvParser.parseCsv
import edu.stanford.nlp.pipeline.*
import edu.stanford.nlp.ling.*
import edu.stanford.nlp.semgraph.*
import org.apache.commons.lang3.RandomUtils
// Stage 1: draw 1000 text records from random positions in ./NOTEEVENTS.csv.
//
// The file is not parsed as CSV. Instead we seek to a random byte offset,
// skip forward to the next line containing '",' (treated as a record
// boundary), and then accumulate every following line until the next line
// containing '",'. The accumulated lines form one candidate record.
// The same record may be drawn more than once; no de-duplication is done.
println "Loading file... "
// Sampled records; consumed by the sentence-splitting stage later in the script.
def entries = []
def file = new RandomAccessFile('./NOTEEVENTS.csv', 'r')
try {
    println "File loaded. Beginning selection sequence..."
    if (file.length() == 0) {
        // Previously this surfaced as a NullPointerException on the first readLine().
        throw new IllegalStateException('./NOTEEVENTS.csv is empty; nothing to sample')
    }
    while (entries.size() < 1000) {
        // Jump to a random byte offset in [0, length).
        file.seek(RandomUtils.nextLong(0L, file.length()))

        // Skip the remainder of the record we landed in: advance to the next
        // boundary line. readLine() returns null at end of file, so guard it.
        def line = file.readLine()
        while (line != null && line.indexOf('",') == -1) {
            line = file.readLine()
        }

        // Accumulate the lines between this boundary line and the next one.
        // readLine() strips line terminators, so lines are joined with no separator.
        def buffer = new StringBuilder()
        if (line != null) {
            line = file.readLine()
            while (line != null && line.indexOf('",') == -1) {
                buffer.append(line)
                line = file.readLine()
            }
        }

        if (line == null) {
            // Hit end of file before a complete record was delimited;
            // discard the partial text and draw a new random offset.
            continue
        }

        def textRecord = buffer.toString()
        // NOTE(review): this keeps only records that contain no ' ' character at all,
        // which looks unintended (possibly a different literal was meant) - confirm.
        if (textRecord.indexOf(' ') == -1) {
            // Collapse whitespace runs to one space and put a space after every full stop.
            // The '\n' removal is kept from the original even though readLine() already
            // strips line terminators.
            entries << textRecord.replaceAll('\n', '').replaceAll('\\s+', ' ').replaceAll('\\.', '. ')
        }
        println entries.size()
    }
} finally {
    // Always release the file handle, including when sampling fails part-way.
    file.close()
}
// Stage 2: split every sampled record into sentences and write them,
// one per line, to sentences_healtac.txt.
println "Selecting sentences...."

// Only tokenisation and sentence splitting are requested from CoreNLP.
def nlpConfig = new Properties()
nlpConfig.setProperty("annotators", "tokenize, ssplit")

def splitter = new AnnotationPipeline()
splitter.addAnnotator(new StanfordCoreNLP(nlpConfig))

// Running count of processed records, printed as a progress indicator.
def processed = 0

// Each record is lower-cased before annotation. Every sentence of every
// record is kept; no filtering or per-record sampling is applied.
def sentences = entries.collectMany { note ->
    def doc = new Annotation(note.toLowerCase())
    splitter.annotate(doc)
    processed += 1
    println "${processed}"
    doc.get(CoreAnnotations.SentencesAnnotation.class).collect { sentence -> sentence.toString() }
}

// Drop any null entries before writing.
sentences.removeAll([null])
new File('sentences_healtac.txt').setText(sentences.join('\n'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment