Skip to content

Instantly share code, notes, and snippets.

Created January 28, 2020 13:26
Show Gist options
  • Save reality/8871727f65b26c35f9880c6ad43284c1 to your computer and use it in GitHub Desktop.
Save reality/8871727f65b26c35f9880c6ad43284c1 to your computer and use it in GitHub Desktop.
Some example code to lemmatise a directory of text documents using Stanford CoreNLP.
#!/usr/bin/env groovy
@Grab(group='commons-cli', module='commons-cli', version='1.4')
@Grab(group='org.apache.commons', module='commons-lang3', version='3.4')
@Grab(group='edu.stanford.nlp', module='stanford-corenlp', version='3.7.0')
@Grab(group='edu.stanford.nlp', module='stanford-corenlp', version='3.7.0', classifier='models')
@Grab(group='edu.stanford.nlp', module='stanford-parser', version='3.7.0')
import java.util.concurrent.*
import java.util.concurrent.atomic.*
import groovyx.gpars.*
import org.codehaus.gpars.*
import edu.stanford.nlp.pipeline.*
import edu.stanford.nlp.ling.*
import edu.stanford.nlp.semgraph.*
// Lemmatise every regular file found in ./texts with Stanford CoreNLP and write
// the lemmatised text to new_texts/<same file name>. Files are processed in
// parallel; a "done/total" progress line is printed after each file.
def props = new Properties()
props.put("annotators", "tokenize, ssplit, pos, lemma")
// A single pipeline instance is built once and shared by all workers.
StanfordCoreNLP pipeline = new StanfordCoreNLP(props)

// Collect the input files up front so the total is known for progress output.
// Sub-directories are skipped: reading `.text` from a directory would throw.
def files = []
new File('./texts').eachFile { if (it.isFile()) { files << it } }

// Create the output directory if needed; otherwise the first write fails.
def outDir = new File('new_texts')
outDir.mkdirs()

// Progress counter is bumped from many worker threads, so it must be atomic
// (a plain `++i` on a shared variable loses updates).
def i = new AtomicInteger(0)

GParsPool.withPool(85) { p -> // he he he
    files.eachParallel { e ->
        // --- Pre-processing: coerce layout/punctuation into sentence breaks ---
        def text = e.text
        text = text.replaceAll('\n\n', '.\n')    // blank line => end of sentence
        text = text.replaceAll('\u2022', '. ')   // bullet => sentence break
        text = text.replaceAll('–', '. ')        // dash => sentence break
        text = text.replaceAll('-', '. ')        // hyphen => sentence break
        text = text.replaceAll('\\s+', ' ')      // collapse all whitespace runs
        text = text.replaceAll(', \\?', '. ?')
        text = text.replaceAll('\\.', '. ')      // guarantee a space after every full stop

        // --- Annotation: without annotate() the document carries no sentences ---
        // Locale.ROOT keeps lower-casing independent of the machine's default locale.
        def aDocument = new Annotation(text.toLowerCase(Locale.ROOT))
        pipeline.annotate(aDocument)

        // --- Rebuild the text from lemmas, one space between tokens and sentences ---
        def lemmatised = new StringBuilder()
        // `?: []` guards against a document for which no sentences were produced.
        (aDocument.get(CoreAnnotations.SentencesAnnotation.class) ?: []).each { sentence ->
            def lemmas = sentence.get(CoreAnnotations.TokensAnnotation.class).collect { token ->
                token.get(CoreAnnotations.LemmaAnnotation.class)
            }
            lemmatised << lemmas.join(' ') << ' '
        }

        // --- Post-processing: re-attach commas and full stops to the preceding word ---
        def newText = lemmatised.toString()
        newText = newText.replaceAll(' ,', ',')
        newText = newText.replaceAll(' \\.', '.')

        new File(outDir, e.getName()).text = newText
        println "${i.incrementAndGet()}/${files.size()}"
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment