Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
PDF Highlighter script to read documents from AWS S3

Config and script for PDF Highlighter to get PDF documents from AWS S3.

We attach the script to Highlighter's "getDocumentProviderFn" hook. When a highlighting request contains the 's3File' parameter (our custom parameter name; it can be anything), the script returns an implementation of Highlighter's IDependency interface.

# PDF Highlighter configuration that wires in the S3 fetch script.
# NOTE(review): assumes this file is parsed as HOCON, where '#' starts a comment - confirm.
highlighter {
service {
scripting {
# Script file(s) to load; adjust the path to where s3-pdf-fetch.groovy lives on your machine.
files = ["D:/project-data/s3-pdf-fetch.groovy"]
# Name of the script function attached to the "getDocumentProviderFn" hook;
# must match the function defined in the script (getDocumentProvider).
getDocumentProviderFn = "getDocumentProvider"
}
}
}
import com.amazonaws.auth.BasicAWSCredentials
import com.amazonaws.services.s3.AmazonS3Client
import com.amazonaws.services.s3.model.*
import groovy.transform.Field
import org.apache.commons.io.IOUtils
import com.jobjects.highlighter.IDependency
//////////////////////////////////////////////////////////////////////
// Configuration - S3 access params
// Replace the placeholder values below with your own bucket name and AWS credentials.
// @Field turns each declaration into a script-level field so the functions below can use it.
// NOTE(review): the credentials are hard-coded in this script; keep the file out of
// version control or load them from a safer place - confirm what your deployment allows.
@Field String bucketName = "your-bucket-name"
@Field String myAccessKeyID = "..."
@Field String mySecretKey = "..."
//////////////////////////////////////////////////////////////////////
// Single S3 client built from the credentials above and used by getDocumentProvider().
// It must stay declared after myAccessKeyID / mySecretKey, which its initializer reads.
@Field AmazonS3Client s3Client = new AmazonS3Client(new BasicAWSCredentials(myAccessKeyID, mySecretKey))
/**
 * Logs the path of this script at INFO level.
 *
 * Uses the `log` and `scriptPath` variables, which are not defined in this
 * script (presumably supplied by PDF Highlighter's script binding - confirm).
 */
def init() {
    def loadedFrom = scriptPath
    log.info("Init script {}", loadedFrom)
}
/**
 * Document-provider hook: serves a PDF from S3 when the request carries an
 * "s3File" parameter.
 *
 * @param uri    not used by this function
 * @param params request parameters; only "s3File" (the S3 object key inside
 *               {@code bucketName}) is read
 * @return {@code null} when "s3File" is missing or empty, so that PDF Highlighter
 *         continues its usual request handling; otherwise a map of closures
 *         coerced to {@code IDependency} that reads the object's metadata and
 *         content from S3
 */
def getDocumentProvider(uri, params) {
    // check if there's "s3File" parameter in request
    def s3File = params.get("s3File")
    if (!s3File) {
        // if no, we return null and PDF Highlighter will continue request handling as usual
        return null
    }

    // At this point we have s3File that we'll use to make and return IDependency implementation...
    // Parameter "uri" can be anything as PDF Highlighter will not use it

    // Metadata is fetched lazily and at most once per returned provider; the
    // closures below share these variables.
    boolean fileRead = false
    long lastMod, length
    String contentType

    // Fetches content type, length and last-modified time from S3 on first use.
    // Any failure is rethrown as IOException and leaves fileRead false, so the
    // next metadata getter call retries the request.
    def readMeta = {
        if (!fileRead) {
            try {
                long start = System.currentTimeMillis()
                ObjectMetadata objectMetadata = s3Client.getObjectMetadata(bucketName, s3File)
                contentType = objectMetadata.getContentType()
                length = objectMetadata.getContentLength()
                lastMod = objectMetadata.getLastModified().getTime()
                log.info("Read {} meta in {} ms", s3File, System.currentTimeMillis() - start)
                fileRead = true
            }
            catch(e) {
                throw new IOException(e)
            }
        }
    }

    return [
        getInternalDocumentId: { return "s3_" + s3File },
        getCanonicalPath: { return "s3_" + s3File },
        getLastModified: { readMeta(); return lastMod },
        getContentType: { readMeta(); return contentType },
        // NOTE(review): the long content length is narrowed to int here, so objects
        // of 2 GB or more would report a wrong length - confirm this is acceptable.
        getFileLength: { readMeta(); return (int) length },
        getFileBytes: {
            long rStart = System.currentTimeMillis()
            S3Object s3object = s3Client.getObject(bucketName, s3File)
            S3ObjectInputStream inputStream = s3object.getObjectContent()
            byte[] bytes
            try {
                bytes = IOUtils.toByteArray(inputStream)
            }
            finally {
                // Always close the content stream: an unclosed S3 object stream keeps
                // its HTTP connection checked out of the client's connection pool.
                inputStream.close()
            }
            // Report the size actually read; the metadata "length" is still 0 if no
            // metadata getter has been called before getFileBytes.
            log.info("Read {} content of {} KB in {} ms", s3File, bytes.length.intdiv(1024), System.currentTimeMillis() - rStart)
            return bytes
        },
        cleanup: {}
    ] as IDependency
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.