jobjects/00 PDF Highlighter script to read documents from AWS S3.md

## 00 PDF Highlighter script to read documents from AWS S3.md

      
    Raw
  

              00 PDF Highlighter script to read documents from AWS S3.md
            
          
    Config and script that let PDF Highlighter fetch PDF documents from AWS S3.
We attach the script to Highlighter's "getDocumentProviderFn" hook. When a highlighting
request contains the parameter 's3File' (our custom parameter name; it can be anything),
the script returns an implementation of Highlighter's IDependency interface.

  
## application.conf
# Scripting settings that attach the S3 fetch script to PDF Highlighter.
highlighter {
  service {
    scripting {
      # Script file(s) to load; point this at your copy of s3-pdf-fetch.groovy.
      files = ["D:/project-data/s3-pdf-fetch.groovy"]
      # Name of the script function attached to the "getDocumentProviderFn" hook.
      getDocumentProviderFn = "getDocumentProvider"
    }
  }
}

## s3-pdf-fetch.groovy
import com.amazonaws.auth.BasicAWSCredentials
import com.amazonaws.services.s3.AmazonS3Client
import com.amazonaws.services.s3.model.*
import groovy.transform.Field
import org.apache.commons.io.IOUtils
import com.jobjects.highlighter.IDependency

//////////////////////////////////////////////////////////////////////
// Configuration - S3 access params
// Replace the placeholder values below with your own bucket name and AWS keys.
// NOTE(review): the keys are stored here in plain text; consider loading them from a
// protected source instead of keeping real credentials in this file.
// @Field turns each script variable into a field of the script class so that the
// functions defined below can read it.
@Field String bucketName = "your-bucket-name"
@Field String myAccessKeyID = "..."
@Field String mySecretKey = "..."
//////////////////////////////////////////////////////////////////////


// S3 client built from the keys above; getDocumentProvider uses it for every S3 call.
// It must stay declared after the key fields because its initializer reads them.
@Field AmazonS3Client s3Client = new AmazonS3Client(new BasicAWSCredentials(myAccessKeyID, mySecretKey))

/**
 * Logs the path of this script at INFO level.
 *
 * `log` and `scriptPath` are not declared in this file; presumably both are
 * supplied by PDF Highlighter's script binding, and this function is presumably
 * called by the host when the script is loaded - TODO confirm.
 */
def init() {
    log.info "Init script {}", scriptPath
}

/**
 * Function attached to PDF Highlighter's "getDocumentProviderFn" hook.
 *
 * When the request carries an "s3File" parameter, returns an IDependency whose
 * metadata and bytes are read from the S3 object with that key in `bucketName`.
 * Object metadata is fetched lazily, at most once per returned provider.
 *
 * @param uri    document URI of the request; not used by this function
 * @param params request parameters; only "s3File" (the S3 object key) is read
 * @return an IDependency backed by the S3 object, or null when "s3File" is absent or
 *         empty, so that PDF Highlighter continues request handling as usual
 */
def getDocumentProvider(uri, params) {

    // check if there's "s3File" parameter in request
    def s3File = params.get("s3File")
    if (!s3File) {
        // if no, we return null and PDF Highlighter will continue request handling as usual
        return null
    }

    // At this point we have s3File that we'll use to make and return IDependency implementation...
    // Parameter "uri" can be anything as PDF Highlighter will not use it

    // Metadata cache shared by the closures below; filled by the first readMeta() call.
    boolean fileRead = false
    long lastMod, length
    String contentType

    // Fetches the object's metadata once; any failure is rethrown as IOException.
    def readMeta = {
        if (!fileRead) {
            try {
                long start = System.currentTimeMillis()
                ObjectMetadata objectMetadata = s3Client.getObjectMetadata(bucketName, s3File)

                contentType = objectMetadata.getContentType()
                length = objectMetadata.getContentLength()
                lastMod = objectMetadata.getLastModified().getTime()
                log.info("Read {} meta in {} ms", s3File, System.currentTimeMillis() - start)
                fileRead = true
            }
            catch(e) {
                // Keep the original exception as the cause and say which object failed.
                throw new IOException("Failed to read S3 metadata for " + s3File, e)
            }
        }
    }

    return [
            getInternalDocumentId: { return "s3_" + s3File },
            getCanonicalPath: { return "s3_" + s3File },
            getLastModified: { readMeta(); return lastMod },
            getContentType: { readMeta(); return contentType },
            // NOTE(review): the int cast overflows for objects of 2 GB or more - confirm
            // the return type IDependency expects here.
            getFileLength: { readMeta(); return (int) length },
            getFileBytes: {
                long rStart = System.currentTimeMillis()
                S3Object s3object = s3Client.getObject(bucketName, s3File)
                S3ObjectInputStream inputStream = s3object.getObjectContent()
                byte[] bytes
                try {
                    bytes = IOUtils.toByteArray(inputStream)
                }
                finally {
                    // The content stream holds an HTTP connection from the client's pool;
                    // it must be closed even when reading fails, or the pool runs dry.
                    inputStream.close()
                }
                // Report the size actually read; the cached metadata length may still be unset here.
                log.info("Read {} content of {} KB in {} ms", s3File, (int) (bytes.length / 1024), System.currentTimeMillis() - rStart)
                return bytes
            },
            cleanup: {}
    ] as IDependency
}
	# Scripting settings that attach the S3 fetch script to PDF Highlighter.
	highlighter {
	service {
	scripting {
	# Script file(s) to load; point this at your copy of s3-pdf-fetch.groovy.
	files = ["D:/project-data/s3-pdf-fetch.groovy"]
	# Name of the script function attached to the "getDocumentProviderFn" hook.
	getDocumentProviderFn = "getDocumentProvider"
	}
	}
	}
	import com.amazonaws.auth.BasicAWSCredentials
	import com.amazonaws.services.s3.AmazonS3Client
	import com.amazonaws.services.s3.model.*
	import groovy.transform.Field
	import org.apache.commons.io.IOUtils
	import com.jobjects.highlighter.IDependency

	//////////////////////////////////////////////////////////////////////
	// Configuration - S3 access params
	// Replace the placeholder values below with your own bucket name and AWS keys.
	// NOTE(review): the keys are stored here in plain text; consider loading them from a
	// protected source instead of keeping real credentials in this file.
	// @Field turns each script variable into a field of the script class so that the
	// functions defined below can read it.
	@Field String bucketName = "your-bucket-name"
	@Field String myAccessKeyID = "..."
	@Field String mySecretKey = "..."
	//////////////////////////////////////////////////////////////////////


	// S3 client built from the keys above; getDocumentProvider uses it for every S3 call.
	// It must stay declared after the key fields because its initializer reads them.
	@Field AmazonS3Client s3Client = new AmazonS3Client(new BasicAWSCredentials(myAccessKeyID, mySecretKey))

	/**
	 * Logs the path of this script at INFO level.
	 *
	 * `log` and `scriptPath` are not declared in this file; presumably both are
	 * supplied by PDF Highlighter's script binding, and this function is presumably
	 * called by the host when the script is loaded - TODO confirm.
	 */
	def init() {
	    log.info "Init script {}", scriptPath
	}

	/**
	 * Function attached to PDF Highlighter's "getDocumentProviderFn" hook.
	 *
	 * When the request carries an "s3File" parameter, returns an IDependency whose
	 * metadata and bytes are read from the S3 object with that key in `bucketName`.
	 * Object metadata is fetched lazily, at most once per returned provider.
	 *
	 * @param uri    document URI of the request; not used by this function
	 * @param params request parameters; only "s3File" (the S3 object key) is read
	 * @return an IDependency backed by the S3 object, or null when "s3File" is absent or
	 *         empty, so that PDF Highlighter continues request handling as usual
	 */
	def getDocumentProvider(uri, params) {

	    // check if there's "s3File" parameter in request
	    def s3File = params.get("s3File")
	    if (!s3File) {
	        // if no, we return null and PDF Highlighter will continue request handling as usual
	        return null
	    }

	    // At this point we have s3File that we'll use to make and return IDependency implementation...
	    // Parameter "uri" can be anything as PDF Highlighter will not use it

	    // Metadata cache shared by the closures below; filled by the first readMeta() call.
	    boolean fileRead = false
	    long lastMod, length
	    String contentType

	    // Fetches the object's metadata once; any failure is rethrown as IOException.
	    def readMeta = {
	        if (!fileRead) {
	            try {
	                long start = System.currentTimeMillis()
	                ObjectMetadata objectMetadata = s3Client.getObjectMetadata(bucketName, s3File)

	                contentType = objectMetadata.getContentType()
	                length = objectMetadata.getContentLength()
	                lastMod = objectMetadata.getLastModified().getTime()
	                log.info("Read {} meta in {} ms", s3File, System.currentTimeMillis() - start)
	                fileRead = true
	            }
	            catch(e) {
	                // Keep the original exception as the cause and say which object failed.
	                throw new IOException("Failed to read S3 metadata for " + s3File, e)
	            }
	        }
	    }

	    return [
	            getInternalDocumentId: { return "s3_" + s3File },
	            getCanonicalPath: { return "s3_" + s3File },
	            getLastModified: { readMeta(); return lastMod },
	            getContentType: { readMeta(); return contentType },
	            // NOTE(review): the int cast overflows for objects of 2 GB or more - confirm
	            // the return type IDependency expects here.
	            getFileLength: { readMeta(); return (int) length },
	            getFileBytes: {
	                long rStart = System.currentTimeMillis()
	                S3Object s3object = s3Client.getObject(bucketName, s3File)
	                S3ObjectInputStream inputStream = s3object.getObjectContent()
	                byte[] bytes
	                try {
	                    bytes = IOUtils.toByteArray(inputStream)
	                }
	                finally {
	                    // The content stream holds an HTTP connection from the client's pool;
	                    // it must be closed even when reading fails, or the pool runs dry.
	                    inputStream.close()
	                }
	                // Report the size actually read; the cached metadata length may still be unset here.
	                log.info("Read {} content of {} KB in {} ms", s3File, (int) (bytes.length / 1024), System.currentTimeMillis() - rStart)
	                return bytes
	            },
	            cleanup: {}
	    ] as IDependency
	}