jobjects/00 Getting document path from database and obfuscating access URL.md

## 00 Getting document path from database and obfuscating access URL.md

      
    Raw
  

              00 Getting document path from database and obfuscating access URL.md
            
          
    Getting document path from database and obfuscating access URL

Requirements for this exercise are:

Access PDF documents using GUID identifiers.
GUIDs are stored in a MySQL database and point to PDF files accessible via the server file system.
Users should not be able to get direct URL to PDF file.

The approach used to address the requirements:

We extend PDF Highlighter functionality with a custom script.
Document highlighting URLs composed by search application will send GUID as "uri" parameter.
When handling the highlighting request, the PDF Highlighter server invokes our script function
"uriToFile", which queries the database, stores the file path in a short-term cache and returns the file
to Highlighter.
We override result document serving path so that HTML5 PDF viewer gets file from PDF Highlighter's
"/cached-document" endpoint, passing request "uri" (which is our GUID).
When handling the "/cached-document" request, PDF Highlighter invokes our "cachedFile" function.
cachedFile looks up the file path in the cache and returns the file if it is available. Accessing the file
after the expiration time results in a 404 error.


## application.conf
highlighter {

  service {
    scripting {
      files = ["D:/project-data/my-script.groovy"]
      uriFilterFn = "uriToFile"   # hook to our function that transforms the "uri" parameter into a PDF file path
      cachedDocumentProviderFn = "cachedFile" # hook to our function that locates the cached file for a "/cached-document" request
    }
  }

  # override the default document serving paths so that the document is requested from the "/cached-document/" endpoint
  serveViewerHighlightedPdf = "{serviceUrl}/viewer/?file=/cached-document/{request.uri:urlencoded}&highlightsFile={serviceUrl:urlencoded}{/hits/:urlencoded}{hitsRef:urlencoded}&nativePrint={viewerUseBrowserPdfPrint}#page={firstHitPage}"
  serveOriginalPdfInViewer = "{serviceUrl}/viewer/?file=/cached-document/{request.uri:urlencoded}&nativePrint={viewerUseBrowserPdfPrint}"
  # override the default rules so that the HTML5 PDF viewer is used for all user agents
  documentServingPathRules = []
  documentServingPath = serveViewerHighlightedPdf

  # ... other settings ...
}


## my-script.groovy
import groovy.sql.Sql
import groovy.transform.Field
import java.util.concurrent.TimeUnit
import com.google.common.cache.Cache
import com.google.common.cache.CacheBuilder

/*
PDF Highlighter initializes global scope with:
    scriptPath - path used to load the script
    log - an instance of org.slf4j.Logger
*/

// global variables
// Database handle: assigned in dbConnect(), closed in destroy().
@Field Sql sql
// Document id -> file path cache: built in init(), filled by uriToFile(), read by cachedFile().
@Field Cache idToFileCache

/**
 * Initialization hook (presumably invoked by PDF Highlighter when the script is loaded).
 *
 * Builds the id -> file path cache and then opens the database connection,
 * which stays open until destroy() closes it.
 */
def init() {
    log.info("Init script {}", scriptPath)

    // Cache holds at most 1000 entries; an entry is dropped 60 seconds after it
    // was written or 30 seconds after it was last accessed.
    def cacheBuilder = CacheBuilder.newBuilder()
    cacheBuilder = cacheBuilder.maximumSize(1000)
    cacheBuilder = cacheBuilder.expireAfterWrite(60, TimeUnit.SECONDS)
    cacheBuilder = cacheBuilder.expireAfterAccess(30, TimeUnit.SECONDS)
    idToFileCache = cacheBuilder.build()

    // The connection is kept open for as long as the script is loaded.
    dbConnect()
}

/**
 * Teardown hook (presumably invoked by PDF Highlighter when the script is unloaded).
 *
 * Closes the database connection opened by dbConnect(), if there is one.
 * Closing is best effort: a failure is logged and never propagated.
 */
def destroy() {
    log.info("Unloading script {}", scriptPath)
    try {
        // Null-safe: init() may have failed before a connection was opened.
        sql?.close()
    } catch (Exception e) {
        // Pass the exception itself so the stack trace is kept; e.message alone
        // may be null and carries no context about what failed.
        log.warn("Failed to close database connection", e)
    }
}

/**
 * Opens a JDBC connection and stores it in the script-level 'sql' field.
 *
 * The connection settings below are placeholders; replace them with the
 * values for your own database.
 */
def dbConnect() {
    final String jdbcUrl = 'jdbc:mysql://localhost:3306/your_database'
    final String driverClass = 'com.mysql.jdbc.Driver'
    final String dbUser = 'root'
    final String dbPassword = ''

    sql = Sql.newInstance(jdbcUrl, dbUser, dbPassword, driverClass)
    log.info('Connected to {}', jdbcUrl)
}

/**
 * Resolves the request "uri" parameter (used here as the document id) to a PDF file.
 *
 * When a database row is found, its file path is stored in idToFileCache under
 * the document id so that cachedFile() can find it later.
 *
 * @param uri the "uri" request parameter
 * @return a File for the document, or null when no row was found
 */
def uriToFile(uri) {
    log.debug("requested pdf for {}", uri)

    def docId = uri
    def docRow = null

    // TODO: query your database
    //docRow = sql.firstRow('SELECT file_path FROM documents WHERE id = ?', [docId])

    // Nothing found for this id: log it and return nothing.
    if (!docRow) {
        log.warn('Not found file for {} (uri:{})', docId, uri)
        return null
    }

    String path = docRow.file_path
    log.info("PDF path from db: {}", path)

    // Remember the path so that cachedFile() can serve it.
    idToFileCache.put(docId, path)

    // Returning the 'path' string instead would make PDF Highlighter pass it
    // through its regex mapping rules (if set up). A File instance is used
    // by Highlighter as is, without further filtering.
    return new File(path)
}

/**
 * Looks up the file path cached for the given id.
 *
 * @param id the key under which uriToFile() stored the path
 * @return a File for the cached path, or null when the cache has no entry for the id
 */
def cachedFile(id) {
    String cachedPath = idToFileCache.getIfPresent(id)
    log.info('File path for {}: {}', id, cachedPath)
    return cachedPath == null ? null : new File(cachedPath)
}
	highlighter {

	service {
	scripting {
	files = ["D:/project-data/my-script.groovy"]
	uriFilterFn = "uriToFile" # hook to our function that transforms the "uri" parameter into a PDF file path
	cachedDocumentProviderFn = "cachedFile" # hook to our function that locates the cached file for a "/cached-document" request
	}
	}

	# override the default document serving paths so that the document is requested from the "/cached-document/" endpoint
	serveViewerHighlightedPdf = "{serviceUrl}/viewer/?file=/cached-document/{request.uri:urlencoded}&highlightsFile={serviceUrl:urlencoded}{/hits/:urlencoded}{hitsRef:urlencoded}&nativePrint={viewerUseBrowserPdfPrint}#page={firstHitPage}"
	serveOriginalPdfInViewer = "{serviceUrl}/viewer/?file=/cached-document/{request.uri:urlencoded}&nativePrint={viewerUseBrowserPdfPrint}"
	# override the default rules so that the HTML5 PDF viewer is used for all user agents
	documentServingPathRules = []
	documentServingPath = serveViewerHighlightedPdf

	# ... other settings ...
	}
	import groovy.sql.Sql
	import groovy.transform.Field
	import java.util.concurrent.TimeUnit
	import com.google.common.cache.Cache
	import com.google.common.cache.CacheBuilder

	/*
	PDF Highlighter initializes global scope with:
	scriptPath - path used to load the script
	log - an instance of org.slf4j.Logger
	*/

	// global variables
	// Database handle: assigned in dbConnect(), closed in destroy().
	@Field Sql sql
	// Document id -> file path cache: built in init(), filled by uriToFile(), read by cachedFile().
	@Field Cache idToFileCache

	/**
	 * Initialization hook (presumably invoked by PDF Highlighter when the script is loaded).
	 *
	 * Builds the id -> file path cache and then opens the database connection,
	 * which stays open until destroy() closes it.
	 */
	def init() {
		log.info("Init script {}", scriptPath)

		// Cache holds at most 1000 entries; an entry is dropped 60 seconds after it
		// was written or 30 seconds after it was last accessed.
		def cacheBuilder = CacheBuilder.newBuilder()
		cacheBuilder = cacheBuilder.maximumSize(1000)
		cacheBuilder = cacheBuilder.expireAfterWrite(60, TimeUnit.SECONDS)
		cacheBuilder = cacheBuilder.expireAfterAccess(30, TimeUnit.SECONDS)
		idToFileCache = cacheBuilder.build()

		// The connection is kept open for as long as the script is loaded.
		dbConnect()
	}

	/**
	 * Teardown hook (presumably invoked by PDF Highlighter when the script is unloaded).
	 *
	 * Closes the database connection opened by dbConnect(), if there is one.
	 * Closing is best effort: a failure is logged and never propagated.
	 */
	def destroy() {
		log.info("Unloading script {}", scriptPath)
		try {
			// Null-safe: init() may have failed before a connection was opened.
			sql?.close()
		} catch (Exception e) {
			// Pass the exception itself so the stack trace is kept; e.message alone
			// may be null and carries no context about what failed.
			log.warn("Failed to close database connection", e)
		}
	}

	/**
	 * Opens a JDBC connection and stores it in the script-level 'sql' field.
	 *
	 * The connection settings below are placeholders; replace them with the
	 * values for your own database.
	 */
	def dbConnect() {
		final String jdbcUrl = 'jdbc:mysql://localhost:3306/your_database'
		final String driverClass = 'com.mysql.jdbc.Driver'
		final String dbUser = 'root'
		final String dbPassword = ''

		sql = Sql.newInstance(jdbcUrl, dbUser, dbPassword, driverClass)
		log.info('Connected to {}', jdbcUrl)
	}

	/**
	 * Resolves the request "uri" parameter (used here as the document id) to a PDF file.
	 *
	 * When a database row is found, its file path is stored in idToFileCache under
	 * the document id so that cachedFile() can find it later.
	 *
	 * @param uri the "uri" request parameter
	 * @return a File for the document, or null when no row was found
	 */
	def uriToFile(uri) {
		log.debug("requested pdf for {}", uri)

		def docId = uri
		def docRow = null

		// TODO: query your database
		//docRow = sql.firstRow('SELECT file_path FROM documents WHERE id = ?', [docId])

		// Nothing found for this id: log it and return nothing.
		if (!docRow) {
			log.warn('Not found file for {} (uri:{})', docId, uri)
			return null
		}

		String path = docRow.file_path
		log.info("PDF path from db: {}", path)

		// Remember the path so that cachedFile() can serve it.
		idToFileCache.put(docId, path)

		// Returning the 'path' string instead would make PDF Highlighter pass it
		// through its regex mapping rules (if set up). A File instance is used
		// by Highlighter as is, without further filtering.
		return new File(path)
	}

	/**
	 * Looks up the file path cached for the given id.
	 *
	 * @param id the key under which uriToFile() stored the path
	 * @return a File for the cached path, or null when the cache has no entry for the id
	 */
	def cachedFile(id) {
		String cachedPath = idToFileCache.getIfPresent(id)
		log.info('File path for {}: {}', id, cachedPath)
		return cachedPath == null ? null : new File(cachedPath)
	}