Classify/Merge Script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<Rules>
    <!--
    Template for a classification rule. A document is re-typed to NewDocType when every
    Criterion of the highest-priority passing rule is satisfied.
    <Rule>
        <Comment>Optional comment for logging</Comment>
        <NewDocType>Doc type to set document to if all rules pass (required; rules without it are skipped)</NewDocType>
        <SourceDocType>Optional document type filter, separate multiple doc types with a pipe '|'</SourceDocType>
        <Priority>Rule priority (whole number, default 1), higher numbers take precedence, in case of a tie whichever comes first wins</Priority>
        <Criteria>
            <Criterion>
                <Type>hocr (default), doctype, plf/kvpp, dlf/kve, file, path, email</Type>
                <Page>Page number within document, default is 1</Page>
                <FieldName>Name of page or document level field or email header</FieldName>
                <Operator>equals (default), not-equals, same, distance, matches, not-matches, contains, not-contains, has-value, not-has-value, starts-with, or ends-with</Operator>
                <Value>Value to compare criteria with (not to be used with same, has-value, or not-has-value)</Value>
            </Criterion>
        </Criteria>
    </Rule>
    -->
    <Rule>
        <Comment>Classify Based on HOCR</Comment>
        <NewDocType>ExampleDocType</NewDocType>
        <Priority>50</Priority>
        <Criteria>
            <Criterion>
                <Type>hocr</Type>
                <Operator>matches</Operator>
                <!-- The dot is escaped so it only matches a literal '.' in the domain name. -->
                <Value>(?i)^.*(some-domain\.com|\bSome Other Key-phrase\b).*$</Value>
            </Criterion>
            <Criterion>
                <Type>hocr</Type>
                <Operator>not-matches</Operator>
                <Value>(?i)^.*(Page can't contain this text. Remove if not necessary).*$</Value>
            </Criterion>
        </Criteria>
    </Rule>
    <Rule>
        <Comment>Classify Based on Email Header</Comment>
        <NewDocType>ExampleDocType</NewDocType>
        <Priority>100</Priority>
        <Criteria>
            <Criterion>
                <Type>email</Type>
                <FieldName>From</FieldName>
                <Operator>matches</Operator>
                <Value>(?i)^.*@some-domain\.com.*$</Value>
            </Criterion>
        </Criteria>
    </Rule>
</Rules>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<Rules>
    <!--
    Template for a merge rule. Each rule is checked against every pair of adjacent documents;
    when all Criterion elements pass, the pair is merged into one document.
    <Rule>
        <Comment>Optional Comment for Logging</Comment>
        <FirstDoc>1 (default) or 2</FirstDoc>
        <Criteria>
            <Criterion>
                <Type>hocr (default), doctype, plf/kvpp, dlf/kve, file, path, email</Type>
                <DocumentNumber>1, 2, or blank for both</DocumentNumber>
                <FieldName>Name of page or document level field or email header</FieldName>
                <Operator>equals (default), not-equals, same, distance, matches, not-matches, contains, not-contains, has-value, not-has-value, starts-with, or ends-with</Operator>
                <Value>Value to compare criteria with (not to be used with same, has-value, or not-has-value)</Value>
            </Criterion>
        </Criteria>
    </Rule>
    Notes:
      - A Rule must contain a Criteria element.
      - "same" always compares document 1 against document 2 and ignores DocumentNumber.
      - "distance" needs both documents (leave DocumentNumber blank); Value is the maximum edit distance.
    -->
    <Rule>
        <Comment>Merge pages with little or no content to previous doc</Comment>
        <FirstDoc>1</FirstDoc>
        <Criteria>
            <Criterion>
                <Type>hocr</Type>
                <DocumentNumber>2</DocumentNumber>
                <Operator>matches</Operator>
                <Value>^.{0,200}$</Value>
            </Criterion>
        </Criteria>
    </Rule>
</Rules>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import com.ephesoft.dcma.script.IJDomScript;
import com.ephesoft.dcma.util.logger.EphesoftLogger;
import com.ephesoft.dcma.util.logger.ScriptLoggerFactory;
import org.apache.commons.io.FileUtils;
import org.jdom.*;
import org.jdom.input.SAXBuilder;
import org.jdom.output.Format;
import org.jdom.output.XMLOutputter;
import org.jdom.xpath.XPath;
import javax.naming.Context;
import javax.naming.InitialContext;
import javax.sql.DataSource;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.StringReader;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.*;
public class ScriptDocumentAssembler implements IJDomScript { | |
private static EphesoftLogger LOGGER = ScriptLoggerFactory.getLogger(ScriptDocumentAssembler.class); | |
private String SCRIPT_NAME = this.getClass().getSimpleName(); | |
private boolean IS_DEBUG = System.getProperty("os.name").matches("(?i)Mac OS X|Windows 10"); | |
private String BATCHNAME_VALUE; | |
private String BATCHLOCALPATH_VALUE; | |
private String BATCHUNCPATH_VALUE; | |
private String BATCHINSTANCEID_VALUE; | |
private String BATCHCLASSID_VALUE; | |
private String PATH_TO_SCRIPT_CONFIG; | |
private String SCRIPT_CONFIG_FILE; | |
private Properties SCRIPT_PROPS; | |
private Element ROOT; | |
private Connection DB_CONNECTION = null; | |
public Object execute(Document documentFile, String methodName, String documentIdentifier) { | |
Exception exception = null; | |
printDebug("************* Start execution of the " + SCRIPT_NAME); | |
try { | |
if (null == documentFile) { | |
throw (new Exception("Input document is null.")); | |
} | |
init(documentFile); | |
printDebug("************* Batch " + BATCHINSTANCEID_VALUE + " - " + BATCHNAME_VALUE); | |
if (Boolean.parseBoolean(getPropValue("RunClassificationRules", "true"))) { | |
classify(getPropValue("ClassificationConfig", "classify.xml")); | |
} | |
if (Boolean.parseBoolean(getPropValue("RunMergeRules", "true"))) { | |
merge(getPropValue("MergeConfig", "merge.xml")); | |
} | |
reassignDocIdentifier(); | |
if (IS_DEBUG) writeXmlFile(System.getProperty("user.home") + "/Downloads/" + SCRIPT_NAME + "-out.xml", documentFile); | |
} catch (Exception e) { | |
exception = e; | |
if (IS_DEBUG) e.printStackTrace(); | |
else LOGGER.error("ERROR!!! - " + e.getMessage()); | |
} finally { | |
if (DB_CONNECTION != null) { | |
try { DB_CONNECTION.close(); } catch (SQLException e) { e.printStackTrace(); } | |
} | |
} | |
printDebug("************* End execution of the " + SCRIPT_NAME); | |
return exception; | |
} | |
@SuppressWarnings("unchecked") | |
private void classify(String configName) throws JDOMException, IOException, SQLException { | |
printDebug("***** Begin Classification *****"); | |
String configPath = concatPath(PATH_TO_SCRIPT_CONFIG, configName); | |
// Check for classification xml | |
if (!fileExists(configPath)) { | |
printDebug(configName, "not found!"); | |
return; | |
} | |
printDebug(configName, "found"); | |
// Parse xml for classification rules | |
Element config = readXmlFile(configPath).getRootElement(); | |
// Loop through documents and classify pages | |
List<Element> docList = XPath.newInstance("//Document").selectNodes(ROOT); | |
for (Element doc : docList) { | |
String docId = doc.getChildText(XMLGenerics.IDENTIFIER); | |
printDebug("Comparing", docId, "to classification rules"); | |
String docType = doc.getChildText(XMLGenerics.TYPE); | |
List<Element> classifyRules = config.getChildren("Rule"); | |
String newDocType = ""; | |
String newDocTypeDesc = ""; | |
int newDocTypeConfThresh = 100; | |
int priority = 0; | |
// Loop through classification rules | |
for (Element rule : classifyRules) { | |
// NewDocType | |
String ruleNewDocType = getChildText(rule, "NewDocType"); | |
if (ruleNewDocType.isEmpty()) { | |
printDebug("NewDocType is not set, skipping rule"); | |
continue; | |
} | |
String comment = getChildText(rule, "Comment"); | |
if (!comment.isEmpty()) printDebug("Rule:", comment); | |
// SourceDocType | |
if (!getChildText(rule, "SourceDocType").isEmpty()) { | |
List<String> sourceDocTypes = Arrays.asList(getChildText(rule, "SourceDocType").split("\\|")); | |
if (!sourceDocTypes.contains(docType)) { | |
printDebug("The document type", docType, "does not match the rule's doc type(s):", String.join(", ", sourceDocTypes)); | |
continue; | |
} | |
} | |
// Priority | |
int rulePriority = (!getChildText(rule, "Priority").isEmpty() ? | |
Integer.parseInt(getChildText(rule, "Priority")) : 1); | |
if (rulePriority < priority) { | |
printDebug("Rule priority lower than highest matching priority, skipping rule"); | |
continue; | |
} | |
boolean criteriaMet = true; | |
// Loop through all criteria for classification rule | |
List<Element> criteria = XPath.newInstance(".//Criterion").selectNodes(rule); | |
for (Element criterion : criteria) { | |
String criterionType = getChildText(criterion, "Type"); | |
String criterionPage = (!getChildText(criterion, "Page").isEmpty() ? getChildText(criterion, "Page") : "1"); | |
String criterionField = getChildText(criterion, "FieldName"); | |
String criterionOperator = getChildText(criterion, "Operator"); | |
String criterionValue = getChildText(criterion, "Value"); | |
// Comparison source | |
String source = getDocDetail(doc, criterionType, criterionField, criterionPage); | |
// Check if criterion met | |
criteriaMet = checkCriteria(criterionOperator, criterionValue, source); | |
printDebug("Criterion:", docId, "\"" + source + "\"", criterionOperator, | |
(!criterionOperator.contains("has-value") ? "\"" + criterionValue + "\"." : "-"), | |
(criteriaMet ? "Passed!" : "Failed")); | |
if (!criteriaMet) break; | |
} | |
if (!criteriaMet) { | |
printDebug("Criteria not met."); | |
continue; | |
} | |
if (rulePriority <= priority) { | |
printDebug("Criteria met, but rule priority lower than previous rule(s)"); | |
continue; | |
} | |
printDebug("All criteria met and rule is highest priority so far: ", rulePriority); | |
newDocType = ruleNewDocType; | |
newDocTypeDesc = getDocTypeDescription(ruleNewDocType); | |
newDocTypeConfThresh = getDocTypeConfidenceThreshold(ruleNewDocType); | |
priority = rulePriority; | |
} | |
if (newDocType.isEmpty()) continue; | |
// At least one classification rule passed, setting doc type to doc type of highest priority rule | |
printDebug("Setting", docId, "to:", newDocType); | |
doc.getChild(XMLGenerics.TYPE).setText(newDocType); | |
doc.getChild(XMLGenerics.DESCRIPTION).setText(newDocTypeDesc); | |
doc.getChild(XMLGenerics.CONFIDENCE_THRESHOLD).setText(newDocTypeConfThresh + ".0"); | |
if (Boolean.parseBoolean(getPropValue("SetConfidenceToPriority", "true"))) { | |
setChildText(doc, XMLGenerics.CONFIDENCE, priority + ".00"); | |
} else { | |
setChildText(doc, XMLGenerics.CONFIDENCE, "100.00"); | |
} | |
} | |
} | |
@SuppressWarnings("unchecked") | |
private void merge(String configName) throws JDOMException, IOException { | |
printDebug("***** Begin Merges *****"); | |
String configPath = concatPath(PATH_TO_SCRIPT_CONFIG, configName); | |
// Check for merge xml | |
if (!fileExists(configPath)) { | |
printDebug(configName, "not found!"); | |
return; | |
} | |
printDebug(configName, "found"); | |
// Parse xml for merge rules | |
Element config = readXmlFile(configPath).getRootElement(); | |
// Loop through rules and merge pages | |
List<Element> rules = config.getChildren("Rule"); | |
for (Element rule : rules) { | |
String ruleComment = getChildText(rule, "Comment"); | |
String firstDoc = getChildText(rule, "FirstDoc"); | |
if (!ruleComment.isEmpty()) printDebug("--", ruleComment, "--"); | |
List<Element> docList = XPath.newInstance("//Document").selectNodes(ROOT); | |
// Loop through all documents for each rule | |
for (int i = 0; i < docList.size() - 1; i++) { | |
Element doc1 = docList.get(i); | |
Element doc2 = docList.get(i + 1); | |
String docId1 = doc1.getChildText(XMLGenerics.IDENTIFIER); | |
String docId2 = doc2.getChildText(XMLGenerics.IDENTIFIER); | |
boolean criteriaMet = true; | |
// Check all criteria against the documents | |
List<Element> criteria = rule.getChild("Criteria").getChildren("Criterion"); | |
for (Element criterion : criteria) { | |
String criterionType = getChildText(criterion, "Type"); | |
String criterionDocNumber = getChildText(criterion, "DocumentNumber"); | |
String criterionField = getChildText(criterion, "FieldName"); | |
String criterionOperator = getChildText(criterion, "Operator"); | |
String criterionValue = getChildText(criterion, "Value"); | |
// Get source value for each document | |
String source1 = getDocDetail(doc1, criterionType, criterionField, null); | |
String source2 = getDocDetail(doc2, criterionType, criterionField, null); | |
// Check if criterion met | |
if (criterionOperator.equals("same")) { | |
criteriaMet = checkCriteria(criterionOperator, criterionValue, source1, source2); | |
} else { | |
if (criterionDocNumber.equals("1")) { | |
criteriaMet = checkCriteria(criterionOperator, criterionValue, source1); | |
} else if (criterionDocNumber.equals("2")) { | |
criteriaMet = checkCriteria(criterionOperator, criterionValue, source2); | |
} else { | |
if (criterionValue.isEmpty()) criteriaMet = checkCriteria(criterionOperator, source1, source1, source2); | |
else criteriaMet = checkCriteria(criterionOperator, criterionValue, source1, source2); | |
} | |
} | |
// Log results | |
if (!criterionOperator.matches("same|.*has-value|distance")) { | |
if (criterionDocNumber.equals("1")) { | |
printDebug("Criterion:", docId1, "\"" + source1 + "\"", criterionOperator, "\"" + criterionValue + "\".", (criteriaMet ? "Passed!" : "Failed")); | |
} else if (criterionDocNumber.equals("2")) { | |
printDebug("Criterion:", docId2, "\"" + source2 + "\"", criterionOperator, "\"" + criterionValue + "\".", (criteriaMet ? "Passed!" : "Failed")); | |
} else { | |
printDebug("Criterion:", docId1, "\"" + source1 + "\"", "and", docId2, "\"" + source2 + "\"", criterionOperator, "\"" + criterionValue + "\".", (criteriaMet ? "Passed!" : "Failed")); | |
} | |
} else if (criterionOperator.equals("same")) { | |
printDebug("Criterion:", docId1, "\"" + source1 + "\"", "and", docId2, "\"" + source2 + "\"", "are the same.", (criteriaMet ? "Passed!" : "Failed")); | |
} else if (criterionOperator.equals("distance")) { | |
printDebug("Criterion:", docId1, "\"" + source1 + "\"", "and", docId2, "\"" + source2 + "\"", "distance less than or equal to", criterionValue + ".", (criteriaMet ? "Passed!" : "Failed")); | |
} else if (criterionOperator.matches(".*has-value")) { | |
if (criterionDocNumber.equals("1")) { | |
printDebug("Criterion:", docId1, "\"" + source1 + "\"", criterionOperator + ".", (criteriaMet ? "Passed!" : "Failed")); | |
} else if (criterionDocNumber.equals("2")) { | |
printDebug("Criterion:", docId2, "\"" + source2 + "\"", criterionOperator + ".", (criteriaMet ? "Passed!" : "Failed")); | |
} else { | |
printDebug("Criterion:", docId1, "\"" + source1 + "\"", "and/or", docId2, "\"" + source2 + "\"", criterionOperator + ".", (criteriaMet ? "Passed!" : "Failed")); | |
} | |
} | |
// Stop checking criteria if not met | |
if (!criteriaMet) break; | |
} | |
// Move on to next document if criteria not met | |
if (!criteriaMet) { | |
printDebug("Criteria not met"); | |
continue; | |
} | |
// Merge documents if all criteria met | |
printDebug("Criteria met!"); | |
mergeDocuments(doc1, doc2, (firstDoc.equals("2") ? 2 : 1)); | |
docList = XPath.newInstance("//Document").selectNodes(ROOT); | |
i--; | |
} | |
} | |
} | |
private String getDocDetail(Element doc, String detailType, String fieldName, String pageNumber) throws JDOMException, IOException { | |
pageNumber = (pageNumber != null ? pageNumber : "1"); | |
switch (detailType.toLowerCase()) { | |
case "doctype": return doc.getChildText(XMLGenerics.TYPE); | |
case "plf": | |
case "kvpp": return getPlfValue(fieldName, doc); | |
case "dlf": | |
case "kve": return getDlfValue(doc, fieldName); | |
case "file": { | |
Text fileNameText = (Text) XPath.newInstance(".//" + XMLGenerics.PAGES + "/" + XMLGenerics.PAGE + "[position()=" + pageNumber + "]" + "/" + XMLGenerics.OLD_FILE_NAME + "/text()").selectSingleNode(doc); | |
return (fileNameText != null ? fileNameText.getValue().replaceAll("-\\d{4}-\\d{4}\\.[A-z]+$", "") : ""); | |
} | |
case "path": return BATCHUNCPATH_VALUE; | |
case "email": return getEmailHeader(fieldName); | |
default: { | |
Element page = (Element) XPath.newInstance(".//" + XMLGenerics.PAGES + "/" + XMLGenerics.PAGE + "[position()=" + pageNumber + "]").selectSingleNode(doc); | |
return getHocrContent(page); | |
} | |
} | |
} | |
private boolean checkCriteria(String operator, String value, String... sources) { | |
List<String> sourceList = Arrays.asList(sources); | |
switch (operator.toLowerCase()) { | |
case "same": return (sourceList.size() > 1 && sourceList.stream().distinct().count() == 1); | |
case "distance": return (sourceList.size() > 1 && sourceList.stream().allMatch(s -> distance(sourceList.get(0), s) <= Integer.parseInt(value))); | |
case "not-equals": return sourceList.stream().noneMatch(s -> s.equals(value)); | |
case "matches": return sourceList.stream().allMatch(s -> s.matches(value)); | |
case "not-matches": return sourceList.stream().noneMatch(s -> s.matches(value)); | |
case "contains": return sourceList.stream().allMatch(s -> s.contains(value)); | |
case "not-contains": return sourceList.stream().noneMatch(s -> s.contains(value)); | |
case "has-value": return sourceList.stream().noneMatch(String::isEmpty); | |
case "not-has-value": return sourceList.stream().allMatch(String::isEmpty); | |
case "starts-with": return sourceList.stream().allMatch(s -> s.startsWith(value)); | |
case "ends-with": return sourceList.stream().allMatch(s -> s.endsWith(value)); | |
default: return sourceList.stream().allMatch(s -> s.equals(value)); | |
} | |
} | |
@SuppressWarnings("unchecked") | |
private void reassignDocIdentifier() throws JDOMException { | |
printDebug("***** Updating document IDs to be sequential after merging *****"); | |
List<Element> docList = XPath.newInstance("//" + XMLGenerics.DOCUMENT).selectNodes(ROOT); | |
for (int i = 0; i < docList.size(); i++) { | |
docList.get(i).getChild(XMLGenerics.IDENTIFIER).setText("DOC" + (i + 1)); | |
} | |
} | |
// ##### HELPER METHODS ##### | |
private String getEmailHeader(String headerName) throws JDOMException { | |
Element header = (Element) XPath.newInstance("//EmailHeaders/Email/Headers/Header[Name='" + headerName + "']").selectSingleNode(ROOT); | |
if (header == null || header.getChild(XMLGenerics.VALUE) == null) return ""; | |
return getChildText(header, XMLGenerics.VALUE); | |
} | |
private int distance(String a, String b) { | |
a = a.toLowerCase(); | |
b = b.toLowerCase(); | |
int [] costs = new int [b.length() + 1]; | |
for (int j = 0; j < costs.length; j++) | |
costs[j] = j; | |
for (int i = 1; i <= a.length(); i++) { | |
costs[0] = i; | |
int nw = i - 1; | |
for (int j = 1; j <= b.length(); j++) { | |
int cj = Math.min(1 + Math.min(costs[j], costs[j - 1]), a.charAt(i - 1) == b.charAt(j - 1) ? nw : nw + 1); | |
nw = costs[j]; | |
costs[j] = cj; | |
} | |
} | |
return costs[b.length()]; | |
} | |
private String getHocrContent(Element page) throws JDOMException, IOException { | |
if (IS_DEBUG) return "Your HOCR Content HERE. Blah Blah Invoice Number 12345 Blah Blah"; | |
String hocrPath = concatPath(BATCHLOCALPATH_VALUE, BATCHINSTANCEID_VALUE, page.getChildText("HocrFileName")); | |
Element hocrDoc = readXmlFile(hocrPath).getRootElement(); | |
return getChildText(hocrDoc.getChild("HocrPage"), "HocrContent"); | |
} | |
@SuppressWarnings("unchecked") | |
private String getPlfValue(String plfName, Element doc) throws JDOMException { | |
List<Text> values = XPath.newInstance(".//PageLevelField[Name='" + plfName + "']/Value/text()").selectNodes(doc); | |
String returnString = ""; | |
double highestConf = 0.0; | |
for (Text value : values) { | |
Element plfElement = (Element) value.getParent().getParent(); | |
double confidence = Double.parseDouble(plfElement.getChildText(XMLGenerics.CONFIDENCE)); | |
if (confidence > highestConf) { | |
highestConf = confidence; | |
returnString = value.getValue(); | |
} | |
} | |
return returnString; | |
} | |
private Element getDlf(Element doc, String dlfName) throws JDOMException { | |
return (Element) XPath.newInstance(".//" + XMLGenerics.DOCUMENT_LEVEL_FIELD + | |
"[" + XMLGenerics.NAME + "='" + dlfName + "']").selectSingleNode(doc); | |
} | |
private String getDlfValue(Element doc, String dlfName) throws JDOMException { | |
Element dlf = getDlf(doc, dlfName); | |
if (dlf != null && dlf.getChild(XMLGenerics.VALUE) != null) { | |
return dlf.getChildText(XMLGenerics.VALUE); | |
} | |
return ""; | |
} | |
private String getChildText(Element parent, String child) { | |
return (parent.getChild(child) != null ? parent.getChildText(child) : ""); | |
} | |
private void setChildText(Element parent, String childName, String childValue) { | |
Element childElement = parent.getChild(childName); | |
if (childElement == null) { | |
childElement = new Element(childName); | |
parent.addContent(childElement); | |
} | |
childElement.setText(childValue); | |
} | |
private void mergeDocuments(Element doc, Element nextDoc, int firstDoc) throws JDOMException { | |
if (firstDoc == 2) mergeDocuments(nextDoc, doc); | |
else mergeDocuments(doc, nextDoc); | |
} | |
@SuppressWarnings("unchecked") | |
private void mergeDocuments(Element doc, Element nextDoc) throws JDOMException { | |
String docId1 = doc.getChildText(XMLGenerics.IDENTIFIER); | |
String docId2 = nextDoc.getChildText(XMLGenerics.IDENTIFIER); | |
printDebug("Merging", docId1, "and", docId2); | |
// If any index fields are empty for doc1, copy doc2's value (if exists) | |
List<Element> emptyIndexFields = XPath.newInstance(".//" + XMLGenerics.DOCUMENT_LEVEL_FIELD + | |
"[not(Value) or not(Value/text())]").selectNodes(doc); | |
for (Element emptyDlf : emptyIndexFields) { | |
String emptyDlfName = emptyDlf.getChildText(XMLGenerics.NAME); | |
printDebug(docId1, emptyDlfName, "is empty, checking", docId2, "to see if it has a value"); | |
String secondDocValue = getDlfValue(nextDoc, emptyDlfName); | |
if (!secondDocValue.isEmpty()) { | |
printDebug(nextDoc.getChildText(XMLGenerics.IDENTIFIER), emptyDlfName, "has value:", secondDocValue); | |
Element parent = emptyDlf.getParentElement(); | |
int index = parent.indexOf(emptyDlf); | |
parent.addContent(index, (Content) getDlf(nextDoc, emptyDlfName).detach().clone()); | |
emptyDlf.detach(); | |
} | |
} | |
// Copy pages from second doc | |
Element pages = doc.getChild(XMLGenerics.PAGES); | |
List<Element> pageList = nextDoc.getChild(XMLGenerics.PAGES).getChildren(XMLGenerics.PAGE); | |
for (Element page : pageList) { | |
pages.addContent((Content) page.clone()); | |
} | |
nextDoc.detach(); | |
} | |
private String getDocTypeDescription(String docTypeName) { | |
if (IS_DEBUG) return docTypeName; | |
if (DB_CONNECTION == null) DB_CONNECTION = connectToJNDIDBConnection("jdbc/ephesoft"); | |
if (DB_CONNECTION == null) return docTypeName; | |
try (Statement s = DB_CONNECTION.createStatement()) { | |
String query = "SELECT dt.document_type_description FROM document_type dt WHERE dt.document_type_name = '" + docTypeName + "' AND dt.id IN (SELECT bcdt.document_type_id FROM batch_class_document_type bcdt WHERE bcdt.batch_class_id = (SELECT bc.id FROM batch_class bc WHERE bc.identifier = '" + BATCHCLASSID_VALUE + "'))"; | |
ResultSet rs = s.executeQuery(query); | |
if (rs.next()) { | |
return rs.getString(1); | |
} | |
} catch (SQLException e) { | |
LOGGER.error(e.getMessage()); | |
} | |
return docTypeName; | |
} | |
private int getDocTypeConfidenceThreshold(String docTypeName) throws SQLException { | |
if (IS_DEBUG) return 100; | |
if (DB_CONNECTION == null) DB_CONNECTION = connectToJNDIDBConnection("jdbc/ephesoft"); | |
if (DB_CONNECTION == null) return 100; | |
Statement s = null; | |
try { | |
s = DB_CONNECTION.createStatement(); | |
String query = "SELECT dt.min_confidence_threshold FROM document_type dt WHERE dt.document_type_name = '" + docTypeName + "' AND dt.id IN (SELECT bcdt.document_type_id FROM batch_class_document_type bcdt WHERE bcdt.batch_class_id = (SELECT bc.id FROM batch_class bc WHERE bc.identifier = '" + BATCHCLASSID_VALUE + "'))"; | |
ResultSet rs = s.executeQuery(query); | |
if (rs.next()) return rs.getInt(1); | |
} catch (Exception e) { | |
LOGGER.error(e.getMessage()); | |
} finally { | |
try { | |
s.close(); | |
} catch (NullPointerException e) { | |
LOGGER.error(e.getMessage()); | |
} | |
} | |
return 100; | |
} | |
private Connection connectToJNDIDBConnection(final String dataSourceName) { | |
// Attempt to connect to the Ephesoft DB | |
try { | |
LOGGER.info("************ CONNECTING TO JNDI RESOURCE DB: " + dataSourceName); | |
// Obtain our environment naming context | |
Context initCtx = new InitialContext(); | |
Context envCtx = (Context) initCtx.lookup("java:comp/env"); | |
// Look up our data source | |
DataSource ds = (DataSource) envCtx.lookup(dataSourceName); | |
// Allocate and use a connection from the pool | |
Connection conn = ds.getConnection(); | |
LOGGER.info("************ Connected to JNDI resource DB: " + dataSourceName); | |
return conn; | |
} catch (Exception e) { | |
LOGGER.error("************ Error encountered whilst trying to connect to JNDI Resource Connection: " + dataSourceName); | |
return null; | |
} | |
} | |
private boolean fileExists(String path) { | |
File theFile = new File(path); | |
return (theFile.exists() && !theFile.isDirectory()); | |
} | |
private String concatPath(String... strings) { return String.join(File.separator, Arrays.asList(strings)); } | |
private void init(Document documentFile) throws IOException { | |
ROOT = documentFile.getRootElement(); | |
BATCHNAME_VALUE = getChildText(ROOT, XMLGenerics.BATCH_NAME); | |
BATCHLOCALPATH_VALUE = getChildText(ROOT, XMLGenerics.BATCH_LOCAL_PATH); | |
BATCHUNCPATH_VALUE = getChildText(ROOT, XMLGenerics.UNC_FOLDER_PATH); | |
BATCHINSTANCEID_VALUE = getChildText(ROOT, XMLGenerics.BATCH_INSTANCE_ID); | |
BATCHCLASSID_VALUE = getChildText(ROOT, XMLGenerics.BATCH_CLASS_ID); | |
PATH_TO_SCRIPT_CONFIG = concatPath((IS_DEBUG ? concatPath(new File("").getAbsolutePath(), "src") : concatPath(BATCHLOCALPATH_VALUE.replaceFirst("ephesoft-system-folder", ""), BATCHCLASSID_VALUE)), "script-config"); | |
SCRIPT_CONFIG_FILE = concatPath(PATH_TO_SCRIPT_CONFIG, SCRIPT_NAME + ".properties"); | |
if (fileExists(SCRIPT_CONFIG_FILE)) { | |
SCRIPT_PROPS = readKeyWordProperties(SCRIPT_NAME + ".properties"); | |
} else { | |
FileUtils.writeStringToFile(new File(SCRIPT_CONFIG_FILE), "# Properties file to control " + SCRIPT_NAME + " options\r\n", "UTF-8", true); | |
SCRIPT_PROPS = new Properties(); | |
} | |
} | |
private Properties readKeyWordProperties(String fileName) throws IOException { | |
Properties prop = new Properties(); | |
String propFileName = concatPath(PATH_TO_SCRIPT_CONFIG, fileName); | |
String propFileContents = new Scanner(new File(propFileName)).useDelimiter("\\Z").next(); | |
prop.load(new StringReader(propFileContents.replace("\\", "\\\\"))); | |
return prop; | |
} | |
private void appendToPropsFile(File file, String propertyName, String defaultValue) throws IOException { | |
FileUtils.writeStringToFile(file, propertyName + '=' + defaultValue + "\r\n", "UTF-8", true); | |
SCRIPT_PROPS.setProperty(propertyName, defaultValue); | |
} | |
private String getPropValue(String propertyName, String defaultValue) throws IOException { | |
String propValue = SCRIPT_PROPS.getProperty(propertyName); | |
if (propValue != null) { | |
return propValue; | |
} | |
if (defaultValue != null && !defaultValue.isEmpty()) { | |
File f = new File(SCRIPT_CONFIG_FILE); | |
appendToPropsFile(f, propertyName, defaultValue); | |
} | |
return defaultValue; | |
} | |
private static Document readXmlFile(String path) throws JDOMException, IOException { | |
SAXBuilder sb = new SAXBuilder(); | |
return sb.build(path); | |
} | |
private void writeXmlFile(String path, Document doc) throws IOException { | |
XMLOutputter xmlOutput = new XMLOutputter(); | |
xmlOutput.setFormat(Format.getPrettyFormat()); | |
xmlOutput.output(doc, new FileWriter(path)); | |
} | |
private void printDebug(Object... messages) { | |
List<String> stringMessages = new ArrayList<>(); | |
for (Object message : messages) { stringMessages.add(String.valueOf(message)); } | |
String message = String.join(" ", stringMessages); | |
if (IS_DEBUG) { | |
System.out.println(message); | |
} else { | |
LOGGER.info(message); | |
} | |
} | |
public static void main(String... args) { | |
String filePath = System.getProperty("user.home") + "/Downloads/BI1_batch (1).xml"; | |
try { | |
Document doc = readXmlFile(filePath); | |
ScriptDocumentAssembler se = new ScriptDocumentAssembler(); | |
se.execute(doc, null, null); | |
} | |
catch (Exception x) { x.printStackTrace(); } | |
} | |
@SuppressWarnings("unused") | |
private static class XMLGenerics { | |
private static final String BATCH_LOCAL_PATH = "BatchLocalPath"; | |
private static final String BATCH_NAME = "BatchName"; | |
private static final String BATCH_INSTANCE_ID = "BatchInstanceIdentifier"; | |
private static final String BATCH_CLASS_ID = "BatchClassIdentifier"; | |
private static final String UNC_FOLDER_PATH = "UNCFolderPath"; | |
private static final String PAGES = "Pages"; | |
private static final String PAGE = "Page"; | |
private static final String DOCUMENTS = "Documents"; | |
private static final String DOCUMENT = "Document"; | |
private static final String NAME = "Name"; | |
private static final String TYPE = "Type"; | |
private static final String VALUE = "Value"; | |
private static final String VALID = "Valid"; | |
private static final String DESCRIPTION = "Description"; | |
private static final String IDENTIFIER = "Identifier"; | |
private static final String DOCUMENT_LEVEL_FIELDS = "DocumentLevelFields"; | |
private static final String DOCUMENT_LEVEL_FIELD = "DocumentLevelField"; | |
private static final String PAGE_LEVEL_FIELDS = "PageLevelFields"; | |
private static final String PAGE_LEVEL_FIELD = "PageLevelField"; | |
private static final String ALTERNATE_VALUES = "AlternateValues"; | |
private static final String ALTERNATE_VALUE = "AlternateValue"; | |
private static final String FIELD_VALUE_OPTION_LIST = "FieldValueOptionList"; | |
private static final String OLD_FILE_NAME = "OldFileName"; | |
private static final String CONFIDENCE = "Confidence"; | |
private static final String CONFIDENCE_THRESHOLD = "ConfidenceThreshold"; | |
private static final String OCR_CONFIDENCE = "OcrConfidence"; | |
private static final String OCR_CONFIDENCE_THRESHOLD = "OcrConfidenceThreshold"; | |
private static final String FORCE_REVIEW = "ForceReview"; | |
private static final String REVIEWED = "Reviewed"; | |
private static final String COORDINATES_LIST = "CoordinatesList"; | |
private static final String BATCH_LEVEL_FIELDS = "BatchLevelFields"; | |
private static final String BATCH_LEVEL_FIELD = "BatchLevelField"; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment