Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Extract Text and Metadata from PDFs with NiFi's ExecuteScript processor (and Groovy)
<?xml version="1.0" encoding="UTF-8" standalone="yes"?><template><description>This template extracts text and metadata from PDF files using a Groovy script</description><name>ExtractTextFromPDFWithScript</name><snippet><connections><id>e749c17c-57ae-4f90-b741-37b9ce2fef70</id><parentGroupId>f2cfcff1-1730-4250-9535-f84a5af7d2a4</parentGroupId><backPressureDataSizeThreshold>0 MB</backPressureDataSizeThreshold><backPressureObjectThreshold>0</backPressureObjectThreshold><destination><groupId>f2cfcff1-1730-4250-9535-f84a5af7d2a4</groupId><id>96e61f17-5b0c-4746-a221-daf286bc367f</id><type>PROCESSOR</type></destination><flowFileExpiration>0 sec</flowFileExpiration><labelIndex>1</labelIndex><name></name><selectedRelationships>success</selectedRelationships><source><groupId>f2cfcff1-1730-4250-9535-f84a5af7d2a4</groupId><id>9d2d13b7-0589-4b1d-8ca5-60f590ad8958</id><type>PROCESSOR</type></source><zIndex>0</zIndex></connections><connections><id>93f9f218-4ec6-4aba-90f5-f5bca4ff9d4c</id><parentGroupId>f2cfcff1-1730-4250-9535-f84a5af7d2a4</parentGroupId><backPressureDataSizeThreshold>0 MB</backPressureDataSizeThreshold><backPressureObjectThreshold>0</backPressureObjectThreshold><destination><groupId>f2cfcff1-1730-4250-9535-f84a5af7d2a4</groupId><id>429b3019-66fa-49db-8486-cb97d775b6d5</id><type>PROCESSOR</type></destination><flowFileExpiration>0 sec</flowFileExpiration><labelIndex>1</labelIndex><name></name><selectedRelationships>failure</selectedRelationships><source><groupId>f2cfcff1-1730-4250-9535-f84a5af7d2a4</groupId><id>9d2d13b7-0589-4b1d-8ca5-60f590ad8958</id><type>PROCESSOR</type></source><zIndex>0</zIndex></connections><connections><id>95aa1765-a306-4e9e-8bc3-d0077a51c047</id><parentGroupId>f2cfcff1-1730-4250-9535-f84a5af7d2a4</parentGroupId><backPressureDataSizeThreshold>0 MB</backPressureDataSizeThreshold><backPressureObjectThreshold>0</backPressureObjectThreshold><destination><groupId>f2cfcff1-1730-4250-9535-f84a5af7d2a4</groupId><id>9d2d13b7-0589-4b1d-8ca5-60f590ad8958</id><type>PROCESSOR</type></destination><flowFileExpiration>0 sec</flowFileExpiration><labelIndex>1</labelIndex><name></name><selectedRelationships>success</selectedRelationships><source><groupId>f2cfcff1-1730-4250-9535-f84a5af7d2a4</groupId><id>e4c5bf97-e1da-47f8-9ec5-dfaa8a4a1a1a</id><type>PROCESSOR</type></source><zIndex>0</zIndex></connections><connections><id>0fa75062-9e54-4b54-a1b1-afe5588e1673</id><parentGroupId>f2cfcff1-1730-4250-9535-f84a5af7d2a4</parentGroupId><backPressureDataSizeThreshold>0 MB</backPressureDataSizeThreshold><backPressureObjectThreshold>0</backPressureObjectThreshold><destination><groupId>f2cfcff1-1730-4250-9535-f84a5af7d2a4</groupId><id>b3be39b2-7b28-48cb-b225-455cdbbf61c2</id><type>PROCESSOR</type></destination><flowFileExpiration>0 sec</flowFileExpiration><labelIndex>1</labelIndex><name></name><selectedRelationships>success</selectedRelationships><source><groupId>f2cfcff1-1730-4250-9535-f84a5af7d2a4</groupId><id>9d2d13b7-0589-4b1d-8ca5-60f590ad8958</id><type>PROCESSOR</type></source><zIndex>0</zIndex></connections><connections><id>2c05640f-abcd-42bf-b67a-f22af85482be</id><parentGroupId>f2cfcff1-1730-4250-9535-f84a5af7d2a4</parentGroupId><backPressureDataSizeThreshold>0 MB</backPressureDataSizeThreshold><backPressureObjectThreshold>0</backPressureObjectThreshold><destination><groupId>f2cfcff1-1730-4250-9535-f84a5af7d2a4</groupId><id>429b3019-66fa-49db-8486-cb97d775b6d5</id><type>PROCESSOR</type></destination><flowFileExpiration>0 sec</flowFileExpiration><labelIndex>1</labelIndex><name></name><selectedRelationships>failure</selectedRelationships><source><groupId>f2cfcff1-1730-4250-9535-f84a5af7d2a4</groupId><id>b3be39b2-7b28-48cb-b225-455cdbbf61c2</id><type>PROCESSOR</type></source><zIndex>0</zIndex></connections><processors><id>b3be39b2-7b28-48cb-b225-455cdbbf61c2</id><parentGroupId>f2cfcff1-1730-4250-9535-f84a5af7d2a4</parentGroupId><position><x>1400.232177734375</x><y>522.232177734375</y></position><config><bulletinLevel>WARN</bulletinLevel><comments></comments><concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount><defaultConcurrentTasks><entry><key>TIMER_DRIVEN</key><value>1</value></entry><entry><key>EVENT_DRIVEN</key><value>0</value></entry><entry><key>CRON_DRIVEN</key><value>1</value></entry></defaultConcurrentTasks><defaultSchedulingPeriod><entry><key>TIMER_DRIVEN</key><value>0 sec</value></entry><entry><key>CRON_DRIVEN</key><value>* * * * * ?</value></entry></defaultSchedulingPeriod><descriptors><entry><key>Directory</key><value><description>The directory to which files should be written. You may use expression language such as /aa/bb/${path}</description><displayName>Directory</displayName><dynamic>false</dynamic><name>Directory</name><required>true</required><sensitive>false</sensitive><supportsEl>true</supportsEl></value></entry><entry><key>Conflict Resolution Strategy</key><value><allowableValues><displayName>replace</displayName><value>replace</value></allowableValues><allowableValues><displayName>ignore</displayName><value>ignore</value></allowableValues><allowableValues><displayName>fail</displayName><value>fail</value></allowableValues><defaultValue>fail</defaultValue><description>Indicates what should happen when a file with the same name already exists in the output directory</description><displayName>Conflict Resolution Strategy</displayName><dynamic>false</dynamic><name>Conflict Resolution Strategy</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Create Missing Directories</key><value><allowableValues><displayName>true</displayName><value>true</value></allowableValues><allowableValues><displayName>false</displayName><value>false</value></allowableValues><defaultValue>true</defaultValue><description>If true, then missing destination directories will be created. If false, flowfiles are penalized and sent to failure.</description><displayName>Create Missing Directories</displayName><dynamic>false</dynamic><name>Create Missing Directories</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Maximum File Count</key><value><description>Specifies the maximum number of files that can exist in the output directory</description><displayName>Maximum File Count</displayName><dynamic>false</dynamic><name>Maximum File Count</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Last Modified Time</key><value><description>Sets the lastModifiedTime on the output file to the value of this attribute. Format must be yyyy-MM-dd'T'HH:mm:ssZ. You may also use expression language such as ${file.lastModifiedTime}.</description><displayName>Last Modified Time</displayName><dynamic>false</dynamic><name>Last Modified Time</name><required>false</required><sensitive>false</sensitive><supportsEl>true</supportsEl></value></entry><entry><key>Permissions</key><value><description>Sets the permissions on the output file to the value of this attribute. Format must be either UNIX rwxrwxrwx with a - in place of denied permissions (e.g. rw-r--r--) or an octal number (e.g. 644). You may also use expression language such as ${file.permissions}.</description><displayName>Permissions</displayName><dynamic>false</dynamic><name>Permissions</name><required>false</required><sensitive>false</sensitive><supportsEl>true</supportsEl></value></entry><entry><key>Owner</key><value><description>Sets the owner on the output file to the value of this attribute. You may also use expression language such as ${file.owner}.</description><displayName>Owner</displayName><dynamic>false</dynamic><name>Owner</name><required>false</required><sensitive>false</sensitive><supportsEl>true</supportsEl></value></entry><entry><key>Group</key><value><description>Sets the group on the output file to the value of this attribute. You may also use expression language such as ${file.group}.</description><displayName>Group</displayName><dynamic>false</dynamic><name>Group</name><required>false</required><sensitive>false</sensitive><supportsEl>true</supportsEl></value></entry></descriptors><lossTolerant>false</lossTolerant><penaltyDuration>30 sec</penaltyDuration><properties><entry><key>Directory</key><value>/Users/mburgess/tmp</value></entry><entry><key>Conflict Resolution Strategy</key><value>replace</value></entry><entry><key>Create Missing Directories</key></entry><entry><key>Maximum File Count</key></entry><entry><key>Last Modified Time</key></entry><entry><key>Permissions</key></entry><entry><key>Owner</key></entry><entry><key>Group</key></entry></properties><runDurationMillis>0</runDurationMillis><schedulingPeriod>0 sec</schedulingPeriod><schedulingStrategy>TIMER_DRIVEN</schedulingStrategy><yieldDuration>1 sec</yieldDuration></config><name>Write PDF Text to File</name><relationships><autoTerminate>false</autoTerminate><description>Files that could not be written to the output directory for some reason are transferred to this relationship</description><name>failure</name></relationships><relationships><autoTerminate>true</autoTerminate><description>Files that have been successfully written to the output directory are transferred to this relationship</description><name>success</name></relationships><state>STOPPED</state><style/><supportsEventDriven>false</supportsEventDriven><supportsParallelProcessing>true</supportsParallelProcessing><type>org.apache.nifi.processors.standard.PutFile</type></processors><processors><id>9d2d13b7-0589-4b1d-8ca5-60f590ad8958</id><parentGroupId>f2cfcff1-1730-4250-9535-f84a5af7d2a4</parentGroupId><position><x>914.3751220703125</x><y>517.232177734375</y></position><config><bulletinLevel>WARN</bulletinLevel><comments></comments><concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount><defaultConcurrentTasks><entry><key>TIMER_DRIVEN</key><value>1</value></entry><entry><key>EVENT_DRIVEN</key><value>0</value></entry><entry><key>CRON_DRIVEN</key><value>1</value></entry></defaultConcurrentTasks><defaultSchedulingPeriod><entry><key>TIMER_DRIVEN</key><value>0 sec</value></entry><entry><key>CRON_DRIVEN</key><value>* * * * * ?</value></entry></defaultSchedulingPeriod><descriptors><entry><key>Script Engine</key><value><allowableValues><displayName>ECMAScript</displayName><value>ECMAScript</value></allowableValues><allowableValues><displayName>Groovy</displayName><value>Groovy</value></allowableValues><allowableValues><displayName>lua</displayName><value>lua</value></allowableValues><allowableValues><displayName>python</displayName><value>python</value></allowableValues><allowableValues><displayName>ruby</displayName><value>ruby</value></allowableValues><defaultValue>ECMAScript</defaultValue><description>The engine to execute scripts</description><displayName>Script Engine</displayName><dynamic>false</dynamic><name>Script Engine</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Script File</key><value><description>Path to script file to execute. Only one of Script File or Script Body may be used</description><displayName>Script File</displayName><dynamic>false</dynamic><name>Script File</name><required>false</required><sensitive>false</sensitive><supportsEl>true</supportsEl></value></entry><entry><key>Script Body</key><value><description>Body of script to execute. Only one of Script File or Script Body may be used</description><displayName>Script Body</displayName><dynamic>false</dynamic><name>Script Body</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Module Directory</key><value><description>Comma-separated list of paths to files and/or directories which contain modules required by the script.</description><displayName>Module Directory</displayName><dynamic>false</dynamic><name>Module Directory</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry></descriptors><lossTolerant>false</lossTolerant><penaltyDuration>30 sec</penaltyDuration><properties><entry><key>Script Engine</key><value>Groovy</value></entry><entry><key>Script File</key></entry><entry><key>Script Body</key><value>import org.apache.pdfbox.pdmodel.*
import org.apache.pdfbox.util.*
def flowFile = session.get()
if(!flowFile) return
def s = new PDFTextStripper()
def doc, info
flowFile = session.write(flowFile, {inputStream, outputStream -&gt;
doc = PDDocument.load(inputStream)
info = doc.getDocumentInformation()
s.writeText(doc, new OutputStreamWriter(outputStream))
} as StreamCallback
)
flowFile = session.putAttribute(flowFile, 'pdf.page.count', &quot;${doc.getNumberOfPages()}&quot;)
flowFile = session.putAttribute(flowFile, 'pdf.title', &quot;${info.getTitle()}&quot; )
flowFile = session.putAttribute(flowFile, 'pdf.author',&quot;${info.getAuthor()}&quot; );
flowFile = session.putAttribute(flowFile, 'pdf.subject', &quot;${info.getSubject()}&quot; );
flowFile = session.putAttribute(flowFile, 'pdf.keywords', &quot;${info.getKeywords()}&quot; );
flowFile = session.putAttribute(flowFile, 'pdf.creator', &quot;${info.getCreator()}&quot; );
flowFile = session.putAttribute(flowFile, 'pdf.producer', &quot;${info.getProducer()}&quot; );
flowFile = session.putAttribute(flowFile, 'pdf.date.creation', &quot;${info.getCreationDate()}&quot; );
flowFile = session.putAttribute(flowFile, 'pdf.date.modified', &quot;${info.getModificationDate()}&quot;);
flowFile = session.putAttribute(flowFile, 'pdf.trapped', &quot;${info.getTrapped()}&quot; );
session.transfer(flowFile, REL_SUCCESS)</value></entry><entry><key>Module Directory</key><value>/Users/mburgess/Downloads/pdfbox</value></entry></properties><runDurationMillis>0</runDurationMillis><schedulingPeriod>0 sec</schedulingPeriod><schedulingStrategy>TIMER_DRIVEN</schedulingStrategy><yieldDuration>1 sec</yieldDuration></config><name>Extract Text &amp; Metadata from PDF</name><relationships><autoTerminate>false</autoTerminate><description>FlowFiles that failed to be processed</description><name>failure</name></relationships><relationships><autoTerminate>false</autoTerminate><description>FlowFiles that were successfully processed</description><name>success</name></relationships><state>STOPPED</state><style/><supportsEventDriven>false</supportsEventDriven><supportsParallelProcessing>false</supportsParallelProcessing><type>org.apache.nifi.processors.script.ExecuteScript</type></processors><processors><id>96e61f17-5b0c-4746-a221-daf286bc367f</id><parentGroupId>f2cfcff1-1730-4250-9535-f84a5af7d2a4</parentGroupId><position><x>1301.0876349249268</x><y>334.94251929651693</y></position><config><bulletinLevel>WARN</bulletinLevel><comments></comments><concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount><defaultConcurrentTasks><entry><key>TIMER_DRIVEN</key><value>1</value></entry><entry><key>EVENT_DRIVEN</key><value>0</value></entry><entry><key>CRON_DRIVEN</key><value>1</value></entry></defaultConcurrentTasks><defaultSchedulingPeriod><entry><key>TIMER_DRIVEN</key><value>0 sec</value></entry><entry><key>CRON_DRIVEN</key><value>* * * * * ?</value></entry></defaultSchedulingPeriod><descriptors><entry><key>Log Level</key><value><allowableValues><displayName>trace</displayName><value>trace</value></allowableValues><allowableValues><displayName>debug</displayName><value>debug</value></allowableValues><allowableValues><displayName>info</displayName><value>info</value></allowableValues><allowableValues><displayName>warn</displayName><value>warn</value></allowableValues><allowableValues><displayName>error</displayName><value>error</value></allowableValues><defaultValue>info</defaultValue><description>The Log Level to use when logging the Attributes</description><displayName>Log Level</displayName><dynamic>false</dynamic><name>Log Level</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Log Payload</key><value><allowableValues><displayName>true</displayName><value>true</value></allowableValues><allowableValues><displayName>false</displayName><value>false</value></allowableValues><defaultValue>false</defaultValue><description>If true, the FlowFile's payload will be logged, in addition to its attributes; otherwise, just the Attributes will be logged.</description><displayName>Log Payload</displayName><dynamic>false</dynamic><name>Log Payload</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Attributes to Log</key><value><description>A comma-separated list of Attributes to Log. If not specified, all attributes will be logged.</description><displayName>Attributes to Log</displayName><dynamic>false</dynamic><name>Attributes to Log</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Attributes to Ignore</key><value><description>A comma-separated list of Attributes to ignore. If not specified, no attributes will be ignored.</description><displayName>Attributes to Ignore</displayName><dynamic>false</dynamic><name>Attributes to Ignore</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Log prefix</key><value><description>Log prefix appended to the log lines. It helps to distinguish the output of multiple LogAttribute processors.</description><displayName>Log prefix</displayName><dynamic>false</dynamic><name>Log prefix</name><required>false</required><sensitive>false</sensitive><supportsEl>true</supportsEl></value></entry></descriptors><lossTolerant>false</lossTolerant><penaltyDuration>30 sec</penaltyDuration><properties><entry><key>Log Level</key></entry><entry><key>Log Payload</key></entry><entry><key>Attributes to Log</key><value>pdf.page.count,pdf.title,pdf.subject,pdf.keywords,pdf.creator,pdf.producer,pdf.date.creation,pdf.date.modified,pdf.trapped</value></entry><entry><key>Attributes to Ignore</key></entry><entry><key>Log prefix</key></entry></properties><runDurationMillis>0</runDurationMillis><schedulingPeriod>0 sec</schedulingPeriod><schedulingStrategy>TIMER_DRIVEN</schedulingStrategy><yieldDuration>1 sec</yieldDuration></config><name>LogAttribute</name><relationships><autoTerminate>true</autoTerminate><description>All FlowFiles are routed to this relationship</description><name>success</name></relationships><state>STOPPED</state><style/><supportsEventDriven>true</supportsEventDriven><supportsParallelProcessing>true</supportsParallelProcessing><type>org.apache.nifi.processors.standard.LogAttribute</type></processors><processors><id>429b3019-66fa-49db-8486-cb97d775b6d5</id><parentGroupId>f2cfcff1-1730-4250-9535-f84a5af7d2a4</parentGroupId><position><x>1141.3285089128383</x><y>741.0675080376811</y></position><config><bulletinLevel>WARN</bulletinLevel><comments></comments><concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount><defaultConcurrentTasks><entry><key>TIMER_DRIVEN</key><value>1</value></entry><entry><key>EVENT_DRIVEN</key><value>0</value></entry><entry><key>CRON_DRIVEN</key><value>1</value></entry></defaultConcurrentTasks><defaultSchedulingPeriod><entry><key>TIMER_DRIVEN</key><value>0 sec</value></entry><entry><key>CRON_DRIVEN</key><value>* * * * * ?</value></entry></defaultSchedulingPeriod><descriptors><entry><key>Log Level</key><value><allowableValues><displayName>trace</displayName><value>trace</value></allowableValues><allowableValues><displayName>debug</displayName><value>debug</value></allowableValues><allowableValues><displayName>info</displayName><value>info</value></allowableValues><allowableValues><displayName>warn</displayName><value>warn</value></allowableValues><allowableValues><displayName>error</displayName><value>error</value></allowableValues><defaultValue>info</defaultValue><description>The Log Level to use when logging the Attributes</description><displayName>Log Level</displayName><dynamic>false</dynamic><name>Log Level</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Log Payload</key><value><allowableValues><displayName>true</displayName><value>true</value></allowableValues><allowableValues><displayName>false</displayName><value>false</value></allowableValues><defaultValue>false</defaultValue><description>If true, the FlowFile's payload will be logged, in addition to its attributes; otherwise, just the Attributes will be logged.</description><displayName>Log Payload</displayName><dynamic>false</dynamic><name>Log Payload</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Attributes to Log</key><value><description>A comma-separated list of Attributes to Log. If not specified, all attributes will be logged.</description><displayName>Attributes to Log</displayName><dynamic>false</dynamic><name>Attributes to Log</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Attributes to Ignore</key><value><description>A comma-separated list of Attributes to ignore. If not specified, no attributes will be ignored.</description><displayName>Attributes to Ignore</displayName><dynamic>false</dynamic><name>Attributes to Ignore</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Log prefix</key><value><description>Log prefix appended to the log lines. It helps to distinguish the output of multiple LogAttribute processors.</description><displayName>Log prefix</displayName><dynamic>false</dynamic><name>Log prefix</name><required>false</required><sensitive>false</sensitive><supportsEl>true</supportsEl></value></entry></descriptors><lossTolerant>false</lossTolerant><penaltyDuration>30 sec</penaltyDuration><properties><entry><key>Log Level</key></entry><entry><key>Log Payload</key></entry><entry><key>Attributes to Log</key><value>filename</value></entry><entry><key>Attributes to Ignore</key></entry><entry><key>Log prefix</key></entry></properties><runDurationMillis>0</runDurationMillis><schedulingPeriod>0 sec</schedulingPeriod><schedulingStrategy>TIMER_DRIVEN</schedulingStrategy><yieldDuration>1 sec</yieldDuration></config><name>LogFailedFilenames</name><relationships><autoTerminate>true</autoTerminate><description>All FlowFiles are routed to this relationship</description><name>success</name></relationships><state>STOPPED</state><style/><supportsEventDriven>true</supportsEventDriven><supportsParallelProcessing>true</supportsParallelProcessing><type>org.apache.nifi.processors.standard.LogAttribute</type></processors><processors><id>e4c5bf97-e1da-47f8-9ec5-dfaa8a4a1a1a</id><parentGroupId>f2cfcff1-1730-4250-9535-f84a5af7d2a4</parentGroupId><position><x>390.0</x><y>521.0</y></position><config><bulletinLevel>WARN</bulletinLevel><comments></comments><concurrentlySchedulableTaskCount>1</concurrentlySchedulableTaskCount><defaultConcurrentTasks><entry><key>TIMER_DRIVEN</key><value>1</value></entry><entry><key>EVENT_DRIVEN</key><value>0</value></entry><entry><key>CRON_DRIVEN</key><value>1</value></entry></defaultConcurrentTasks><defaultSchedulingPeriod><entry><key>TIMER_DRIVEN</key><value>0 sec</value></entry><entry><key>CRON_DRIVEN</key><value>* * * * * ?</value></entry></defaultSchedulingPeriod><descriptors><entry><key>Input Directory</key><value><description>The input directory from which to pull files</description><displayName>Input Directory</displayName><dynamic>false</dynamic><name>Input Directory</name><required>true</required><sensitive>false</sensitive><supportsEl>true</supportsEl></value></entry><entry><key>File Filter</key><value><defaultValue>[^\.].*</defaultValue><description>Only files whose names match the given regular expression will be picked up</description><displayName>File Filter</displayName><dynamic>false</dynamic><name>File Filter</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Path Filter</key><value><description>When Recurse Subdirectories is true, then only subdirectories whose path matches the given regular expression will be scanned</description><displayName>Path Filter</displayName><dynamic>false</dynamic><name>Path Filter</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Batch Size</key><value><defaultValue>10</defaultValue><description>The maximum number of files to pull in each iteration</description><displayName>Batch Size</displayName><dynamic>false</dynamic><name>Batch Size</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Keep Source File</key><value><allowableValues><displayName>true</displayName><value>true</value></allowableValues><allowableValues><displayName>false</displayName><value>false</value></allowableValues><defaultValue>false</defaultValue><description>If true, the file is not deleted after it has been copied to the Content Repository; this causes the file to be picked up continually and is useful for testing purposes. If not keeping original NiFi will need write permissions on the directory it is pulling from otherwise it will ignore the file.</description><displayName>Keep Source File</displayName><dynamic>false</dynamic><name>Keep Source File</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Recurse Subdirectories</key><value><allowableValues><displayName>true</displayName><value>true</value></allowableValues><allowableValues><displayName>false</displayName><value>false</value></allowableValues><defaultValue>true</defaultValue><description>Indicates whether or not to pull files from subdirectories</description><displayName>Recurse Subdirectories</displayName><dynamic>false</dynamic><name>Recurse Subdirectories</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Polling Interval</key><value><defaultValue>0 sec</defaultValue><description>Indicates how long to wait before performing a directory listing</description><displayName>Polling Interval</displayName><dynamic>false</dynamic><name>Polling Interval</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Ignore Hidden Files</key><value><allowableValues><displayName>true</displayName><value>true</value></allowableValues><allowableValues><displayName>false</displayName><value>false</value></allowableValues><defaultValue>true</defaultValue><description>Indicates whether or not hidden files should be ignored</description><displayName>Ignore Hidden Files</displayName><dynamic>false</dynamic><name>Ignore Hidden Files</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Minimum File Age</key><value><defaultValue>0 sec</defaultValue><description>The minimum age that a file must be in order to be pulled; any file younger than this amount of time (according to last modification date) will be ignored</description><displayName>Minimum File Age</displayName><dynamic>false</dynamic><name>Minimum File Age</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Maximum File Age</key><value><description>The maximum age that a file must be in order to be pulled; any file older than this amount of time (according to last modification date) will be ignored</description><displayName>Maximum File Age</displayName><dynamic>false</dynamic><name>Maximum File Age</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Minimum File Size</key><value><defaultValue>0 B</defaultValue><description>The minimum size that a file must be in order to be pulled</description><displayName>Minimum File Size</displayName><dynamic>false</dynamic><name>Minimum File Size</name><required>true</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry><entry><key>Maximum File Size</key><value><description>The maximum size that a file can be in order to be pulled</description><displayName>Maximum File Size</displayName><dynamic>false</dynamic><name>Maximum File Size</name><required>false</required><sensitive>false</sensitive><supportsEl>false</supportsEl></value></entry></descriptors><lossTolerant>false</lossTolerant><penaltyDuration>30 sec</penaltyDuration><properties><entry><key>Input Directory</key><value>/Users/mburgess/Documents</value></entry><entry><key>File Filter</key><value>[^\.].*\.pdf</value></entry><entry><key>Path Filter</key></entry><entry><key>Batch Size</key><value>100</value></entry><entry><key>Keep Source File</key><value>true</value></entry><entry><key>Recurse Subdirectories</key></entry><entry><key>Polling Interval</key></entry><entry><key>Ignore Hidden Files</key></entry><entry><key>Minimum File Age</key></entry><entry><key>Maximum File Age</key></entry><entry><key>Minimum File Size</key></entry><entry><key>Maximum File Size</key></entry></properties><runDurationMillis>0</runDurationMillis><schedulingPeriod>5 sec</schedulingPeriod><schedulingStrategy>TIMER_DRIVEN</schedulingStrategy><yieldDuration>1 sec</yieldDuration></config><name>Get PDFs</name><relationships><autoTerminate>false</autoTerminate><description>All files are routed to success</description><name>success</name></relationships><state>STOPPED</state><style/><supportsEventDriven>false</supportsEventDriven><supportsParallelProcessing>true</supportsParallelProcessing><type>org.apache.nifi.processors.standard.GetFile</type></processors></snippet><timestamp>02/19/2016 20:19:40 EST</timestamp></template>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment