Skip to content

Instantly share code, notes, and snippets.

@mattyb149
Created January 14, 2015 19:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mattyb149/82727922cbfa4ddfff22 to your computer and use it in GitHub Desktop.
Save mattyb149/82727922cbfa4ddfff22 to your computer and use it in GitHub Desktop.
Calculate a rolling distinct count by date in PDI (Pentaho Data Integration)
<?xml version="1.0" encoding="UTF-8"?>
<transformation>
<info>
<!-- Transformation-level metadata: name, type, repository directory, logging
     configuration, row-set and feedback settings, and creation/modification stamps. -->
<name>rolling_distinct</name>
<description/>
<extended_description/>
<trans_version/>
<trans_type>Normal</trans_type>
<directory>&#x2f;</directory>
<!-- No named parameters are declared for this transformation. -->
<parameters>
</parameters>
<!-- Logging configuration. Every log table below has an empty connection, schema and
     table element, so no database logging target is configured in this file. The
     field lists only record which log columns are enabled (Y) or disabled (N). -->
<log>
<!-- Transformation log table. EXECUTING_SERVER, EXECUTING_USER and CLIENT are the
     only fields marked as disabled. -->
<trans-log-table><connection/>
<schema/>
<table/>
<size_limit_lines/>
<interval/>
<timeout_days/>
<field><id>ID_BATCH</id><enabled>Y</enabled><name>ID_BATCH</name></field><field><id>CHANNEL_ID</id><enabled>Y</enabled><name>CHANNEL_ID</name></field><field><id>TRANSNAME</id><enabled>Y</enabled><name>TRANSNAME</name></field><field><id>STATUS</id><enabled>Y</enabled><name>STATUS</name></field><field><id>LINES_READ</id><enabled>Y</enabled><name>LINES_READ</name><subject/></field><field><id>LINES_WRITTEN</id><enabled>Y</enabled><name>LINES_WRITTEN</name><subject/></field><field><id>LINES_UPDATED</id><enabled>Y</enabled><name>LINES_UPDATED</name><subject/></field><field><id>LINES_INPUT</id><enabled>Y</enabled><name>LINES_INPUT</name><subject/></field><field><id>LINES_OUTPUT</id><enabled>Y</enabled><name>LINES_OUTPUT</name><subject/></field><field><id>LINES_REJECTED</id><enabled>Y</enabled><name>LINES_REJECTED</name><subject/></field><field><id>ERRORS</id><enabled>Y</enabled><name>ERRORS</name></field><field><id>STARTDATE</id><enabled>Y</enabled><name>STARTDATE</name></field><field><id>ENDDATE</id><enabled>Y</enabled><name>ENDDATE</name></field><field><id>LOGDATE</id><enabled>Y</enabled><name>LOGDATE</name></field><field><id>DEPDATE</id><enabled>Y</enabled><name>DEPDATE</name></field><field><id>REPLAYDATE</id><enabled>Y</enabled><name>REPLAYDATE</name></field><field><id>LOG_FIELD</id><enabled>Y</enabled><name>LOG_FIELD</name></field><field><id>EXECUTING_SERVER</id><enabled>N</enabled><name>EXECUTING_SERVER</name></field><field><id>EXECUTING_USER</id><enabled>N</enabled><name>EXECUTING_USER</name></field><field><id>CLIENT</id><enabled>N</enabled><name>CLIENT</name></field></trans-log-table>
<!-- Performance log table; all listed fields are enabled. -->
<perf-log-table><connection/>
<schema/>
<table/>
<interval/>
<timeout_days/>
<field><id>ID_BATCH</id><enabled>Y</enabled><name>ID_BATCH</name></field><field><id>SEQ_NR</id><enabled>Y</enabled><name>SEQ_NR</name></field><field><id>LOGDATE</id><enabled>Y</enabled><name>LOGDATE</name></field><field><id>TRANSNAME</id><enabled>Y</enabled><name>TRANSNAME</name></field><field><id>STEPNAME</id><enabled>Y</enabled><name>STEPNAME</name></field><field><id>STEP_COPY</id><enabled>Y</enabled><name>STEP_COPY</name></field><field><id>LINES_READ</id><enabled>Y</enabled><name>LINES_READ</name></field><field><id>LINES_WRITTEN</id><enabled>Y</enabled><name>LINES_WRITTEN</name></field><field><id>LINES_UPDATED</id><enabled>Y</enabled><name>LINES_UPDATED</name></field><field><id>LINES_INPUT</id><enabled>Y</enabled><name>LINES_INPUT</name></field><field><id>LINES_OUTPUT</id><enabled>Y</enabled><name>LINES_OUTPUT</name></field><field><id>LINES_REJECTED</id><enabled>Y</enabled><name>LINES_REJECTED</name></field><field><id>ERRORS</id><enabled>Y</enabled><name>ERRORS</name></field><field><id>INPUT_BUFFER_ROWS</id><enabled>Y</enabled><name>INPUT_BUFFER_ROWS</name></field><field><id>OUTPUT_BUFFER_ROWS</id><enabled>Y</enabled><name>OUTPUT_BUFFER_ROWS</name></field></perf-log-table>
<!-- Logging-channel log table; all listed fields are enabled. -->
<channel-log-table><connection/>
<schema/>
<table/>
<timeout_days/>
<field><id>ID_BATCH</id><enabled>Y</enabled><name>ID_BATCH</name></field><field><id>CHANNEL_ID</id><enabled>Y</enabled><name>CHANNEL_ID</name></field><field><id>LOG_DATE</id><enabled>Y</enabled><name>LOG_DATE</name></field><field><id>LOGGING_OBJECT_TYPE</id><enabled>Y</enabled><name>LOGGING_OBJECT_TYPE</name></field><field><id>OBJECT_NAME</id><enabled>Y</enabled><name>OBJECT_NAME</name></field><field><id>OBJECT_COPY</id><enabled>Y</enabled><name>OBJECT_COPY</name></field><field><id>REPOSITORY_DIRECTORY</id><enabled>Y</enabled><name>REPOSITORY_DIRECTORY</name></field><field><id>FILENAME</id><enabled>Y</enabled><name>FILENAME</name></field><field><id>OBJECT_ID</id><enabled>Y</enabled><name>OBJECT_ID</name></field><field><id>OBJECT_REVISION</id><enabled>Y</enabled><name>OBJECT_REVISION</name></field><field><id>PARENT_CHANNEL_ID</id><enabled>Y</enabled><name>PARENT_CHANNEL_ID</name></field><field><id>ROOT_CHANNEL_ID</id><enabled>Y</enabled><name>ROOT_CHANNEL_ID</name></field></channel-log-table>
<!-- Step log table; LOG_FIELD is the only field marked as disabled. -->
<step-log-table><connection/>
<schema/>
<table/>
<timeout_days/>
<field><id>ID_BATCH</id><enabled>Y</enabled><name>ID_BATCH</name></field><field><id>CHANNEL_ID</id><enabled>Y</enabled><name>CHANNEL_ID</name></field><field><id>LOG_DATE</id><enabled>Y</enabled><name>LOG_DATE</name></field><field><id>TRANSNAME</id><enabled>Y</enabled><name>TRANSNAME</name></field><field><id>STEPNAME</id><enabled>Y</enabled><name>STEPNAME</name></field><field><id>STEP_COPY</id><enabled>Y</enabled><name>STEP_COPY</name></field><field><id>LINES_READ</id><enabled>Y</enabled><name>LINES_READ</name></field><field><id>LINES_WRITTEN</id><enabled>Y</enabled><name>LINES_WRITTEN</name></field><field><id>LINES_UPDATED</id><enabled>Y</enabled><name>LINES_UPDATED</name></field><field><id>LINES_INPUT</id><enabled>Y</enabled><name>LINES_INPUT</name></field><field><id>LINES_OUTPUT</id><enabled>Y</enabled><name>LINES_OUTPUT</name></field><field><id>LINES_REJECTED</id><enabled>Y</enabled><name>LINES_REJECTED</name></field><field><id>ERRORS</id><enabled>Y</enabled><name>ERRORS</name></field><field><id>LOG_FIELD</id><enabled>N</enabled><name>LOG_FIELD</name></field></step-log-table>
<!-- Metrics log table; all listed fields are enabled. -->
<metrics-log-table><connection/>
<schema/>
<table/>
<timeout_days/>
<field><id>ID_BATCH</id><enabled>Y</enabled><name>ID_BATCH</name></field><field><id>CHANNEL_ID</id><enabled>Y</enabled><name>CHANNEL_ID</name></field><field><id>LOG_DATE</id><enabled>Y</enabled><name>LOG_DATE</name></field><field><id>METRICS_DATE</id><enabled>Y</enabled><name>METRICS_DATE</name></field><field><id>METRICS_CODE</id><enabled>Y</enabled><name>METRICS_CODE</name></field><field><id>METRICS_DESCRIPTION</id><enabled>Y</enabled><name>METRICS_DESCRIPTION</name></field><field><id>METRICS_SUBJECT</id><enabled>Y</enabled><name>METRICS_SUBJECT</name></field><field><id>METRICS_TYPE</id><enabled>Y</enabled><name>METRICS_TYPE</name></field><field><id>METRICS_VALUE</id><enabled>Y</enabled><name>METRICS_VALUE</name></field></metrics-log-table>
</log>
<!-- Max-date settings: no connection, table or field is set; offset and maxdiff are 0.0. -->
<maxdate>
<connection/>
<table/>
<field/>
<offset>0.0</offset>
<maxdiff>0.0</maxdiff>
</maxdate>
<!-- Engine tuning values. NOTE(review): presumably size_rowset is the row buffer size
     between steps and the sleep_time values are waits applied when a buffer is
     empty or full; confirm units and meaning against the PDI documentation. -->
<size_rowset>10000</size_rowset>
<sleep_time_empty>50</sleep_time_empty>
<sleep_time_full>50</sleep_time_full>
<unique_connections>N</unique_connections>
<feedback_shown>Y</feedback_shown>
<feedback_size>50000</feedback_size>
<using_thread_priorities>Y</using_thread_priorities>
<shared_objects_file/>
<!-- Step performance capturing is switched off (N); the delay and size limit below
     are still stored alongside it. -->
<capture_step_performance>N</capture_step_performance>
<step_performance_capturing_delay>1000</step_performance_capturing_delay>
<step_performance_capturing_size_limit>100</step_performance_capturing_size_limit>
<!-- No dependencies, partition schemas, slave servers or cluster schemas are defined. -->
<dependencies>
</dependencies>
<partitionschemas>
</partitionschemas>
<slaveservers>
</slaveservers>
<clusterschemas>
</clusterschemas>
<!-- Creation and modification stamps carry the same timestamp; the slash and colon
     characters in the dates are written as numeric character references. -->
<created_user>-</created_user>
<created_date>2015&#x2f;01&#x2f;14 13&#x3a;55&#x3a;37.459</created_date>
<modified_user>-</modified_user>
<modified_date>2015&#x2f;01&#x2f;14 13&#x3a;55&#x3a;37.459</modified_date>
</info>
<notepads>
</notepads>
<order>
  <!-- Hops are listed in their stored order. Followed end to end they form a single
       linear chain:
         Data Grid, Get Year and Month, Sort Field values, Identify repeating values,
         Map >1 to 0, Cast to Int, Sort by Year/Month, Sum by month,
         Rolling Distinct Count, Output.
       Every hop is enabled. -->
  <hop>
    <from>Sort Field values</from>
    <to>Identify repeating values</to>
    <enabled>Y</enabled>
  </hop>
  <hop>
    <from>Identify repeating values</from>
    <to>Map &#x3e;1 to 0</to>
    <enabled>Y</enabled>
  </hop>
  <hop>
    <from>Sort by Year&#x2f;Month</from>
    <to>Sum by month</to>
    <enabled>Y</enabled>
  </hop>
  <hop>
    <from>Data Grid</from>
    <to>Get Year and Month</to>
    <enabled>Y</enabled>
  </hop>
  <hop>
    <from>Get Year and Month</from>
    <to>Sort Field values</to>
    <enabled>Y</enabled>
  </hop>
  <hop>
    <from>Sum by month</from>
    <to>Rolling Distinct Count</to>
    <enabled>Y</enabled>
  </hop>
  <hop>
    <from>Map &#x3e;1 to 0</from>
    <to>Cast to Int</to>
    <enabled>Y</enabled>
  </hop>
  <hop>
    <from>Cast to Int</from>
    <to>Sort by Year&#x2f;Month</to>
    <enabled>Y</enabled>
  </hop>
  <hop>
    <from>Rolling Distinct Count</from>
    <to>Output</to>
    <enabled>Y</enabled>
  </hop>
</order>
<step>
  <!-- Inline sample input for the demo: six rows, each holding a Date (read with the
       mask yyyy-MM-dd) and a String value named Field. -->
  <name>Data Grid</name>
  <type>DataGrid</type>
  <description/>
  <distribute>Y</distribute>
  <custom_distribution/>
  <copies>1</copies>
  <partitioning>
    <method>none</method>
    <schema_name/>
  </partitioning>
  <!-- Column definitions; the items of every data line follow this column order. -->
  <fields>
    <field>
      <name>Date</name>
      <type>Date</type>
      <format>yyyy-MM-dd</format>
      <currency/>
      <decimal/>
      <group/>
      <length>-1</length>
      <precision>-1</precision>
      <set_empty_string>N</set_empty_string>
    </field>
    <field>
      <name>Field</name>
      <type>String</type>
      <format/>
      <currency/>
      <decimal/>
      <group/>
      <length>-1</length>
      <precision>-1</precision>
      <set_empty_string>N</set_empty_string>
    </field>
  </fields>
  <!-- Sample rows: A appears three times, B twice and C once, spread over four months. -->
  <data>
    <line>
      <item>2013-01-01</item>
      <item>A</item>
    </line>
    <line>
      <item>2013-02-05</item>
      <item>B</item>
    </line>
    <line>
      <item>2013-02-06</item>
      <item>A</item>
    </line>
    <line>
      <item>2013-02-07</item>
      <item>A</item>
    </line>
    <line>
      <item>2013-03-02</item>
      <item>C</item>
    </line>
    <line>
      <item>2013-04-03</item>
      <item>B</item>
    </line>
  </data>
  <cluster_schema/>
  <remotesteps>
    <input> </input>
    <output> </output>
  </remotesteps>
  <!-- Canvas position of the step icon in the graphical editor. -->
  <GUI>
    <xloc>61</xloc>
    <yloc>22</yloc>
    <draw>Y</draw>
  </GUI>
</step>
<step>
  <!-- Sorts the stream so that equal Field values are adjacent and, within each
       Field value, rows run from the earliest Date to the latest.

       The next step ("Identify repeating values") numbers the rows of each Field
       run starting at 1, and the row numbered 1 is later counted as the first
       appearance of that value in its month. Sorting on Field alone only puts the
       earliest date first if the input already arrives in date order and the sort
       keeps that order for equal keys. Date is therefore added as an explicit
       secondary key so the result does not depend on input order. For the sample
       data, which is already date-ordered, the output is unchanged. -->
  <name>Sort Field values</name>
  <type>SortRows</type>
  <description/>
  <distribute>Y</distribute>
  <custom_distribution/>
  <copies>1</copies>
  <partitioning>
    <method>none</method>
    <schema_name/>
  </partitioning>
  <!-- Temp-file settings: directory is the java.io.tmpdir variable, file prefix "out". -->
  <directory>&#x25;&#x25;java.io.tmpdir&#x25;&#x25;</directory>
  <prefix>out</prefix>
  <sort_size>1000000</sort_size>
  <free_memory/>
  <compress>N</compress>
  <compress_variable/>
  <unique_rows>N</unique_rows>
  <fields>
    <!-- Primary key: groups identical Field values together. -->
    <field>
      <name>Field</name>
      <ascending>Y</ascending>
      <case_sensitive>N</case_sensitive>
      <presorted>N</presorted>
    </field>
    <!-- Secondary key: earliest Date first within each Field value. The Date field
         is still present here because the upstream Calculator step only adds
         year and month and does not remove it. -->
    <field>
      <name>Date</name>
      <ascending>Y</ascending>
      <case_sensitive>N</case_sensitive>
      <presorted>N</presorted>
    </field>
  </fields>
  <cluster_schema/>
  <remotesteps>
    <input> </input>
    <output> </output>
  </remotesteps>
  <!-- Canvas position of the step icon in the graphical editor. -->
  <GUI>
    <xloc>331</xloc>
    <yloc>23</yloc>
    <draw>Y</draw>
  </GUI>
</step>
<step>
  <!-- Adds the counter field "occurrence", keyed on Field, starting at 1 and stepping
       by 1. Per the PDI "Add value fields changing sequence" step type
       (FieldsChangeSequence), the counter restarts whenever the listed field
       changes, so on input sorted by Field the first row of each value gets 1 and
       repeats get 2, 3, and so on. -->
  <name>Identify repeating values</name>
  <type>FieldsChangeSequence</type>
  <description/>
  <distribute>Y</distribute>
  <custom_distribution/>
  <copies>1</copies>
  <partitioning>
    <method>none</method>
    <schema_name/>
  </partitioning>
  <start>1</start>
  <increment>1</increment>
  <resultfieldName>occurrence</resultfieldName>
  <!-- Field(s) whose change resets the counter. -->
  <fields>
    <field>
      <name>Field</name>
    </field>
  </fields>
  <cluster_schema/>
  <remotesteps>
    <input> </input>
    <output> </output>
  </remotesteps>
  <!-- Canvas position of the step icon in the graphical editor. -->
  <GUI>
    <xloc>64</xloc>
    <yloc>144</yloc>
    <draw>Y</draw>
  </GUI>
</step>
<step>
  <!-- Turns the occurrence counter into a first-occurrence flag stored in
       "first_occurrence": values below 1.01 map to "1", values from 1.01 upward map
       to "0". The split point 1.01 lies strictly between the integers 1 and 2, so
       the result for whole-number counters does not depend on whether the range
       bounds are inclusive or exclusive. Anything matching no rule receives the
       fallback text "unknown". -->
  <name>Map &#x3e;1 to 0</name>
  <type>NumberRange</type>
  <description/>
  <distribute>Y</distribute>
  <custom_distribution/>
  <copies>1</copies>
  <partitioning>
    <method>none</method>
    <schema_name/>
  </partitioning>
  <inputField>occurrence</inputField>
  <outputField>first_occurrence</outputField>
  <fallBackValue>unknown</fallBackValue>
  <rules>
    <!-- Counter of 1 (first time the value is seen): flag 1. -->
    <rule>
      <lower_bound>-1.7976931348623157E308</lower_bound>
      <upper_bound>1.01</upper_bound>
      <value>1</value>
    </rule>
    <!-- Counter of 2 or more (a repeat): flag 0. -->
    <rule>
      <lower_bound>1.01</lower_bound>
      <upper_bound>1.7976931348623157E308</upper_bound>
      <value>0</value>
    </rule>
  </rules>
  <cluster_schema/>
  <remotesteps>
    <input> </input>
    <output> </output>
  </remotesteps>
  <!-- Canvas position of the step icon in the graphical editor. -->
  <GUI>
    <xloc>210</xloc>
    <yloc>144</yloc>
    <draw>Y</draw>
  </GUI>
</step>
<step>
  <!-- Groups rows by (year, month) and sums the first_occurrence flag into
       "occur_per_month", i.e. how many values appear for the first time in that
       month. It is fed by the "Sort by Year/Month" step, which orders the rows on
       the same two group fields. -->
  <name>Sum by month</name>
  <type>GroupBy</type>
  <description/>
  <distribute>Y</distribute>
  <custom_distribution/>
  <copies>1</copies>
  <partitioning>
    <method>none</method>
    <schema_name/>
  </partitioning>
  <!-- all_rows is N here: one output row per group rather than every input row. -->
  <all_rows>N</all_rows>
  <ignore_aggregate>N</ignore_aggregate>
  <field_ignore/>
  <directory>&#x25;&#x25;java.io.tmpdir&#x25;&#x25;</directory>
  <prefix>grp</prefix>
  <add_linenr>N</add_linenr>
  <linenr_fieldname/>
  <give_back_row>N</give_back_row>
  <!-- Grouping keys. -->
  <group>
    <field>
      <name>year</name>
    </field>
    <field>
      <name>month</name>
    </field>
  </group>
  <!-- Aggregates: SUM over first_occurrence, written to occur_per_month. -->
  <fields>
    <field>
      <aggregate>occur_per_month</aggregate>
      <subject>first_occurrence</subject>
      <type>SUM</type>
      <valuefield/>
    </field>
  </fields>
  <cluster_schema/>
  <remotesteps>
    <input> </input>
    <output> </output>
  </remotesteps>
  <!-- Canvas position of the step icon in the graphical editor. -->
  <GUI>
    <xloc>198</xloc>
    <yloc>263</yloc>
    <draw>Y</draw>
  </GUI>
</step>
<step>
  <!-- Re-orders the flagged rows chronologically (year ascending, then month
       ascending) ahead of the per-month aggregation in "Sum by month". -->
  <name>Sort by Year&#x2f;Month</name>
  <type>SortRows</type>
  <description/>
  <distribute>Y</distribute>
  <custom_distribution/>
  <copies>1</copies>
  <partitioning>
    <method>none</method>
    <schema_name/>
  </partitioning>
  <!-- Temp-file settings: directory is the java.io.tmpdir variable, file prefix "out". -->
  <directory>&#x25;&#x25;java.io.tmpdir&#x25;&#x25;</directory>
  <prefix>out</prefix>
  <sort_size>1000000</sort_size>
  <free_memory/>
  <compress>N</compress>
  <compress_variable/>
  <unique_rows>N</unique_rows>
  <!-- Sort keys, in priority order. -->
  <fields>
    <field>
      <name>year</name>
      <ascending>Y</ascending>
      <case_sensitive>N</case_sensitive>
      <presorted>N</presorted>
    </field>
    <field>
      <name>month</name>
      <ascending>Y</ascending>
      <case_sensitive>N</case_sensitive>
      <presorted>N</presorted>
    </field>
  </fields>
  <cluster_schema/>
  <remotesteps>
    <input> </input>
    <output> </output>
  </remotesteps>
  <!-- Canvas position of the step icon in the graphical editor. -->
  <GUI>
    <xloc>56</xloc>
    <yloc>264</yloc>
    <draw>Y</draw>
  </GUI>
</step>
<step>
  <!-- Derives two Integer fields from Date: "year" (YEAR_OF_DATE) and "month"
       (MONTH_OF_DATE). Both calculations have remove set to N, so the new fields
       stay in the output stream. -->
  <name>Get Year and Month</name>
  <type>Calculator</type>
  <description/>
  <distribute>Y</distribute>
  <custom_distribution/>
  <copies>1</copies>
  <partitioning>
    <method>none</method>
    <schema_name/>
  </partitioning>
  <!-- year = year part of Date -->
  <calculation>
    <field_name>year</field_name>
    <calc_type>YEAR_OF_DATE</calc_type>
    <field_a>Date</field_a>
    <field_b/>
    <field_c/>
    <value_type>Integer</value_type>
    <value_length>-1</value_length>
    <value_precision>-1</value_precision>
    <remove>N</remove>
    <conversion_mask/>
    <decimal_symbol/>
    <grouping_symbol/>
    <currency_symbol/>
  </calculation>
  <!-- month = month part of Date -->
  <calculation>
    <field_name>month</field_name>
    <calc_type>MONTH_OF_DATE</calc_type>
    <field_a>Date</field_a>
    <field_b/>
    <field_c/>
    <value_type>Integer</value_type>
    <value_length>-1</value_length>
    <value_precision>-1</value_precision>
    <remove>N</remove>
    <conversion_mask/>
    <decimal_symbol/>
    <grouping_symbol/>
    <currency_symbol/>
  </calculation>
  <cluster_schema/>
  <remotesteps>
    <input> </input>
    <output> </output>
  </remotesteps>
  <!-- Canvas position of the step icon in the graphical editor. -->
  <GUI>
    <xloc>205</xloc>
    <yloc>23</yloc>
    <draw>Y</draw>
  </GUI>
</step>
<step>
  <!-- Produces the running total: a Group By with no group fields, all_rows set to Y
       and a single CUM_SUM aggregate over occur_per_month, written to
       "rolling_distinct". With the monthly rows arriving in chronological order,
       each row ends up carrying the count of distinct values seen up to and
       including its month. -->
  <name>Rolling Distinct Count</name>
  <type>GroupBy</type>
  <description/>
  <distribute>Y</distribute>
  <custom_distribution/>
  <copies>1</copies>
  <partitioning>
    <method>none</method>
    <schema_name/>
  </partitioning>
  <!-- all_rows is Y here, unlike in "Sum by month": every incoming row is passed on. -->
  <all_rows>Y</all_rows>
  <ignore_aggregate>N</ignore_aggregate>
  <field_ignore/>
  <directory>&#x25;&#x25;java.io.tmpdir&#x25;&#x25;</directory>
  <prefix>grp</prefix>
  <add_linenr>N</add_linenr>
  <linenr_fieldname/>
  <give_back_row>N</give_back_row>
  <!-- Intentionally empty: the whole stream is treated as one group. -->
  <group>
  </group>
  <!-- Aggregates: cumulative sum of occur_per_month. -->
  <fields>
    <field>
      <aggregate>rolling_distinct</aggregate>
      <subject>occur_per_month</subject>
      <type>CUM_SUM</type>
      <valuefield/>
    </field>
  </fields>
  <cluster_schema/>
  <remotesteps>
    <input> </input>
    <output> </output>
  </remotesteps>
  <!-- Canvas position of the step icon in the graphical editor. -->
  <GUI>
    <xloc>347</xloc>
    <yloc>262</yloc>
    <draw>Y</draw>
  </GUI>
</step>
<step>
  <!-- Select Values step with two jobs:
       1. keep only year, month and first_occurrence (select_unspecified is N);
       2. change the metadata of first_occurrence to Integer using the conversion
          mask "#", so the flag written as text by the Number Range step can be
          summed downstream. -->
  <name>Cast to Int</name>
  <type>SelectValues</type>
  <description/>
  <distribute>Y</distribute>
  <custom_distribution/>
  <copies>1</copies>
  <partitioning>
    <method>none</method>
    <schema_name/>
  </partitioning>
  <fields>
    <!-- Selected fields, none renamed; length and precision are left at -2. -->
    <field>
      <name>year</name>
      <rename/>
      <length>-2</length>
      <precision>-2</precision>
    </field>
    <field>
      <name>month</name>
      <rename/>
      <length>-2</length>
      <precision>-2</precision>
    </field>
    <field>
      <name>first_occurrence</name>
      <rename/>
      <length>-2</length>
      <precision>-2</precision>
    </field>
    <select_unspecified>N</select_unspecified>
    <!-- Metadata change: first_occurrence becomes an Integer (name unchanged). -->
    <meta>
      <name>first_occurrence</name>
      <rename>first_occurrence</rename>
      <type>Integer</type>
      <length>-2</length>
      <precision>-2</precision>
      <conversion_mask>&#x23;</conversion_mask>
      <date_format_lenient>false</date_format_lenient>
      <date_format_locale/>
      <date_format_timezone/>
      <lenient_string_to_number>false</lenient_string_to_number>
      <encoding/>
      <decimal_symbol/>
      <grouping_symbol/>
      <currency_symbol/>
      <storage_type/>
    </meta>
  </fields>
  <cluster_schema/>
  <remotesteps>
    <input> </input>
    <output> </output>
  </remotesteps>
  <!-- Canvas position of the step icon in the graphical editor. -->
  <GUI>
    <xloc>349</xloc>
    <yloc>144</yloc>
    <draw>Y</draw>
  </GUI>
</step>
<step>
  <!-- Final projection: passes on only year, month and rolling_distinct
       (select_unspecified is N), without renaming any of them. -->
  <name>Output</name>
  <type>SelectValues</type>
  <description/>
  <distribute>Y</distribute>
  <custom_distribution/>
  <copies>1</copies>
  <partitioning>
    <method>none</method>
    <schema_name/>
  </partitioning>
  <fields>
    <field>
      <name>year</name>
      <rename/>
      <length>-2</length>
      <precision>-2</precision>
    </field>
    <field>
      <name>month</name>
      <rename/>
      <length>-2</length>
      <precision>-2</precision>
    </field>
    <field>
      <name>rolling_distinct</name>
      <rename/>
      <length>-2</length>
      <precision>-2</precision>
    </field>
    <select_unspecified>N</select_unspecified>
  </fields>
  <cluster_schema/>
  <remotesteps>
    <input> </input>
    <output> </output>
  </remotesteps>
  <!-- Canvas position of the step icon in the graphical editor. -->
  <GUI>
    <xloc>202</xloc>
    <yloc>347</yloc>
    <draw>Y</draw>
  </GUI>
</step>
<step_error_handling>
</step_error_handling>
<slave-step-copy-partition-distribution>
</slave-step-copy-partition-distribution>
<slave_transformation>N</slave_transformation>
</transformation>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment