Skip to content

Instantly share code, notes, and snippets.

@metadaddy
Last active April 11, 2017 23:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save metadaddy/9dff892eafb7b498696d85fb0f061ccd to your computer and use it in GitHub Desktop.
Save metadaddy/9dff892eafb7b498696d85fb0f061ccd to your computer and use it in GitHub Desktop.
Manipulate fields in StreamSets Data Collector - download and import into SDC 2.4.0.0 or above!
{
"pipelineConfig" : {
"schemaVersion" : 2,
"version" : 5,
"uuid" : "e027d7d1-12db-4d9a-8f9a-827937c7c4df",
"title" : "Field Manipulations",
"description" : "",
"configuration" : [ {
"name" : "executionMode",
"value" : "STANDALONE"
}, {
"name" : "deliveryGuarantee",
"value" : "AT_LEAST_ONCE"
}, {
"name" : "shouldRetry",
"value" : true
}, {
"name" : "retryAttempts",
"value" : -1
}, {
"name" : "memoryLimit",
"value" : "${jvm:maxMemoryMB() * 0.65}"
}, {
"name" : "memoryLimitExceeded",
"value" : "STOP_PIPELINE"
}, {
"name" : "notifyOnStates",
"value" : [ "RUN_ERROR", "STOPPED", "FINISHED" ]
}, {
"name" : "emailIDs",
"value" : [ ]
}, {
"name" : "constants",
"value" : [ ]
}, {
"name" : "badRecordsHandling",
"value" : "streamsets-datacollector-basic-lib::com_streamsets_pipeline_stage_destination_devnull_ToErrorNullDTarget::1"
}, {
"name" : "clusterSlaveMemory",
"value" : 1024
}, {
"name" : "clusterSlaveJavaOpts",
"value" : "-XX:PermSize=128M -XX:MaxPermSize=256M -Dhttps.protocols=TLSv1.2,TLSv1.1 -Dlog4j.debug"
}, {
"name" : "clusterLauncherEnv",
"value" : [ ]
}, {
"name" : "mesosDispatcherURL",
"value" : null
}, {
"name" : "hdfsS3ConfDir",
"value" : null
}, {
"name" : "rateLimit",
"value" : 0
}, {
"name" : "statsAggregatorStage",
"value" : ""
} ],
"uiInfo" : {
"previewConfig" : {
"previewSource" : "CONFIGURED_SOURCE",
"batchSize" : 10,
"timeout" : 10000,
"writeToDestinations" : false,
"showHeader" : false,
"showFieldType" : true,
"rememberMe" : false
}
},
"stages" : [ {
"instanceName" : "DevRawDataSource_01",
"library" : "streamsets-datacollector-dev-lib",
"stageName" : "com_streamsets_pipeline_stage_devtest_rawdata_RawDataDSource",
"stageVersion" : "2",
"configuration" : [ {
"name" : "dataFormat",
"value" : "JSON"
}, {
"name" : "dataFormatConfig.compression",
"value" : "NONE"
}, {
"name" : "dataFormatConfig.filePatternInArchive",
"value" : "*"
}, {
"name" : "dataFormatConfig.charset",
"value" : "UTF-8"
}, {
"name" : "dataFormatConfig.removeCtrlChars",
"value" : false
}, {
"name" : "dataFormatConfig.textMaxLineLen",
"value" : 1024
}, {
"name" : "dataFormatConfig.useCustomDelimiter",
"value" : false
}, {
"name" : "dataFormatConfig.customDelimiter",
"value" : "\\r\\n"
}, {
"name" : "dataFormatConfig.includeCustomDelimiterInTheText",
"value" : false
}, {
"name" : "dataFormatConfig.jsonContent",
"value" : "MULTIPLE_OBJECTS"
}, {
"name" : "dataFormatConfig.jsonMaxObjectLen",
"value" : 4096
}, {
"name" : "dataFormatConfig.csvFileFormat",
"value" : "CSV"
}, {
"name" : "dataFormatConfig.csvHeader",
"value" : "NO_HEADER"
}, {
"name" : "dataFormatConfig.csvMaxObjectLen",
"value" : 1024
}, {
"name" : "dataFormatConfig.csvCustomDelimiter",
"value" : "|"
}, {
"name" : "dataFormatConfig.csvCustomEscape",
"value" : "\\"
}, {
"name" : "dataFormatConfig.csvCustomQuote",
"value" : "\""
}, {
"name" : "dataFormatConfig.csvEnableComments",
"value" : false
}, {
"name" : "dataFormatConfig.csvCommentMarker",
"value" : "#"
}, {
"name" : "dataFormatConfig.csvIgnoreEmptyLines",
"value" : true
}, {
"name" : "dataFormatConfig.csvRecordType",
"value" : "LIST_MAP"
}, {
"name" : "dataFormatConfig.csvSkipStartLines",
"value" : 0
}, {
"name" : "dataFormatConfig.parseNull",
"value" : false
}, {
"name" : "dataFormatConfig.nullConstant",
"value" : "\\\\N"
}, {
"name" : "dataFormatConfig.xmlRecordElement",
"value" : null
}, {
"name" : "dataFormatConfig.xPathNamespaceContext",
"value" : [ ]
}, {
"name" : "dataFormatConfig.xmlMaxObjectLen",
"value" : 4096
}, {
"name" : "dataFormatConfig.logMode",
"value" : "COMMON_LOG_FORMAT"
}, {
"name" : "dataFormatConfig.logMaxObjectLen",
"value" : 1024
}, {
"name" : "dataFormatConfig.retainOriginalLine",
"value" : false
}, {
"name" : "dataFormatConfig.customLogFormat",
"value" : "%h %l %u %t \"%r\" %>s %b"
}, {
"name" : "dataFormatConfig.regex",
"value" : "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(\\S+) (\\S+) (\\S+)\" (\\d{3}) (\\d+)"
}, {
"name" : "dataFormatConfig.fieldPathsToGroupName",
"value" : [ {
"fieldPath" : "/",
"group" : 1
} ]
}, {
"name" : "dataFormatConfig.grokPatternDefinition",
"value" : null
}, {
"name" : "dataFormatConfig.grokPattern",
"value" : "%{COMMONAPACHELOG}"
}, {
"name" : "dataFormatConfig.onParseError",
"value" : "ERROR"
}, {
"name" : "dataFormatConfig.maxStackTraceLines",
"value" : 50
}, {
"name" : "dataFormatConfig.enableLog4jCustomLogFormat",
"value" : false
}, {
"name" : "dataFormatConfig.log4jCustomLogFormat",
"value" : "%r [%t] %-5p %c %x - %m%n"
}, {
"name" : "dataFormatConfig.avroSchemaSource",
"value" : null
}, {
"name" : "dataFormatConfig.avroSchema",
"value" : null
}, {
"name" : "dataFormatConfig.schemaRegistryUrls",
"value" : [ ]
}, {
"name" : "dataFormatConfig.schemaLookupMode",
"value" : "SUBJECT"
}, {
"name" : "dataFormatConfig.subject",
"value" : null
}, {
"name" : "dataFormatConfig.schemaId",
"value" : null
}, {
"name" : "dataFormatConfig.protoDescriptorFile",
"value" : null
}, {
"name" : "dataFormatConfig.messageType",
"value" : null
}, {
"name" : "dataFormatConfig.isDelimited",
"value" : true
}, {
"name" : "dataFormatConfig.binaryMaxObjectLen",
"value" : 1024
}, {
"name" : "dataFormatConfig.datagramMode",
"value" : "SYSLOG"
}, {
"name" : "dataFormatConfig.typesDbPath",
"value" : null
}, {
"name" : "dataFormatConfig.convertTime",
"value" : false
}, {
"name" : "dataFormatConfig.excludeInterval",
"value" : true
}, {
"name" : "dataFormatConfig.authFilePath",
"value" : null
}, {
"name" : "dataFormatConfig.wholeFileMaxObjectLen",
"value" : 8192
}, {
"name" : "dataFormatConfig.rateLimit",
"value" : "-1"
}, {
"name" : "dataFormatConfig.verifyChecksum",
"value" : false
}, {
"name" : "rawData",
"value" : "{\n \"status\": 0,\n \"results\": [\n {\n \"name\": \"StreamSets\",\n \"address\" : {\n \"street\": \"2 Bryant St\",\n \"city\": \"San Francisco\",\n \"state\": \"CA\",\n \"zip\": \"94105\"\n },\n \"phone\": \"(415) 851-1018\"\n },\n {\n \"name\": \"Salesforce\",\n \"address\" : {\n \"street\": \"1 Market St\",\n \"city\": \"San Francisco\",\n \"state\": \"CA\",\n \"zip\": \"94105\"\n },\n \"phone\": \"(415) 901-7000\"\n }\n ]\n}"
}, {
"name" : "stageOnRecordError",
"value" : "TO_ERROR"
} ],
"uiInfo" : {
"description" : "",
"label" : "Dev Raw Data Source 1",
"xPos" : 60,
"yPos" : 50,
"stageType" : "SOURCE"
},
"inputLanes" : [ ],
"outputLanes" : [ "DevRawDataSource_01OutputLane14919318303350" ],
"eventLanes" : [ ]
}, {
"instanceName" : "FieldPivoter_01",
"library" : "streamsets-datacollector-basic-lib",
"stageName" : "com_streamsets_pipeline_stage_processor_listpivot_ListPivotDProcessor",
"stageVersion" : "2",
"configuration" : [ {
"name" : "listPath",
"value" : "/results"
}, {
"name" : "copyFields",
"value" : true
}, {
"name" : "newPath",
"value" : "/"
}, {
"name" : "saveOriginalFieldName",
"value" : false
}, {
"name" : "originalFieldNamePath",
"value" : null
}, {
"name" : "onStagePreConditionFailure",
"value" : "TO_ERROR"
}, {
"name" : "stageOnRecordError",
"value" : "TO_ERROR"
}, {
"name" : "stageRequiredFields",
"value" : [ ]
}, {
"name" : "stageRecordPreconditions",
"value" : [ ]
} ],
"uiInfo" : {
"description" : "",
"label" : "Field Pivoter 1",
"xPos" : 280,
"yPos" : 50,
"stageType" : "PROCESSOR"
},
"inputLanes" : [ "DevRawDataSource_01OutputLane14919318303350" ],
"outputLanes" : [ "FieldPivoter_01OutputLane14919462671300" ],
"eventLanes" : [ ]
}, {
"instanceName" : "FieldFlattener_01",
"library" : "streamsets-datacollector-basic-lib",
"stageName" : "com_streamsets_pipeline_stage_processor_fieldflattener_FieldFlattenerDProcessor",
"stageVersion" : "1",
"configuration" : [ {
"name" : "config.flattenType",
"value" : "ENTIRE_RECORD"
}, {
"name" : "config.fields",
"value" : [ ]
}, {
"name" : "config.nameSeparator",
"value" : "."
}, {
"name" : "stageOnRecordError",
"value" : "TO_ERROR"
}, {
"name" : "stageRequiredFields",
"value" : [ ]
}, {
"name" : "stageRecordPreconditions",
"value" : [ ]
} ],
"uiInfo" : {
"description" : "",
"label" : "Field Flattener 1",
"xPos" : 500,
"yPos" : 50,
"stageType" : "PROCESSOR"
},
"inputLanes" : [ "FieldPivoter_01OutputLane14919462671300" ],
"outputLanes" : [ "FieldFlattener_01OutputLane14919318571250" ],
"eventLanes" : [ ]
}, {
"instanceName" : "FieldRenamer_01",
"library" : "streamsets-datacollector-basic-lib",
"stageName" : "com_streamsets_pipeline_stage_processor_fieldrenamer_FieldRenamerDProcessor",
"stageVersion" : "2",
"configuration" : [ {
"name" : "renameMapping",
"value" : [ {
"fromFieldExpression" : "/'address\\.(.*)'",
"toFieldExpression" : "/$1"
} ]
}, {
"name" : "errorHandler.nonExistingFromFieldHandling",
"value" : "TO_ERROR"
}, {
"name" : "errorHandler.existingToFieldHandling",
"value" : "TO_ERROR"
}, {
"name" : "errorHandler.multipleFromFieldsMatching",
"value" : "TO_ERROR"
}, {
"name" : "stageOnRecordError",
"value" : "TO_ERROR"
}, {
"name" : "stageRequiredFields",
"value" : [ ]
}, {
"name" : "stageRecordPreconditions",
"value" : [ ]
} ],
"uiInfo" : {
"description" : "",
"label" : "Field Renamer 1",
"xPos" : 720,
"yPos" : 50,
"stageType" : "PROCESSOR"
},
"inputLanes" : [ "FieldFlattener_01OutputLane14919318571250" ],
"outputLanes" : [ "FieldRenamer_01OutputLane14919446468120" ],
"eventLanes" : [ ]
}, {
"instanceName" : "FieldSplitter_01",
"library" : "streamsets-datacollector-basic-lib",
"stageName" : "com_streamsets_pipeline_stage_processor_splitter_SplitterDProcessor",
"stageVersion" : "2",
"configuration" : [ {
"name" : "fieldPath",
"value" : "/street"
}, {
"name" : "separator",
"value" : "\\s+"
}, {
"name" : "fieldPathsForSplits",
"value" : [ "/street_number", "/street_name" ]
}, {
"name" : "onStagePreConditionFailure",
"value" : "TO_ERROR"
}, {
"name" : "tooManySplitsAction",
"value" : "TO_LAST_FIELD"
}, {
"name" : "remainingSplitsPath",
"value" : null
}, {
"name" : "originalFieldAction",
"value" : "REMOVE"
}, {
"name" : "stageOnRecordError",
"value" : "TO_ERROR"
}, {
"name" : "stageRequiredFields",
"value" : [ ]
}, {
"name" : "stageRecordPreconditions",
"value" : [ ]
} ],
"uiInfo" : {
"description" : "",
"label" : "Field Splitter 1",
"xPos" : 940,
"yPos" : 50,
"stageType" : "PROCESSOR"
},
"inputLanes" : [ "FieldRenamer_01OutputLane14919446468120" ],
"outputLanes" : [ "FieldSplitter_01OutputLane14919477043810" ],
"eventLanes" : [ ]
}, {
"instanceName" : "Trash_01",
"library" : "streamsets-datacollector-basic-lib",
"stageName" : "com_streamsets_pipeline_stage_destination_devnull_NullDTarget",
"stageVersion" : "1",
"configuration" : [ ],
"uiInfo" : {
"description" : "",
"label" : "Trash 1",
"xPos" : 1160,
"yPos" : 50,
"stageType" : "TARGET"
},
"inputLanes" : [ "FieldSplitter_01OutputLane14919477043810" ],
"outputLanes" : [ ],
"eventLanes" : [ ]
} ],
"errorStage" : {
"instanceName" : "Discard_ErrorStage",
"library" : "streamsets-datacollector-basic-lib",
"stageName" : "com_streamsets_pipeline_stage_destination_devnull_ToErrorNullDTarget",
"stageVersion" : "1",
"configuration" : [ ],
"uiInfo" : {
"description" : "",
"label" : "Error Records - Discard",
"xPos" : 60,
"yPos" : 50,
"stageType" : "TARGET"
},
"inputLanes" : [ ],
"outputLanes" : [ ],
"eventLanes" : [ ]
},
"info" : {
"name" : "85a1b191-b0f0-426e-9d6c-79d24d64a68f",
"title" : "Field Manipulations",
"description" : "",
"created" : 1491944630740,
"lastModified" : 1491949120326,
"creator" : "admin",
"lastModifier" : "admin",
"lastRev" : "0",
"uuid" : "e027d7d1-12db-4d9a-8f9a-827937c7c4df",
"valid" : true,
"metadata" : {
"labels" : [ ]
}
},
"metadata" : {
"labels" : [ ]
},
"statsAggregatorStage" : null,
"previewable" : true,
"issues" : {
"stageIssues" : { },
"pipelineIssues" : [ ],
"issueCount" : 0
},
"valid" : true
},
"pipelineRules" : {
"metricsRuleDefinitions" : [ {
"id" : "badRecordsAlertID",
"alertText" : "High incidence of Error Records",
"metricId" : "pipeline.batchErrorRecords.counter",
"metricType" : "COUNTER",
"metricElement" : "COUNTER_COUNT",
"condition" : "${value() > 100}",
"sendEmail" : false,
"enabled" : false,
"timestamp" : 1491931813412,
"valid" : true
}, {
"id" : "stageErrorAlertID",
"alertText" : "High incidence of Stage Errors",
"metricId" : "pipeline.batchErrorMessages.counter",
"metricType" : "COUNTER",
"metricElement" : "COUNTER_COUNT",
"condition" : "${value() > 100}",
"sendEmail" : false,
"enabled" : false,
"timestamp" : 1491931813412,
"valid" : true
}, {
"id" : "idleGaugeID",
"alertText" : "Pipeline is Idle",
"metricId" : "RuntimeStatsGauge.gauge",
"metricType" : "GAUGE",
"metricElement" : "TIME_OF_LAST_RECEIVED_RECORD",
"condition" : "${time:now() - value() > 120000}",
"sendEmail" : false,
"enabled" : false,
"timestamp" : 1491931813412,
"valid" : true
}, {
"id" : "batchTimeAlertID",
"alertText" : "Batch taking more time to process",
"metricId" : "RuntimeStatsGauge.gauge",
"metricType" : "GAUGE",
"metricElement" : "CURRENT_BATCH_AGE",
"condition" : "${value() > 200}",
"sendEmail" : false,
"enabled" : false,
"timestamp" : 1491931813412,
"valid" : true
}, {
"id" : "memoryLimitAlertID",
"alertText" : "Memory limit for pipeline exceeded",
"metricId" : "pipeline.memoryConsumed.counter",
"metricType" : "COUNTER",
"metricElement" : "COUNTER_COUNT",
"condition" : "${value() > (jvm:maxMemoryMB() * 0.65)}",
"sendEmail" : false,
"enabled" : false,
"timestamp" : 1491931813412,
"valid" : true
} ],
"dataRuleDefinitions" : [ ],
"driftRuleDefinitions" : [ ],
"emailIds" : [ ],
"uuid" : "2f39dbb9-e875-4b47-b3f2-5f42eed23b62",
"ruleIssues" : [ ]
},
"libraryDefinitions" : null
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment