Skip to content

Instantly share code, notes, and snippets.

@josnidhin
Created July 22, 2015 02:53
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save josnidhin/48d1914158c26d9520f5 to your computer and use it in GitHub Desktop.
Save josnidhin/48d1914158c26d9520f5 to your computer and use it in GitHub Desktop.
A sample AWS Data Pipeline configuration that loads CSV data from S3 into DynamoDB.
{
  "objects": [
    {
      "id": "DataNodeId_h3tjF",
      "name": "OutputDynamoDBTable",
      "type": "DynamoDBDataNode",
      "region": "eu-west-1",
      "tableName": "test-import-table",
      "writeThroughputPercent": "0.95",
      "dataFormat": {
        "ref": "DynamoDBDataFormatId_suGD7"
      },
      "schedule": {
        "ref": "DefaultSchedule"
      }
    },
    {
      "id": "DataFormatId_yYuAT",
      "name": "InputCSVFormat",
      "type": "CSV",
      "column": [
        "col_a string",
        "col_b string",
        "col_c string",
        "col_d string",
        "col_e string",
        "col_f string",
        "col_g string",
        "col_h bigint",
        "col_i bigint"
      ]
    },
    {
      "id": "DataNodeId_KlnIq",
      "name": "InputS3Data",
      "type": "S3DataNode",
      "directoryPath": "s3://dynamodb-imports/test",
      "dataFormat": {
        "ref": "DataFormatId_yYuAT"
      },
      "schedule": {
        "ref": "DefaultSchedule"
      }
    },
    {
      "id": "Default",
      "name": "Default",
      "scheduleType": "cron",
      "failureAndRerunMode": "CASCADE",
      "role": "DataPipelineDefaultRole",
      "resourceRole": "DataPipelineDefaultResourceRole",
      "pipelineLogUri": "s3://dynamodb-import-logs/",
      "schedule": {
        "ref": "DefaultSchedule"
      }
    },
    {
      "id": "ActivityId_lGBRA",
      "name": "CSVImportActivity",
      "type": "HiveActivity",
      "input": {
        "ref": "DataNodeId_KlnIq"
      },
      "output": {
        "ref": "DataNodeId_h3tjF"
      },
      "hiveScript": "INSERT OVERWRITE TABLE ${output1} SELECT * FROM ${input1};",
      "runsOn": {
        "ref": "EmrClusterId_gxpFR"
      },
      "schedule": {
        "ref": "DefaultSchedule"
      }
    },
    {
      "id": "DefaultSchedule",
      "name": "RunOnce",
      "type": "Schedule",
      "period": "1 Day",
      "occurrences": "1",
      "startAt": "FIRST_ACTIVATION_DATE_TIME"
    },
    {
      "id": "EmrClusterId_gxpFR",
      "name": "TestCSVImportCluster",
      "type": "EmrCluster",
      "region": "eu-west-1",
      "amiVersion": "3.8.0",
      "masterInstanceType": "m1.medium",
      "coreInstanceType": "m1.medium",
      "coreInstanceCount": "1",
      "terminateAfter": "10 Hours",
      "schedule": {
        "ref": "DefaultSchedule"
      }
    },
    {
      "id": "DynamoDBDataFormatId_suGD7",
      "name": "OutputDynamoDBDataFormat",
      "type": "DynamoDBDataFormat",
      "column": [
        "col_a string",
        "col_b string",
        "col_c string",
        "col_d string",
        "col_e string",
        "col_f string",
        "col_g string",
        "col_h bigint",
        "col_i bigint"
      ]
    }
  ],
  "parameters": []
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment