@moomindani
Created November 24, 2014 00:28
Import CloudFront access logs into Amazon Redshift using AWS Data Pipeline
{
  "objects": [
    {
      "id": "Default",
      "scheduleType": "TIMESERIES",
      "failureAndRerunMode": "CASCADE",
      "name": "Default",
      "pipelineLogUri": "s3://path_to_log",
      "role": "DataPipelineDefaultRole",
      "resourceRole": "DataPipelineDefaultResourceRole"
    },
    {
      "id": "ScheduleId1",
      "name": "DefaultSchedule1",
      "startAt": "FIRST_ACTIVATION_DATE_TIME",
      "type": "Schedule",
      "period": "15 Minutes"
    },
    {
      "id": "S3DataNodeCloudFrontLog",
      "name": "S3DataNodeCloudFrontLog",
      "type": "S3DataNode",
      "directoryPath": "s3://path_to_cloudfront_log",
      "schedule": {
        "ref": "ScheduleId1"
      }
    },
    {
      "id": "RedshiftDatabaseId1",
      "name": "DefaultRedshiftDatabase1",
      "type": "RedshiftDatabase",
      "clusterId": "hoge",
      "databaseName": "dev",
      "username": "hogehoge",
      "*password": "fugafuga"
    },
    {
      "id": "RedshiftDataNodeCloudFrontLog",
      "name": "RedshiftDataNodeCloudFrontLog",
      "type": "RedshiftDataNode",
      "database": {
        "ref": "RedshiftDatabaseId1"
      },
      "tableName": "cloudfront_access_logs",
      "createTableSql": "CREATE TABLE cloudfront_access_logs ( request_date VARCHAR(MAX) SORTKEY, request_time VARCHAR(MAX) ENCODE LZO, x_edge_location VARCHAR(40) ENCODE LZO, sc_bytes INT ENCODE LZO, remote_ip VARCHAR(50) ENCODE LZO, cs_method VARCHAR(50) ENCODE LZO, cs_host VARCHAR(MAX) ENCODE LZO, cs_uri_stem VARCHAR(MAX) DISTKEY ENCODE LZO, sc_status VARCHAR(20) ENCODE LZO, cs_referrer VARCHAR(MAX) ENCODE LZO, cs_useragent VARCHAR(MAX) ENCODE LZO, cs_uri_query VARCHAR(MAX) ENCODE LZO, cs_cookie VARCHAR(MAX) ENCODE LZO, x_edge_result_type VARCHAR(MAX) ENCODE LZO, x_edge_request_id VARCHAR(MAX) ENCODE LZO, x_host_header VARCHAR(MAX) ENCODE LZO, cs_protocol VARCHAR(10) ENCODE LZO, cs_bytes INT ENCODE LZO, time_taken VARCHAR(MAX) ENCODE LZO);",
      "schedule": {
        "ref": "ScheduleId1"
      }
    },
    {
      "id": "RedshiftCopyActivityCloudFrontLog",
      "name": "RedshiftCopyActivityCloudFrontLog",
      "type": "RedshiftCopyActivity",
      "input": {
        "ref": "S3DataNodeCloudFrontLog"
      },
      "commandOptions": [
        "DELIMITER '\t'",
        "IGNOREHEADER 2",
        "TRUNCATECOLUMNS",
        "TRIMBLANKS",
        "ACCEPTINVCHARS",
        "MAXERROR as 100000",
        "gzip"
      ],
      "insertMode": "TRUNCATE",
      "runsOn": {
        "ref": "Ec2ResourceId1"
      },
      "schedule": {
        "ref": "ScheduleId1"
      },
      "output": {
        "ref": "RedshiftDataNodeCloudFrontLog"
      }
    },
    {
      "id": "Ec2ResourceId1",
      "name": "DefaultEc2Resource1",
      "type": "Ec2Resource",
      "terminateAfter": "1 HOURS",
      "schedule": {
        "ref": "ScheduleId1"
      },
      "securityGroups": "default",
      "logUri": "s3://path_to_log",
      "role": "DataPipelineDefaultRole",
      "resourceRole": "DataPipelineDefaultResourceRole"
    }
  ]
}
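
The definition above is in the console/CLI-style "objects" format, so it can be pushed as-is with the AWS CLI (aws datapipeline put-pipeline-definition --pipeline-definition file://...). If you prefer to register and activate it from code, here is a minimal boto3 sketch, not part of the original gist: the file name cloudfront_to_redshift.json, the pipeline name, and the unique id are placeholders, and the small converter simply maps each object's keys to stringValue/refValue fields as the low-level API expects.

# Sketch: register and activate the pipeline definition above with boto3.
# Assumes the JSON shown above is saved as cloudfront_to_redshift.json (placeholder name).
import json

import boto3


def to_api_objects(definition):
    """Convert the console/CLI-style definition into the low-level
    pipelineObjects format expected by the Data Pipeline API."""
    api_objects = []
    for obj in definition["objects"]:
        fields = []
        for key, value in obj.items():
            if key in ("id", "name"):
                continue
            if isinstance(value, dict) and "ref" in value:
                # {"ref": "..."} becomes a refValue field
                fields.append({"key": key, "refValue": value["ref"]})
            elif isinstance(value, list):
                # multi-valued fields (e.g. commandOptions) repeat the key
                fields.extend({"key": key, "stringValue": str(v)} for v in value)
            else:
                fields.append({"key": key, "stringValue": str(value)})
        api_objects.append({"id": obj["id"], "name": obj["name"], "fields": fields})
    return api_objects


client = boto3.client("datapipeline")

with open("cloudfront_to_redshift.json") as f:  # the JSON shown above
    definition = json.load(f)

# Create an empty pipeline, push the definition, then activate it.
pipeline_id = client.create_pipeline(
    name="cloudfront-to-redshift",
    uniqueId="cloudfront-to-redshift-v1",
)["pipelineId"]

result = client.put_pipeline_definition(
    pipelineId=pipeline_id,
    pipelineObjects=to_api_objects(definition),
)
if result.get("errored"):
    raise RuntimeError(result.get("validationErrors"))

client.activate_pipeline(pipelineId=pipeline_id)
print("Activated pipeline:", pipeline_id)

Once activated, the schedule object runs the RedshiftCopyActivity every 15 minutes, spinning up the EC2 resource to COPY the gzipped, tab-delimited CloudFront logs from S3 into the cloudfront_access_logs table.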