Created
July 27, 2016 16:06
-
-
Save anonymous/b3a8cbe3797bda4efda7aa2464f42124 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# Snowplow EmrEtlRunner configuration.
# Reconstructed: scrape artifacts (trailing "| |") removed, 2-space block
# indentation restored from the standard EmrEtlRunner config schema.
aws:
  # Credentials can be hardcoded or set in environment variables
  access_key_id: 'XXXXX'
  secret_access_key: 'XXXXX'
  s3:
    # NOTE(review): was "eu-west-1a" — that is an Availability Zone, not a
    # region; S3/EMR APIs require the region name.
    region: eu-west-1
    buckets:
      assets: s3://snowplow-hosted-assets # DO NOT CHANGE unless you are hosting the jarfiles etc yourself in your own bucket
      jsonpath_assets: # If you have defined your own JSON Schemas, add the s3:// path to your own JSON Path files in your own bucket here
      log: s3://snwplw-enrich-logs
      raw:
        in: # Multiple in buckets are permitted
          # NOTE(review): added the s3:// scheme — all other bucket paths and
          # the inline example use it; EmrEtlRunner expects full s3:// URIs.
          - s3://snwlpk-logs # e.g. s3://my-in-bucket
          # - ADD HERE
        processing: s3://snwplw-enrich-processing
        archive: s3://snwplw-enrich-archive # e.g. s3://my-archive-bucket/raw
      enriched:
        good: s3://snwplw-enriched-data/good # e.g. s3://my-out-bucket/enriched/good
        bad: s3://snwplw-enriched-data/bad # e.g. s3://my-out-bucket/enriched/bad
        errors: # Leave blank unless :continue_on_unexpected_error: set to true below
        archive: s3://snwplw-enriched-data/archive # Where to archive enriched events to, e.g. s3://my-archive-bucket/enriched
      shredded:
        good: s3://snwplw-shredded-data/good # e.g. s3://my-out-bucket/shredded/good
        bad: s3://snwplw-shredded-data/bad # e.g. s3://my-out-bucket/shredded/bad
        errors: # Leave blank unless :continue_on_unexpected_error: set to true below
        archive: s3://snwplw-shredded-data/archive # Where to archive shredded events to, e.g. s3://my-archive-bucket/shredded
  emr:
    ami_version: 4.5.0
    # NOTE(review): was "eu-west-1a" — must be the region, not an AZ.
    region: eu-west-1 # Always set this
    jobflow_role: EMR_EC2_DefaultRole # Created using $ aws emr create-default-roles
    service_role: EMR_DefaultRole # Created using $ aws emr create-default-roles
    placement: # Set this if not running in VPC. Leave blank otherwise
    ec2_subnet_id: subnet-4f0cc717 # Set this if running in VPC. Leave blank otherwise
    ec2_key_name: ec2-snwplw-enr
    bootstrap: [] # Set this to specify custom boostrap actions. Leave empty otherwise
    software:
      hbase: # Optional. To launch on cluster, provide version, "0.92.0", keep quotes. Leave empty otherwise.
      lingual: # Optional. To launch on cluster, provide version, "1.1", keep quotes. Leave empty otherwise.
    # Adjust your Hadoop cluster below
    jobflow:
      master_instance_type: m1.medium
      core_instance_count: 2
      core_instance_type: m1.medium
      task_instance_count: 0 # Increase to use spot instances
      task_instance_type: m1.medium
      task_instance_bid: 0.015 # In USD. Adjust bid, or leave blank for non-spot-priced (i.e. on-demand) task instances
    bootstrap_failure_tries: 3 # Number of times to attempt the job in the event of bootstrap failures
    additional_info: # Optional JSON string for selecting additional features
collectors:
  format: cloudfront # For example: 'clj-tomcat' for the Clojure Collector, 'thrift' for Thrift records, 'tsv/com.amazon.aws.cloudfront/wd_access_log' for Cloudfront access logs or 'ndjson/urbanairship.connect/v1' for UrbanAirship Connect events
enrich:
  job_name: Snowplow ETL # Give your job a name
  versions:
    hadoop_enrich: 1.7.0 # Version of the Hadoop Enrichment process
    hadoop_shred: 0.9.0 # Version of the Hadoop Shredding process
    hadoop_elasticsearch: 0.1.0 # Version of the Hadoop to Elasticsearch copying process
  continue_on_unexpected_error: false # Set to 'true' (and set :out_errors: above) if you don't want any exceptions thrown from ETL
  output_compression: NONE # Compression only supported with Redshift, set to NONE if you have Postgres targets. Allowed formats: NONE, GZIP
storage:
  download:
    folder: # Postgres-only config option. Where to store the downloaded files. Leave blank for Redshift
  targets:
    - name: "snwplw"
      type: redshift
      # NOTE(review): removed ":5439" from the endpoint — the port is given
      # separately in the "port" key below and must not be embedded in host.
      host: rs-snwplw-db.cv7vgkektjyu.eu-west-1.redshift.amazonaws.com # The endpoint as shown in the Redshift console
      database: snwplw # Name of database
      port: 5439 # Default Redshift port
      ssl_mode: disable # One of disable (default), require, verify-ca or verify-full
      table: atomic.events
      username: XXXXX
      password: XXXXX
      maxerror: 1 # Stop loading on first error, or increase to permit more load errors
      comprows: 200000 # Default for a 1 XL node cluster. Not used unless --include compupdate specified
monitoring:
  tags: {} # Name-value pairs describing this job
  logging:
    level: DEBUG # You can optionally switch to INFO for production
  snowplow:
    method: get
    app_id: snowplow # e.g. snowplow
    collector: d7y6axreyshzs.cloudfront.net # e.g. d3rkrsqld9gmqf.cloudfront.net
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment