@tilakpatidar
Created March 24, 2018 07:09
Apache Gobblin job to pull CSVs from S3 storage and write them out as AVRO
# ====================================================================
# PullCsvFromS3
# Pull CSV data from an S3 directory to the local filesystem
# ====================================================================
job.name=PullCsvFromS3
job.description=Pull CSV data from an S3 directory to the local filesystem and write it as AVRO files
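# One way to run this job (a sketch assuming a standard Apache Gobblin
# distribution; script names and paths vary between versions) is to place
# this file in the job configuration directory and start Gobblin in
# standalone mode, e.g.:
#   bin/gobblin-standalone.sh start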
fs.uri=file:///
# Set working directory
work.dir=/Users/tilak/gobblin/mopar-demo
writer.staging.dir=${work.dir}/taskStaging
writer.output.dir=${work.dir}/taskOutput
mr.job.root.dir=${work.dir}/working
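# Tasks first write into writer.staging.dir, completed task output is moved
# to writer.output.dir, and the publisher later moves it to the final
# directory configured below; mr.job.root.dir is the job's working directory.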
# Set state store
state.store.enabled=true
state.store.type=mysql
state.store.db.jdbc.driver=com.mysql.jdbc.Driver
state.store.db.url=jdbc:mysql://localhost/mopar_demo
state.store.db.user=gobblin
state.store.db.password=gobblin
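# The MySQL database and user referenced above must exist before the first
# run; Gobblin creates its own state-store tables on first use (behaviour may
# vary by version). A minimal setup sketch, with the database/user names
# taken from this config (adjust as needed):
#   CREATE DATABASE mopar_demo;
#   CREATE USER 'gobblin'@'localhost' IDENTIFIED BY 'gobblin';
#   GRANT ALL PRIVILEGES ON mopar_demo.* TO 'gobblin'@'localhost';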
# Set writer and publisher
writer.fs.uri=file:///
data.publisher.final.dir=${work.dir}/output
writer.destination.type=HDFS
writer.output.format=AVRO
writer.builder.class=org.apache.gobblin.writer.AvroDataWriterBuilder
data.publisher.fs.uri=${fs.uri}
data.publisher.type=org.apache.gobblin.publisher.BaseDataPublisher
data.publisher.metadata.output.dir=${work.dir}/metadata_out
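# The AvroDataWriterBuilder writes AVRO files per work unit into the task
# output directory; BaseDataPublisher then publishes them under
# data.publisher.final.dir and writes task metadata to
# data.publisher.metadata.output.dir.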
# Source Configuration
source.class=org.apache.gobblin.data.management.copy.CopySource
gobblin.dataset.profile.class=org.apache.gobblin.data.management.copy.CopyableGlobDatasetFinder
gobblin.dataset.pattern=pricing.products_*.csv
# To copy from a particular directory, use e.g. gobblin.dataset.pattern=some_folder/*.csv
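# Example: with the pattern above, a file such as pricing.products_2018_03_24.csv
# (a hypothetical name) at the bucket root is picked up, while files in
# sub-directories or with other prefixes are ignored.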
gobblin.copy.recursive.update=true
# Source S3 Configuration
source.filebased.fs.uri=s3a://<bucket-name>
source.filebased.preserve.file.name=true
source.filebased.encrypted.fs.s3a.access.key=<s3-access-key>
source.filebased.encrypted.fs.s3a.secret.key=<s3-secret-key>
fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem
fs.s3a.buffer.dir=${work.dir}/buffer-dir
fs.s3a.connection.ssl.enabled=false
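# Replace <bucket-name>, <s3-access-key> and <s3-secret-key> with your own
# values. The "encrypted." part of the key names tells Gobblin the values are
# stored encrypted; depending on your Gobblin version this requires a master
# password/key location to be configured (see the Gobblin docs for your
# version), otherwise plain-text values can be used by dropping the
# "encrypted." part of the property names.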
# Converters
source.schema={"namespace":"pricing", "type":"record", "name":"Products", "fields":[ { "name": "id", "type":"string" }, { "name": "description", "type":"string" }, { "name": "additional_description", "type":"string" }]}
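# The same Avro schema, formatted for readability:
# {
#   "namespace": "pricing",
#   "type": "record",
#   "name": "Products",
#   "fields": [
#     { "name": "id",                     "type": "string" },
#     { "name": "description",            "type": "string" },
#     { "name": "additional_description", "type": "string" }
#   ]
# }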
csv.has.headers=true
converter.classes=org.apache.gobblin.data.management.copy.converter.ReadBatchedCSVConverter
# ====================================================================
# Distcp configurations (do not change)
# ====================================================================
job.class=org.apache.gobblin.azkaban.AzkabanJobLauncher
extract.namespace=org.apache.gobblin.copy
distcp.persist.dir=/tmp/distcp-persist-dir
task.maxretries=0
workunit.retry.enabled=false
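# Retries are disabled (task.maxretries=0, workunit.retry.enabled=false), so a
# failed copy fails the job immediately instead of being retried.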
# Job History server
job.history.store.enabled=true
job.history.store.url=jdbc:mysql://localhost/mopar_demo
job.history.store.jdbc.driver=com.mysql.jdbc.Driver
job.history.store.user=gobblin
job.history.store.password=gobblin
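# The job history store reuses the same MySQL database and credentials as the
# state store, so no extra database setup is needed beyond the grant above.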
# Other s3a settings
# Must be greater than 5 MB, otherwise distcp won't work
fs.s3a.multipart.size=67108864
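# 67108864 bytes = 64 * 1024 * 1024 = 64 MB, comfortably above the 5 MB minimum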