Skip to content

Instantly share code, notes, and snippets.

@ibuenros
Last active August 30, 2016 16:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ibuenros/3cb4c9293edc7f43ab41c0d0d59cb586 to your computer and use it in GitHub Desktop.
Save ibuenros/3cb4c9293edc7f43ab41c0d0d59cb586 to your computer and use it in GitHub Desktop.
# Job metadata
# Identifies this job: its name, the group it belongs to, and a human-readable description.
job.name=PullFromWikipediaToKafka
job.group=Wikipedia
job.description=Pull from Wikipedia and write to Kafka
# Schedule
# NOTE(review): six-field expression ending in '?' looks like Quartz cron syntax
# (seconds minutes hours day-of-month month day-of-week). Read that way it fires at
# second 0 of every 2nd minute, starting at minute 0 -- confirm against the scheduler in use.
job.schedule=0 0/2 * * * ?
# Source configuration
# The source class and the namespace under which its extracts are registered.
extract.namespace=gobblin.example.wikipedia
source.class=gobblin.example.wikipedia.WikipediaSource
# Wikipedia source configuration
# Comma-separated list of page titles to pull (here: "LinkedIn" and "Wikipedia:Sandbox").
source.page.titles=LinkedIn,Wikipedia:Sandbox
# NOTE(review): presumably an upper bound on revisions fetched per page -- confirm in WikipediaSource.
gobblin.wikipediaSource.maxRevisionsPerPage=20
# Root URL of the MediaWiki API endpoint (English Wikipedia).
wikipedia.api.rooturl=https://en.wikipedia.org/w/api.php
# Avro record schema "WikipediaArticle" (namespace example.wikipedia.avro).
# Every field is a nullable union: pageid, userid and size are ["double", "null"]; title, user,
# anon, timestamp, contentformat and contentmodel are ["string", "null"].
# Note: this schema has no "content" field, whereas the commented-out
# schemaRegistry.schema.value further down in this file declares one.
wikipedia.avro.schema={"namespace": "example.wikipedia.avro","type": "record","name": "WikipediaArticle","fields": [{"name": "pageid", "type": ["double", "null"]},{"name": "title", "type": ["string", "null"]},{"name": "user", "type": ["string", "null"]},{"name": "anon", "type": ["string", "null"]},{"name": "userid", "type": ["double", "null"]},{"name": "timestamp", "type": ["string", "null"]},{"name": "size", "type": ["double", "null"]},{"name": "contentformat", "type": ["string", "null"]},{"name": "contentmodel", "type": ["string", "null"]}]}
# NOTE(review): value looks like an ISO-8601 period (P10D = 10 days); presumably how far back
# the first run looks for revisions -- confirm in WikipediaSource.
wikipedia.source.bootstrap.lookback=P10D
# Converter configuration
# NOTE(review): judging by its name, this converter turns JSON records into strings,
# which pairs with the StringSerializer configured for the Kafka producer below -- confirm.
converter.classes=gobblin.converter.json.JsonToStringConverter
# Writer configuration
# Records are written to the Kafka topic named below via the Kafka data writer.
writer.builder.class=gobblin.kafka.writer.KafkaDataWriterBuilder
writer.kafka.topic=WikipediaExample
# NOTE(review): keys under writer.kafka.producerConfig. appear to be handed to the Kafka producer
# with that prefix stripped (bootstrap.servers, value.serializer, ...) -- confirm in the writer.
# The broker address targets a local broker on port 9092; change it for a real cluster.
writer.kafka.producerConfig.bootstrap.servers=localhost:9092
writer.kafka.producerConfig.value.serializer=org.apache.kafka.common.serialization.StringSerializer
writer.output.format=TEXT
# Publisher Configuration
# NOTE(review): "Noop" suggests no publish step runs after writing -- confirm.
data.publisher.type=gobblin.publisher.NoopPublisher
# Optional alternative 1: use Confluent Schema Registry and serializers.
# Uncommenting the lines below sets value.serializer a second time; the active
# StringSerializer setting earlier in this file should then be removed or commented out.
# NOTE(review): an Avro serializer presumably expects Avro records rather than the strings
# produced by JsonToStringConverter -- confirm whether converter.classes and
# writer.output.format must change as well.
# writer.kafka.producerConfig.value.serializer=io.confluent.kafka.serializers.KafkaAvroSerializer
# writer.kafka.producerConfig.key.serializer=io.confluent.kafka.serializers.KafkaAvroSerializer
# writer.kafka.producerConfig.schema.registry.url=http://localhost:8081
# Optional alternative 2: use the local (config-driven) schema registry and serializers.
# This also sets value.serializer, so enable at most one of the two alternatives.
# Note: the schema value below declares a "content" field that the active
# wikipedia.avro.schema setting in this file does not.
#writer.kafka.producerConfig.value.serializer=gobblin.kafka.serialize.LiAvroSerializer
#writer.kafka.producerConfig.kafka.schemaRegistry.class=gobblin.kafka.schemareg.ConfigDrivenMd5SchemaRegistry
#writer.kafka.producerConfig.schemaRegistry.schema.name=WikipediaExample
#writer.kafka.producerConfig.schemaRegistry.schema.value={"namespace": "example.wikipedia.avro","type": "record","name": "WikipediaArticle","fields": [{"name": "pageid", "type": ["double", "null"]},{"name": "title", "type": ["string", "null"]},{"name": "user", "type": ["string", "null"]},{"name": "anon", "type": ["string", "null"]},{"name": "userid", "type": ["double", "null"]},{"name": "timestamp", "type": ["string", "null"]},{"name": "size", "type": ["double", "null"]},{"name": "contentformat", "type": ["string", "null"]},{"name": "contentmodel", "type": ["string", "null"]},{"name": "content", "type": ["string", "null"]}]}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment