package com.example
import akka.kafka.scaladsl.{Consumer, Producer}
import akka.kafka.{ConsumerSettings, ProducerSettings}
import akka.stream.scaladsl.{Sink, Source}
import kafka.utils.ZkUtils
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.kafka.common.serialization.{StringDeserializer, StringSerializer}
import org.scalatest.BeforeAndAfterEach
package com.example
import akka.kafka.scaladsl.{Consumer, Producer}
import akka.kafka.{ConsumerSettings, ProducerSettings}
import akka.stream.scaladsl.{Sink, Source}
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.kafka.common.serialization.{StringDeserializer, StringSerializer}
import org.scalatest.time.{Second, Seconds, Span}
// Reads the given S3 objects from Spark partitions and records how long the reads take.
def runHybrid(s3Keys: Seq[String], parallelism: Int): Stats = {
  sc.parallelize(s3Keys, parallelism)
    .mapPartitions { keys =>
      // Build the S3 client on the executor; the client is not serializable with the closure.
      val s3 = IO.getS3(s3Id, s3Key, s3Region)
      def read = keys.map(key => key -> MiLogFileParser.getBytes(s3.getObject(inputBucket, key).getObjectContent, true))
      val (contentByKey, took) = IO.profile(() => read)
      readingAcc.add(took) // accumulate the time spent reading this partition
      contentByKey
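The IO.profile helper used above is not included in this preview; judging from its use, it runs a thunk and returns the result together with the elapsed time. A minimal sketch of such a helper (hypothetical, not the gist's actual implementation; millisecond granularity assumed):

object IO {
  // Hypothetical sketch: run the thunk and return its result plus the elapsed milliseconds.
  // (The gist's IO also provides getS3; omitted here.)
  def profile[T](thunk: () => T): (T, Long) = {
    val start = System.currentTimeMillis()
    val result = thunk()
    (result, System.currentTimeMillis() - start)
  }
}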
package com.example
import akka.kafka.scaladsl.{Consumer, Producer}
import akka.kafka.{ConsumerSettings, ProducerSettings}
import akka.stream.scaladsl.{Sink, Source}
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.kafka.common.serialization.{ByteArrayDeserializer, ByteArraySerializer, StringDeserializer, StringSerializer}
import org.scalatest.time.{Second, Seconds, Span}
import scala.concurrent.duration._
trait Storage
case class ViewId(id: String, access: String = "rw", base: Boolean = false)
case class S3Storage(bucket: String, path: String) extends Storage
case class DruidStorage(coordinatorHost: String, overlordHost: String, dataSource: String) extends Storage
sealed trait Partitioning
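A small usage sketch (hypothetical, not part of the gist) showing how a Storage value can be resolved to a location string by matching on the two implementations defined above:

def location(storage: Storage): String = storage match {
  case S3Storage(bucket, path)                  => s"s3://$bucket/$path"
  case DruidStorage(coordinator, _, dataSource) => s"druid://$coordinator/$dataSource"
}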
#
# Extensions
#
druid.extensions.directory=dist/druid/extensions
druid.extensions.hadoopDependenciesDir=dist/druid/hadoop-dependencies
#
# Logging
#
package com.example
import java.util.Properties
import akka.event.Logging
import akka.kafka.scaladsl.{Consumer, Producer}
import akka.kafka.{ConsumerSettings, ProducerSettings}
import akka.stream.scaladsl.{Sink, Source}
import net.manub.embeddedkafka.{EmbeddedKafka, EmbeddedKafkaConfig}
import org.apache.kafka.clients.consumer.ConsumerConfig
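The imports above come from an embedded-kafka test whose body is cut off in this preview. Below is a minimal produce/consume round trip against EmbeddedKafka, sketched under the assumption of the akka-stream-kafka plainSink/plainSource API; the topic name, group id, ports, object name and the extra imports (Subscriptions, ActorSystem, ActorMaterializer, serializers) are assumptions, not the gist's actual test:

import akka.actor.ActorSystem
import akka.kafka.Subscriptions
import akka.stream.ActorMaterializer
import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.kafka.common.serialization.{StringDeserializer, StringSerializer}
import scala.concurrent.Await
import scala.concurrent.duration._

object EmbeddedKafkaRoundTrip extends App {
  implicit val system = ActorSystem("embedded-kafka-test")
  implicit val materializer = ActorMaterializer()
  implicit val kafkaConfig = EmbeddedKafkaConfig(kafkaPort = 6001, zooKeeperPort = 6000)

  EmbeddedKafka.start()

  val producerSettings = ProducerSettings(system, new StringSerializer, new StringSerializer)
    .withBootstrapServers("localhost:6001")
  val consumerSettings = ConsumerSettings(system, new StringDeserializer, new StringDeserializer)
    .withBootstrapServers("localhost:6001")
    .withGroupId("test-group")
    .withProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest")

  // Produce ten records, then read them back through a plain consumer source.
  val produced = Source(1 to 10)
    .map(n => new ProducerRecord[String, String]("test-topic", n.toString))
    .runWith(Producer.plainSink(producerSettings))
  Await.result(produced, 10.seconds)

  val consumed = Consumer.plainSource(consumerSettings, Subscriptions.topics("test-topic"))
    .map(_.value)
    .take(10)
    .runWith(Sink.seq)
  println(Await.result(consumed, 10.seconds))

  EmbeddedKafka.stop()
  system.terminate()
}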
{
  "task": "index_hadoop_gwiq_2016-05-02T07:27:30.883Z",
  "payload": {
    "id": "index_hadoop_gwiq_2016-05-02T07:27:30.883Z",
    "spec": {
      "dataSchema": {
        "dataSource": "gwiq",
        "parser": {
          "type": "hadoopyString",
          "parseSpec": {
# Disabled flag: --amazonec2-private-address-only
# (kept outside the command: a commented-out line inside the backslash continuation would end it early)
docker-machine create \
  --driver amazonec2 \
  --amazonec2-access-key ${AWS_ACCESS_KEY_ID} \
  --amazonec2-secret-key ${AWS_SECRET_ACCESS_KEY} \
  --amazonec2-region ${REGION} \
  --amazonec2-vpc-id vpc-fcf8e29e \
  --amazonec2-instance-type ${INSTANCE_TYPE} \
  --amazonec2-root-size ${VOLUME_SIZE} \
  --amazonec2-security-group ${SECURITY_GROUP} \
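Once the machine is created, the local shell can be pointed at it with eval $(docker-machine env <machine-name>), after which docker commands talk to the Docker daemon on the new EC2 instance.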
val exclusions = List( // removed cid c-geo:c3
  "timestamp","d_memberID","gwid","diid","c-geo:st","c-geo:ac","c-geo:c2","c-geo:cn","c-geo:ct","c-geo:dc","c-geo:la","c-geo:lo","c-geo:pc","c-geo:rc","c-geo:sp",
  "ua","ua-cp","ua-os","accept","accept-charset","accept-encoding","accept-language","connection","from","x-wap-profile","x-att-deviceid","via","x-forwarded-for","x-uidh","forwarded","referrer","c-ip","ce","count","ctr","logic","max","miid","msg","npid","cs-uri-stem"
)
val segmentGrn = Granularity.DAY
private def hadoopTask(interval: String) =
  IndexTask(
    IndexTask.hadoopType,
    IngestionSpec(