Skip to content

Instantly share code, notes, and snippets.

name animal age color
fido 1 4 1
annabelle 2 15 2
fred 3 29 1
julie 4 1 1
gus 5 1 4
daisy 6 2 5
name animal age color
fido dog 4 brown
annabelle cat 15 white
fred bear 29 brown
julie parrot 1 brown
gus fish 1 gold
daisy iguana 2 green
@hadoopsters
hadoopsters / CassandraLoader.scala
Last active January 19, 2020 19:16
Loading Data from Cassandra into Hadoop
import com.datastax.spark.connector.cql.CassandraConnectorConf
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.cassandra._
object CassandraLoader extends Serializable {
/** Row model mirroring the `some_keyspace.some_table` Cassandra table.
  *
  * Field names are kept in snake_case to line up with the Cassandra column
  * names so the Spark Cassandra connector can map rows to this class.
  * NOTE(review): column meanings are inferred from their names — confirm
  * against the actual table DDL.
  *
  * `type` is backtick-escaped because `type` is a Scala reserved word.
  * Marked `final`: case classes should not be extended.
  */
final case class MyCassandraTable(user_id: String, `type`: Int, key: String, value: String)
def main(args: Array[String]) { // scalastyle:off method.length
package hadoopsters.spark.scala.monitoring.listeners
import org.apache.spark.streaming.kafka010.OffsetRange
import org.apache.spark.streaming.scheduler._
import org.joda.time.DateTime
/**
* :: ExampleStreamingListener ::
* A simple StreamingListener that accesses summary statistics across Spark Streaming batches; inherits from DeveloperAPI.
*
package tv.spotx.scala.dbutils
import org.apache.logging.log4j.scala.Logging
import scalaj.http.{Http, HttpOptions, HttpResponse}
case class InfluxConfig(hostname: String = "console",
port: Int = 8086, // scalastyle:off magic.number
database: String = "devtest",
ssl: Boolean = false,
username: Option[String] = None,
package tv.spotx.scala.dbutils
import java.sql.{Connection, DriverManager}
import java.util.Properties
import org.apache.commons.pool2.impl.{DefaultPooledObject, GenericObjectPool}
import org.apache.commons.pool2.{BasePooledObjectFactory, PooledObject}
import org.apache.logging.log4j.scala.Logging
package tv.spotx.scala.monitoring.listeners
import org.apache.spark.streaming.kafka010.OffsetRange
import org.apache.spark.streaming.scheduler._
import org.joda.time.DateTime
import tv.spotx.scala.dbutils.{ConnectionPool, InfluxDBWriter, MySQLConnection}
/**
* :: SpotXSparkStreamingListener ::
* A simple StreamingListener that logs summary statistics across Spark Streaming batches; inherits from DeveloperAPI.
-- Random ~10k-row sample: pre-filter to roughly 0.01% of rows with rand(),
-- redistribute the survivors randomly across reducers, randomly order rows
-- within each reducer, then keep 10,000. The WHERE pre-filter keeps the
-- shuffle cheap on very large tables.
SELECT *
FROM my_table
WHERE rand() <= 0.0001
DISTRIBUTE BY rand()
SORT BY rand()
LIMIT 10000;
-- Random ~10k-row sample without a pre-filter: every row is shuffled to a
-- random reducer and randomly ordered there before LIMIT. More uniform than
-- the filtered variant, but it scans and shuffles the entire table.
SELECT *
FROM my_table
DISTRIBUTE BY rand()
SORT BY rand()
LIMIT 10000;
-- Cheapest sampling variant: rows are only randomly ordered within each
-- reducer (no DISTRIBUTE BY), so the sample can be biased toward however
-- the input happens to be split across reducers.
SELECT *
FROM my_table
SORT BY rand()
LIMIT 10000;