Skip to content

Instantly share code, notes, and snippets.

@hadoopsters
hadoopsters / CassandraLoader.scala
Last active Jan 19, 2020
Loading Data from Cassandra into Hadoop
View CassandraLoader.scala
import com.datastax.spark.connector.cql.CassandraConnectorConf
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.cassandra._
object CassandraLoader extends Serializable {
/** Representative of the some_keyspace.some_table table. */
case class MyCassandraTable(user_id: String, `type`: Int, key: String, value: String)
def main(args: Array[String]) { // scalastyle:off method.length
View ExampleStreamingListener.scala
package hadoopsters.spark.scala.monitoring.listeners
import org.apache.spark.streaming.kafka010.OffsetRange
import org.apache.spark.streaming.scheduler._
import org.joda.time.DateTime
/**
* :: ExampleStreamingListener ::
* A simple StreamingListener that accesses summary statistics across Spark Streaming batches; inherits from DeveloperAPI.
*
View InfluxDBWriter.scala
package tv.spotx.scala.dbutils
import org.apache.logging.log4j.scala.Logging
import scalaj.http.{Http, HttpOptions, HttpResponse}
case class InfluxConfig(hostname: String = "console",
port: Int = 8086, // scalastyle:off magic.number
database: String = "devtest",
ssl: Boolean = false,
username: Option[String] = None,
View MySQLConnection.scala
package tv.spotx.scala.dbutils
import java.sql.{Connection, DriverManager}
import java.util.Properties
import org.apache.commons.pool2.impl.{DefaultPooledObject, GenericObjectPool}
import org.apache.commons.pool2.{BasePooledObjectFactory, PooledObject}
import org.apache.logging.log4j.scala.Logging
View SpotXSparkStreamingListener.scala
package tv.spotx.scala.monitoring.listeners
import org.apache.spark.streaming.kafka010.OffsetRange
import org.apache.spark.streaming.scheduler._
import org.joda.time.DateTime
import tv.spotx.scala.dbutils.{ConnectionPool, InfluxDBWriter, MySQLConnection}
/**
* :: SpotXSparkStreamingListener ::
* A simple StreamingListener that logs summary statistics across Spark Streaming batches; inherits from DeveloperAPI.
View random_proportion_control.hql
-- Random sample with a cheap pre-filter: rand() in the WHERE clause keeps
-- roughly 0.01% of rows before the shuffle, then the survivors are spread
-- randomly across reducers (DISTRIBUTE BY) and randomized within each
-- reducer (SORT BY) before the 10000-row cap is applied.
SELECT *
FROM my_table
WHERE rand() <= 0.0001
DISTRIBUTE BY rand()
SORT BY rand()
LIMIT 10000;
View random_distribution.hql
-- Scalable random sample: DISTRIBUTE BY rand() scatters rows randomly
-- across reducers, SORT BY rand() randomizes order within each reducer
-- (per-reducer only, not a total order), then LIMIT caps the output.
SELECT *
FROM my_table
DISTRIBUTE BY rand()
SORT BY rand()
LIMIT 10000;
View sort_by_rand.hql
-- Randomize order within each reducer only — SORT BY in Hive is a
-- per-reducer sort, not a global one — then keep the first 10000 rows.
SELECT *
FROM my_table
SORT BY rand()
LIMIT 10000;
View order_by_rand.hql
-- Fully random sample: ORDER BY imposes a total order, which in Hive
-- funnels all rows through a single reducer — correct but slow on big tables.
SELECT *
FROM my_table
ORDER BY rand()
LIMIT 10000;
View limiting.hql
-- Take the first 10000 rows the engine happens to read — fast, but the
-- result is NOT a random sample.
SELECT *
FROM my_table
LIMIT 10000;
You can’t perform that action at this time.