hadoopsters / CassandraLoader.scala
Last active Jun 27, 2019
Loading Data from Cassandra into Hadoop
View CassandraLoader.scala
import com.datastax.spark.connector.cql.CassandraConnectorConf
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.cassandra._

object CassandraLoader extends Serializable {

  /** Representative of the some_keyspace.some_table table. */
  case class MyCassandraTable(user_id: String, `type`: Int, key: String, value: String)

  def main(args: Array[String]) { // scalastyle:off method.length
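The preview cuts off at the start of main. As a rough illustration of the gist's stated purpose (loading data from Cassandra into Hadoop), here is a minimal sketch using the DataStax Spark Cassandra connector's DataFrame API; the keyspace, table, and output path are placeholders, not values taken from the gist.

import org.apache.spark.sql.{SaveMode, SparkSession}

object CassandraToHadoopSketch {
  def main(args: Array[String]): Unit = {
    // Assumes spark.cassandra.connection.host is already set on the cluster or via --conf.
    val spark = SparkSession.builder()
      .appName("cassandra-to-hadoop")
      .getOrCreate()

    // Read the Cassandra table as a DataFrame through the Spark Cassandra connector.
    val df = spark.read
      .format("org.apache.spark.sql.cassandra")
      .options(Map("keyspace" -> "some_keyspace", "table" -> "some_table"))
      .load()

    // Persist to HDFS as Parquet (placeholder path).
    df.write
      .mode(SaveMode.Overwrite)
      .parquet("hdfs:///data/some_keyspace/some_table")

    spark.stop()
  }
}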
View ExampleStreamingListener.scala
package hadoopsters.spark.scala.monitoring.listeners

import org.apache.spark.streaming.kafka010.OffsetRange
import org.apache.spark.streaming.scheduler._
import org.joda.time.DateTime

/**
 * :: ExampleStreamingListener ::
 * A simple StreamingListener that accesses summary statistics across Spark Streaming batches; inherits from DeveloperAPI.
 *
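The preview stops inside the class comment. For context, a minimal sketch of what a StreamingListener that reads per-batch summary statistics can look like; the class name is hypothetical, and the real gist presumably does more (its imports suggest it also tracks Kafka OffsetRanges).

import org.apache.spark.streaming.scheduler._

// Hypothetical listener; the gist's actual class body is not shown in the preview.
class BatchStatsListener extends StreamingListener {

  // Called once per completed micro-batch; BatchInfo carries the summary statistics.
  override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = {
    val info = batchCompleted.batchInfo
    val records = info.numRecords
    val processingMs = info.processingDelay.getOrElse(-1L)
    val schedulingMs = info.schedulingDelay.getOrElse(-1L)
    val totalMs = info.totalDelay.getOrElse(-1L)
    println(s"batch=${info.batchTime} records=$records processing=${processingMs}ms " +
      s"scheduling=${schedulingMs}ms total=${totalMs}ms")
  }
}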
View InfluxDBWriter.scala
package tv.spotx.scala.dbutils

import org.apache.logging.log4j.scala.Logging
import scalaj.http.{Http, HttpOptions, HttpResponse}

case class InfluxConfig(hostname: String = "console",
                        port: Int = 8086, // scalastyle:off magic.number
                        database: String = "devtest",
                        ssl: Boolean = false,
                        username: Option[String] = None,
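The case class preview ends mid-parameter-list. As a sketch of how a writer built on scalaj-http can push a line-protocol point to an InfluxDB 1.x /write endpoint; the object name, URL assembly, and timeout are assumptions, not the gist's code.

import scalaj.http.{Http, HttpOptions, HttpResponse}

// Hypothetical helper sketching an InfluxDB 1.x line-protocol write over HTTP.
object InfluxWriteSketch {
  def write(host: String, port: Int, database: String, payload: String): HttpResponse[String] = {
    // e.g. payload = "batch_stats,app=my_app records=1000i"
    Http(s"http://$host:$port/write?db=$database")
      .postData(payload)
      .header("Content-Type", "text/plain")
      .option(HttpOptions.readTimeout(10000))
      .asString
  }
}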
View MySQLConnection.scala
package tv.spotx.scala.dbutils

import java.sql.{Connection, DriverManager}
import java.util.Properties

import org.apache.commons.pool2.impl.{DefaultPooledObject, GenericObjectPool}
import org.apache.commons.pool2.{BasePooledObjectFactory, PooledObject}
import org.apache.logging.log4j.scala.Logging
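Only the imports are shown above. A minimal sketch of the pattern those imports suggest: a commons-pool2 factory that pools JDBC connections. The class name, URL, and credentials are placeholders.

import java.sql.{Connection, DriverManager}
import java.util.Properties

import org.apache.commons.pool2.impl.{DefaultPooledObject, GenericObjectPool}
import org.apache.commons.pool2.{BasePooledObjectFactory, PooledObject}

// Hypothetical factory; connection details are placeholders.
class ConnectionFactory(url: String, user: String, password: String)
  extends BasePooledObjectFactory[Connection] {

  // Open a new physical connection when the pool needs one.
  override def create(): Connection = {
    val props = new Properties()
    props.setProperty("user", user)
    props.setProperty("password", password)
    DriverManager.getConnection(url, props)
  }

  // Wrap the connection so the pool can track its state.
  override def wrap(conn: Connection): PooledObject[Connection] = new DefaultPooledObject(conn)

  // Close the underlying connection when the pool evicts it.
  override def destroyObject(p: PooledObject[Connection]): Unit = p.getObject.close()
}

// Usage: borrow a connection from the pool, use it, and always return it.
// val pool = new GenericObjectPool[Connection](new ConnectionFactory("jdbc:mysql://host:3306/db", "user", "pass"))
// val conn = pool.borrowObject()
// try { /* run statements */ } finally { pool.returnObject(conn) }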
View SpotXSparkStreamingListener.scala
package tv.spotx.scala.monitoring.listeners

import org.apache.spark.streaming.kafka010.OffsetRange
import org.apache.spark.streaming.scheduler._
import org.joda.time.DateTime
import tv.spotx.scala.dbutils.{ConnectionPool, InfluxDBWriter, MySQLConnection}

/**
 * :: SpotXSparkStreamingListener ::
 * A simple StreamingListener that logs summary statistics across Spark Streaming batches; inherits from DeveloperAPI.
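Whatever the listener records, it only takes effect if it is registered on the StreamingContext before the job starts. A sketch of that wiring, reusing the hypothetical BatchStatsListener from the earlier sketch, with the stream construction elided.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Hypothetical wiring; shows how a custom StreamingListener is attached to a streaming job.
object ListenerWiringSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("streaming-with-listener")
    val ssc = new StreamingContext(conf, Seconds(30))

    // Register the listener before starting the context so every batch is reported.
    ssc.addStreamingListener(new BatchStatsListener())

    // ... build the Kafka direct stream and transformations here ...

    ssc.start()
    ssc.awaitTermination()
  }
}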
View random_proportion_control.hql
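-- WHERE rand() <= 0.0001 pre-samples roughly 0.01% of rows before the random shuffle,
-- keeping the amount of data distributed and sorted small.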
select * from my_table
where rand() <= 0.0001
distribute by rand()
sort by rand()
limit 10000;
View random_distribution.hql
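-- DISTRIBUTE BY rand() randomizes which reducer each row lands on,
-- and SORT BY rand() randomizes the order within each reducer.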
select * from my_table
distribute by rand()
sort by rand()
limit 10000;
View sort_by_rand.hql
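-- SORT BY rand() only randomizes order within each reducer;
-- rows are still routed to reducers deterministically.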
select * from my_table
sort by rand()
limit 10000;
View order_by_rand.hql
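-- ORDER BY rand() gives a truly random sample but forces a total ordering
-- through a single reducer, which is slow on large tables.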
select * from my_table
order by rand()
limit 10000;
View limiting.hql
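-- LIMIT alone is fast but not random: it simply returns the first rows produced.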
select * from my_table
limit 10000;