Giri R Varatharajan (vgiri2015) · GitHub Gists
SELECT a.id AS id, COUNT(*) AS key FROM table a GROUP BY a.id ORDER BY key DESC;
import org.apache.spark.sql.SparkSession

/**
 * Created by vgiridatabricks on 3/24/18.
 */
object SparkXMLBlob {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("Spark Blob Data in XML")
      .getOrCreate()
  }
}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

/**
 * Created by vgiridatabricks on 2/1/17.
 */
object SparkMultiThreading {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("Spark Multi-Threading")
      .getOrCreate()
  }
}
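The gist body is cut off here. One plausible shape for the technique the name suggests is submitting several Spark jobs concurrently from one SparkSession via Scala Futures; the job bodies below are illustrative, not recovered from the gist.

import scala.concurrent.{Await, Future}
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
import org.apache.spark.sql.SparkSession

object SparkMultiThreadingSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("Spark Multi-Threading").getOrCreate()
    // Each Future submits an independent action; Spark's scheduler
    // can run the resulting jobs concurrently on the shared session.
    val jobs = (1 to 3).map { i =>
      Future(spark.range(0, 1000000L * i).count())
    }
    Await.result(Future.sequence(jobs), 10.minutes).foreach(println)
    spark.stop()
  }
}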
# Create a method to handle the non-ASCII to ASCII conversion
def nonasciitoascii(unicodestring):
    return unicodestring.encode("ascii", "ignore")

# Create a sample DataFrame
from pyspark.sql.window import Window
from pyspark.sql.functions import count, col
from pyspark.sql import Row

d = [Row(coltype='regular', value="Happy Coding"),
     Row(coltype='non ascii', value="hello aåbäcö")]
# `sc` is the SparkContext provided by the shell/notebook
df = sc.parallelize(d).toDF()
df.show()
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Created by vgiridatabricks on 8/19/16.
 */
object WholeStageCodeGenExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("WholeStageCodeGenExample").getOrCreate()
  }
}
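The rest of this gist is cut off. A minimal sketch of the kind of query such an example typically inspects (the query itself is illustrative, not recovered from the gist):

import org.apache.spark.sql.SparkSession

object WholeStageCodeGenSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("WholeStageCodeGenExample").getOrCreate()
    // Operators prefixed with "*" in the physical plan are fused
    // into a single generated function by whole-stage codegen.
    spark.range(1000L * 1000).selectExpr("sum(id)").explain()
    spark.stop()
  }
}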
vgiri2015 / Spark2.0FileCompression.scala
Created August 19, 2016 03:30
File Compression in Spark 2.0
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Created by vgiridatabricks on 8/13/16.
 */
object FileCompression {

  case class DataFrameSample(name: String, actor: String, episodeDebut: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("File Compression in Spark 2.0").getOrCreate()
  }
}
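The preview ends at the case class. A minimal sketch of per-write compression codecs in Spark 2.0, using the same case class; the sample row and output paths are hypothetical.

import org.apache.spark.sql.SparkSession

object FileCompressionSketch {
  case class DataFrameSample(name: String, actor: String, episodeDebut: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("File Compression in Spark 2.0").getOrCreate()
    import spark.implicits._
    val df = Seq(DataFrameSample("Doctor Who", "Matt Smith", "2010")).toDF()
    // The codec is chosen per write via the "compression" option.
    df.write.option("compression", "gzip").csv("/tmp/episodes_csv_gzip")       // hypothetical path
    df.write.option("compression", "snappy").parquet("/tmp/episodes_parquet")  // hypothetical path
    spark.stop()
  }
}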
vgiri2015 / spark_basic_build.sbt
Created August 9, 2016 06:04
spark_final_build_sbt
name := "Spark2.0-and-greater"

version := "1.0"

// Spark 2.0.0 is built against the Scala 2.11 series
scalaVersion := "2.11.8"

val overrideScalaVersion = "2.11.8"
val sparkVersion = "2.0.0"
val sparkXMLVersion = "0.3.3"
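The preview stops at the version definitions. A plausible continuation wires them into libraryDependencies; the exact module list is an assumption based on the APIs used across these gists.

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"      % sparkVersion,
  "org.apache.spark" %% "spark-sql"       % sparkVersion,
  "org.apache.spark" %% "spark-streaming" % sparkVersion,
  "com.databricks"   %% "spark-xml"       % sparkXMLVersion
)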
df = sc.parallelize([(1, 'Y', 'F', "Giri", 'Y'),
                     (2, 'N', 'V', "Databricks", 'N'),
                     (3, 'Y', 'B', "SparkEdge", 'Y'),
                     (4, 'N', 'X', "Spark", 'N')]).toDF(["id", "flag1", "flag2", "name", "flag3"])
print 'Show Dataframe'
df.show()
print 'Actual Schema of the df'
df.printSchema()
# df.dtypes yields (column name, column type) pairs
for a_dftype in df.dtypes:
    col_name = a_dftype[0]
    col_type = a_dftype[1]
    print col_name, col_type
package rnd

import kafka.serializer.StringDecoder
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Minutes, Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object KafkaSparkStreamingToES {
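The preview is truncated at the object declaration. A minimal sketch of the pipeline the imports and name imply: read a Kafka topic with the direct stream API and index into Elasticsearch via elasticsearch-spark's saveToEs. The broker, topic, endpoint, and index names are hypothetical.

import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark.rdd.EsSpark

object KafkaSparkStreamingToESSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("KafkaSparkStreamingToES")
      .set("es.nodes", "localhost:9200")                                 // hypothetical ES endpoint
    val ssc = new StreamingContext(new SparkContext(conf), Seconds(10))
    val kafkaParams = Map("metadata.broker.list" -> "localhost:9092")    // hypothetical broker
    val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, Set("events"))                                   // hypothetical topic
    // Index each message as a one-field document.
    stream.map { case (_, value) => Map("message" -> value) }
      .foreachRDD(rdd => EsSpark.saveToEs(rdd, "events/log"))            // hypothetical index/type
    ssc.start()
    ssc.awaitTermination()
  }
}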
package rnd

import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Created by vgiridatabricks on 5/26/16.
 */
object NetcatSparkStreamingToESIndex {
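This final preview is also cut off. A minimal sketch of a netcat-fed stream indexed into Elasticsearch, assuming `nc -lk 9999` is running on localhost; the ES endpoint and index name are hypothetical.

import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark.rdd.EsSpark

object NetcatSparkStreamingToESIndexSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("NetcatSparkStreamingToESIndex")
      .set("es.nodes", "localhost:9200")                          // hypothetical ES endpoint
    val ssc = new StreamingContext(new SparkContext(conf), Seconds(5))
    // Lines typed into `nc -lk 9999` arrive as a DStream[String].
    val lines = ssc.socketTextStream("localhost", 9999)
    lines.map(line => Map("line" -> line))
      .foreachRDD(rdd => EsSpark.saveToEs(rdd, "netcat/lines"))   // hypothetical index/type
    ssc.start()
    ssc.awaitTermination()
  }
}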