Skip to content

Instantly share code, notes, and snippets.

View skp33's full-sized avatar

Kaushal Prajapati skp33

View GitHub Profile
implicit class DataFrameExtended(df: DataFrame) {
import df.sqlContext.implicits._
/**
 * Builds a single predicate Column that is true when ANY of `cols` is NULL.
 *
 * @param cols columns whose null-ness is OR-ed together
 * @return a Column predicate; `lit(false)` for an empty `cols`
 *
 * NOTE: the original used bare `reduce`, which throws
 * UnsupportedOperationException on an empty Seq; `reduceOption` with a
 * fully-qualified `lit(false)` fallback keeps the signature and non-empty
 * behavior unchanged while making empty input well-defined.
 */
def anyNull(cols: Seq[Column]): Column =
  cols.map(_.isNull).reduceOption(_ || _).getOrElse(org.apache.spark.sql.functions.lit(false))
/**
* LEFT JOIN should not join anything when join-key contains a NULL (but usually this
* would result in shuffling NULL keyed items into single or few reducers).
* This can be easily fixed by adding an additional temporary join condition that:
* - is a random seed when any of the keys is null, thus addressing the NULL skew
@skp33
skp33 / SparkUtils.scala
Created November 20, 2018 12:23 — forked from ibuenros/SparkUtils.scala
Spark productionizing utilities developed by Ooyala, shown in Spark Summit 2014
//==================================================================
// SPARK INSTRUMENTATION
//==================================================================
import com.codahale.metrics.{MetricRegistry, Meter, Gauge}
import org.apache.spark.{SparkEnv, Accumulator}
import org.apache.spark.metrics.source.Source
import org.joda.time.DateTime
import scala.collection.mutable
@skp33
skp33 / PointType.scala
Created August 30, 2018 20:24 — forked from sadikovi/PointType.scala
Spark UDT and UDAF with custom buffer type
package org.apache.spark
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.types._
@SQLUserDefinedType(udt = classOf[PointType])
case class Point(mac: String, start: Long, end: Long) {
// Hash derived from `mac` and `start` only — `end` is deliberately excluded
// (presumably so points differing only in `end` collide; confirm with equals).
// Same 31-based accumulation as before, just written with a named step.
override def hashCode(): Int = {
  val macPart = 31 * mac.hashCode
  31 * macPart + start.hashCode
}
@skp33
skp33 / SudokuSolver.scala
Created July 27, 2018 17:19 — forked from pathikrit/SudokuSolver.scala
Sudoku Solver in Scala
// Board edge length: an n x n grid (classic 9x9 sudoku).
val n = 9
// Sub-square edge length: sqrt(n). Use the idiomatic scala.math package
// instead of java.lang.Math.
val s = math.sqrt(n).toInt
// A board is an immutable n x n grid of Ints; 0 denotes an empty cell.
type Board = IndexedSeq[IndexedSeq[Int]]
def solve(board: Board, cell: Int = 0): Option[Board] = (cell%n, cell/n) match {
case (r, `n`) => Some(board)
case (r, c) if board(r)(c) > 0 => solve(board, cell + 1)
case (r, c) =>
def cells(i: Int) = Seq(board(r)(i), board(i)(c), board(s*(r/s) + i/s)(s*(c/s) + i%s))
def guess(x: Int) = solve(board.updated(r, board(r).updated(c, x)), cell + 1)
@skp33
skp33 / Deserialization.scala
Created April 6, 2018 18:22 — forked from ramn/Deserialization.scala
Object serialization example in Scala
import java.io._
/**
 * Serialization example: a plain (non-case) class made Serializable, with a
 * pinned serialVersionUID so the wire format survives recompilation.
 * `name` and `age` are constructor parameters captured for toString.
 */
@SerialVersionUID(15L)
class Animal(name: String, age: Int) extends Serializable {
  // Renders exactly as the original interpolated version: "Animal($name, $age)".
  override def toString: String = "Animal(" + name + ", " + age + ")"
}
// Minimal case class used in the serialization demo; case classes are
// Serializable out of the box, unlike the hand-rolled Animal above.
case class Person(name: String)
// or fork := true in sbt
@skp33
skp33 / CogroupDf.scala
Created March 29, 2018 18:57 — forked from ahoy-jon/CogroupDf.scala
DataFrame.cogroup is the new HList.flatMap (UNFORTUNATELY, THIS IS VERY SLOW)
package org.apache.spark.sql.utils
import org.apache.spark.Partitioner
import org.apache.spark.rdd.{CoGroupedRDD, RDD}
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, ScalaReflection}
import org.apache.spark.sql.execution.LogicalRDD
import org.apache.spark.sql.types.{ArrayType, StructField, StructType}
import org.apache.spark.sql.{SQLContext, DataFrame, Row}
import scala.reflect.ClassTag
import scala.reflect.runtime.universe.TypeTag
@skp33
skp33 / reactive_map.js
Created March 17, 2018 15:19 — forked from granturing/reactive_map.js
Sample reactive Leaflet code for Zeppelin
<!-- place this in an %angular paragraph -->
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/leaflet/0.7.5/leaflet.css" />
<div id="map" style="height: 800px; width: 100%"></div>
<script type="text/javascript">
function initMap() {
var map = L.map('map').setView([30.00, -30.00], 3);
L.tileLayer('http://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png', {
@skp33
skp33 / BasicAuthenticationFilter.java
Created December 8, 2015 12:48 — forked from neolitec/BasicAuthenticationFilter.java
HTTP Basic authentication Java filter
package com.neolitec.examples;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.servlet.*;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
@skp33
skp33 / MovingAvgSpark.scala
Last active August 29, 2015 14:27 — forked from samklr/MovingAvgSpark.scala
Moving Average on stock prices in Spark with custom partitioner
// 101 integer "prices" (0..100) spread across 10 partitions.
// `sc` is the SparkContext supplied by the surrounding shell/session.
val ts = sc.parallelize(0 to 100, 10)
// Moving-average window size (number of consecutive keys averaged together).
val window = 3
/**
 * Range partitioner for the integer keys 0..100 produced above: key k maps to
 * partition k * p / 101, so consecutive keys land on the same or adjacent
 * partitions and the moving-average window overlap stays local.
 *
 * NOTE(review): the scraped original computed `key * p/0.5`, a Double — it
 * cannot satisfy Partitioner's Int return contract and would place every
 * key outside [0, p). Restored integer range partitioning; assumes keys are
 * in 0..100 (see `ts` above) — confirm against the driver code.
 */
class StraightPartitioner(p: Int) extends Partitioner {
  def numPartitions: Int = p
  // Clamp defensively so an out-of-range key never yields an invalid partition.
  def getPartition(key: Int): Int = math.min(p - 1, math.max(0, key * p / 101))
}
val partitioned = ts.mapPartitionsWithIndex((i, p) => {
val overlap = p.take(window - 1).toArray