Skip to content

Instantly share code, notes, and snippets.

@ankurdave
ankurdave / test-py.html
Last active June 16, 2017 07:14
color-identifiers-mode #29
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
<!-- Created by htmlize-1.51 in inline-css mode. -->
<html>
<head>
<title>test.py</title>
</head>
<body style="color: #DCDCCC; background-color: #3F3F3F;">
<pre>
<span style="color: #F0DFAF; font-weight: bold;">def</span> <span style="color: #93E0E3;">test1</span>(<span style="color: #fff099;">a</span>, <span style="color: #c2b6e1;">b</span>):
<span style="color: #F0DFAF; font-weight: bold;">return</span> <span style="color: #fff099;">a</span> + <span style="color: #c2b6e1;">b</span>
@ankurdave
ankurdave / juju-demo.sh
Created June 17, 2016 18:27
RISE systems seminar: Juju demo
brew install juju
juju generate-config
export AWS_ACCESS_KEY_ID=...
export AWS_SECRET_ACCESS_KEY=...
juju bootstrap
juju status
juju deploy juju-gui --to 0
juju expose juju-gui
# Wait until `juju status` shows it has started
@ankurdave
ankurdave / spark-nested-groupBy.scala
Created April 26, 2016 06:09
Work around the 2 billion element limit for Spark's groupByKey using nested groupBys
// Skewed sample data: 1000 pairs under key 1 (values 0 to 999) plus two pairs under key 2.
// `sc` is not defined in this snippet -- presumably the SparkContext provided by spark-shell.
val rdd = sc.parallelize((0 until 1000).map(x => (1, x)) ++ List((2,1), (2,2)))
// rdd: org.apache.spark.rdd.RDD[(Int, Int)]
rdd.collect
// res1: Array[(Int, Int)] = Array((1,0), (1,1), (1,2), (1,3), (1,4), (1,5), (1,6), (1,7), (1,8), (1,9), (1,10), (1,11), (1,12), (1,13), (1,14), (1,15), (1,16), (1,17), (1,18), (1,19), (1,20), (1,21), (1,22), (1,23), (1,24), (1,25), (1,26), (1,27), (1,28), (1,29), (1,30), (1,31), (1,32), (1,33), (1,34), (1,35), (1,36), (1,37), (1,38), (1,39), (1,40), (1,41), (1,42), (1,43), (1,44), (1,45), (1,46), (1,47), (1,48), (1,49), (1,50), (1,51), (1,52), (1,53), (1,54), (1,55), (1,56), (1,57), (1,58), (1,59), (1,60), (1,61), (1,62), (1,63), (1,64), (1,65), (1,66), (1,67), (1,68), (1,69), (1,70), (1,71), (1,72), (1,73), (1,74), (1,75), (1,76), (1,77), (1,78), (1,79), (1,80), (1,81), (1,82), (1,83), (1,84), (1,85), (1,86), (1,87), (1,88), (1,89), (1,90), (1,91), (1,92), (1,93), (1,94), (1,95), (1,96),...
val nestedGroups = rdd.groupBy(kv => (kv._1, kv._2 % 10)).groupBy(_._1._1).map(_._2
@ankurdave
ankurdave / graphx-bfs.scala
Created February 12, 2016 23:06
BFS in GraphX
import org.apache.spark.graphx._
/**
* Returns the shortest directed-edge path from src to dst in the graph. If no path exists, returns
* the empty list.
*/
def bfs[VD, ED](graph: Graph[VD, ED], src: VertexId, dst: VertexId): Seq[VertexId] = {
if (src == dst) return List(src)
// The attribute of each vertex is (dist from src, id of vertex with dist-1)
@ankurdave
ankurdave / A.java
Created October 10, 2015 03:00
Check that Scala autoboxes primitives before passing them to Java generic classes. Run with `sbt run`
/**
 * Minimal generic holder used to observe the runtime class of a value handed to a
 * Java generic type.
 *
 * @param <T> static type of the held value
 */
public class A<T> {
    /** The held value, stored exactly as it was passed to the constructor. */
    public T t;

    /**
     * Stores the given value without copying or validating it.
     *
     * @param t the value to hold; may be {@code null}
     */
    public A(T t) {
        this.t = t;
    }

    /**
     * Prints the simple name of the held value's runtime class, a space, and the value
     * itself to standard output.
     *
     * A {@code null} value has no runtime class, so the class name is printed as
     * {@code "null"} instead of throwing a {@link NullPointerException}.
     */
    public void print() {
        String typeName = (t == null) ? "null" : t.getClass().getSimpleName();
        System.out.println(typeName + " " + t);
    }
}
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.DataFrame
val conf = new SparkConf()
val sc = new SparkContext("local", "test")
val sqlContext = new SQLContext(sc)
val v = sqlContext.createDataFrame(List(
@ankurdave
ankurdave / build.sbt
Created February 14, 2015 03:51
SBT project configuration to build against Spark and GraphX
name := "my-project"
version := "0.1-SNAPSHOT"
organization := "com.example"
scalaVersion := "2.10.4"
libraryDependencies += "org.apache.spark" %% "spark-core" % "1.2.1"
@ankurdave
ankurdave / subgraphWithNeighbors.scala
Created February 3, 2015 00:51
Find a subgraph containing only vertices in a specified set, plus their neighbors
import org.apache.spark.rdd.RDD
import org.apache.spark.graphx._
import scala.reflect.ClassTag
/** Returns the subgraph of `graph` containing only `vertices` and their neighbors. */
def subgraphWithNeighbors[VD, ED: ClassTag, A: ClassTag](
graph: Graph[VD, ED], vertices: RDD[(VertexId, A)]): Graph[VD, ED] = {
// Label each vertex in graph with true if it is a member of `vertices` and false if not
val labeledGraph = graph.outerJoinVertices(vertices) {
(id, oldAttr, isSampled) => isSampled.nonEmpty
@ankurdave
ankurdave / fringe-set.scala
Created December 5, 2014 10:22
Find the fringe set for each vertex using GraphX
// Depends on AllPairsShortestPaths: https://github.com/apache/spark/pull/3619
import org.apache.spark.graphx._
import org.apache.spark.graphx.lib._
// Build a directed chain 0 -> 1 -> ... -> 10 (ten edges), each edge carrying the attribute 1.
// `sc` is not defined in this snippet -- presumably the SparkContext provided by spark-shell.
val edges = sc.parallelize((0 until 10).map(x => Edge(x, x + 1, 1)))
// Vertices are derived from the edge endpoints; each gets the default attribute 1.
val graph = Graph.fromEdges(edges, 1)
// NOTE(review): the result type of AllPairsShortestPaths.run comes from the PR linked above and
// is not visible here; the code below treats it as a pair RDD whose values have a `_2` field.
// Cached presumably because it is reused later in the script -- confirm.
val dists = AllPairsShortestPaths.run(graph).cache()
// For each key, keep the largest `_2` component among that key's values.
val maxDists = dists.mapValues(_._2).reduceByKey((a, b) => if (a > b) a else b)
@ankurdave
ankurdave / gist:cb89391101e4e87497ae
Last active August 29, 2015 14:10
Interface between GraphX and SampleClean
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.expressions.Row
import scala.reflect.ClassTag
/**
 * Builds a graph from an RDD of vertices and an RDD of edge endpoints.
 *
 * @param vertices (vertex id, row) pairs; each row becomes that vertex's attribute
 * @param edges (source id, destination id) pairs, one per directed edge
 * @return a graph over `vertices` whose edges carry no attribute (the unit value)
 */
def a(vertices: RDD[(Long, Row)], edges: RDD[(Long, Long)]): Graph[Row, Unit] =
  // Use the unit value `()` as the edge attribute. The identifier `Unit` names the companion
  // object, which only type-checks here because the compiler discards it to produce `()`.
  Graph(vertices, edges.map { case (srcId, dstId) => Edge(srcId, dstId, ()) })
// Run connected components on the graph