View MapRawData.scala
package com.dvidr.counts
import com.twitter.algebird.{HLL, HyperLogLogMonoid}
import org.apache.spark.rdd.RDD
case class EmailSchema(sender: String,
to: String,
cc: String,
bcc: String,
sentDate: String,
View Vagrantfile
Vagrant.configure("2") do |config|
config.vm.box = "ubuntu/xenial64"
config.vm.hostname = "spark.xenial.box"
config.vm.network :private_network, ip: "192.168.0.42"
config.vm.synced_folder "./data", "/vagrant_data"
config.vm.provider "virtualbox" do |vb|
vb.gui = false
vb.memory = 4096
View MapperLMDB.java
/**
* gradle clean
* gradle build
* <p>
* hadoop jar build/libs/mapper-lmdb-1.0-SNAPSHOT.jar com.dvidr.MapperLMDB src/main/resources/keys.txt src/main/resources/output
*/
package com.dvidr;
import org.apache.hadoop.conf.Configuration;
View PivotTable.java
/**
* gradle clean
* gradle build
*
* hadoop jar build/libs/pivot-table-1.0-SNAPSHOT.jar com.dvidr.PivotTable src/main/resources/pivotdata.txt src/main/resources/output
*
*/
package com.dvidr;
View reservoir-sampling.py
#!/usr/bin/python
import random
# Input stream: 1000 distinct integers drawn at random from [0, 100000).
shuf = random.sample(range(100000), k=1000)
# Reservoir ("bag") of k=10 elements.
# Seed it with the first 10 items of the stream.
bag = list(shuf[:10])
View time-dependent-graphs.py
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib
# -------------- Get Data/Graphs -------------
# Load g1: every line of graph-data/g1.txt is evaluated as a Python expression.
# g.readlines() is evaluated eagerly as the argument to map(), i.e. while the
# file is still open.
# NOTE(review): eval() runs whatever code the file contains. If the lines are
# plain literals, ast.literal_eval would be the safer choice — confirm the
# file format before switching.
# NOTE(review): under Python 3 map() returns a one-shot lazy iterator rather
# than a list — confirm which interpreter this script targets.
with open('graph-data/g1.txt', 'r') as g:
g1 = map(lambda x: eval(x), g.readlines())
with open('graph-data/g2.txt', 'r') as g:
View spike.py
from __future__ import print_function
"""
Using OpenDNS domain query activity, we retrieve 5 days
of queries/hour to a domain for 240+ domains (stored
in dns.json). We predict the number of queries in
the next hour using an LSTM recurrent neural network.
An ad hoc anomaly detection is outlined in the final
for loop.
View AliceInGraphXLand.scala
package com.dvidr
import org.apache.spark.graphx.{VertexRDD, Edge, Graph}
import org.apache.spark.sql._
import org.apache.spark.{SparkConf, SparkContext}
import scala.util.Random
/**
 * One chat record.
 *
 * @param id   integer identifier of the record
 * @param name name attached to the record (presumably the speaker — confirm against the loader)
 * @param talk text attached to the record (presumably the spoken line — confirm)
 */
case class Chat(
  id: Int,
  name: String,
  talk: String
)
/**
 * A chat record carrying two extra integer fields, `dst` and `replyIn`.
 *
 * @param id      integer identifier of the record
 * @param dst     presumably the id of the record/vertex this one points to — confirm against the graph builder
 * @param replyIn integer reply attribute; its unit and meaning are not visible here — TODO confirm
 * @param name    name attached to the record
 * @param talk    text attached to the record
 */
case class ChatGraph(
  id: Int,
  dst: Int,
  replyIn: Int,
  name: String,
  talk: String
)
/**
 * A chat record annotated with two integer counts, `inDeg` and `outDeg`.
 *
 * @param id     integer identifier of the record
 * @param name   name attached to the record
 * @param talk   text attached to the record
 * @param inDeg  presumably the in-degree of the record's graph vertex — confirm
 * @param outDeg presumably the out-degree of the record's graph vertex — confirm
 */
case class TopChat(
  id: Int,
  name: String,
  talk: String,
  inDeg: Int,
  outDeg: Int
)
View AliceInTwitterLand.scala
package com.dvidr
import com.twitter.algebird.{Moments, Aggregator}
import scala.util.Random
/*
Please refer to AliceInAggregatorLand first:
https://gist.github.com/johnynek/814fc1e77aad1d295bb7
This is an adaption where "Alice In Wonderland" is turned
View MyMonoid.scala
package com.dvidr.storm.bolt
import com.twitter.algebird.Monoid
/**
 * Monoid over `Map[K, V]`: combining two maps keeps every key and adds
 * together the values of keys present in both.
 *
 * `V: Numeric` is a context bound, so an implicit `Numeric[V]` is available
 * for the summation in `plus`. An upper bound `V <: Numeric` does not compile
 * (`Numeric` is a type constructor) and would not provide that instance.
 *
 * @tparam K key type of the maps
 * @tparam V value type; an implicit `Numeric[V]` must be in scope at construction
 */
class MyMonoid[K, V: Numeric] extends Monoid[Map[K, V]] {

  /** Identity element: the empty map. */
  override def zero: Map[K, V] = Map.empty[K, V]

  /**
   * Merges `x` and `y`, summing the values of keys that occur in both.
   *
   * @param x left operand
   * @param y right operand
   * @return a map containing every key of `x` and `y`; a key found in both
   *         maps carries the sum of its two values
   */
  override def plus(x: Map[K, V], y: Map[K, V]): Map[K, V] = {
    // Flatten both maps into one list of pairs so shared keys end up in the same group.
    val entries = x.toList ++ y.toList
    entries
      .groupBy { case (key, _) => key }
      .map { case (key, pairs) => key -> pairs.map { case (_, value) => value }.sum }
  }