Skip to content

Instantly share code, notes, and snippets.

@darkjh
darkjh / dict_path_access.py
Last active December 30, 2015 04:09
Access a nested dict element by its path, e.g. `outer.middle.inner`
from functools import reduce  # builtin on Python 2, must be imported on Python 3

# Nested dict to look into.
a = {'a': {'b': {'c': 1}}}
# Path to the target element, outermost key first (the dotted path `a.b.c`).
l = 'a.b.c'.split('.')
# Walk the path: start from `a` and index one level deeper for each key.
reduce(lambda d, key: d[key], l, a)
# >>> 1
@darkjh
darkjh / gist:9591855
Created March 17, 2014 00:19
Arch Linux VirtualBox kernel modules
# Load the three VirtualBox kernel modules with modprobe, each invoked through sudo.
sudo modprobe vboxnetadp
sudo modprobe vboxdrv
sudo modprobe vboxnetflt
// A parsed JSON document: field name mapped to an untyped value
// (a nested document is itself a ParsedDoc, which is what `merge` recurses on).
type ParsedDoc = Map[String, Any]
// A document paired with a UID (UID is declared elsewhere).
// NOTE(review): presumably the UID identifies the document this fragment belongs to; confirm.
type PartialDoc = (UID, ParsedDoc)
/**
* Recursively merge two parsed json documents
*/
private def merge(map1 : ParsedDoc, map2 : ParsedDoc): ParsedDoc = {
def mergeValues(o1 : Option[Any], o2 : Option[Any]) =
(o1, o2) match {
case (Some(v1 : ParsedDoc), Some(v2 : ParsedDoc)) => merge(v1, v2)
//Adapted from: https://github.com/jcrobak/avro-examples
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.avro.generic.GenericRecord
import org.apache.avro.mapred.AvroKey
import org.apache.avro.mapreduce.AvroKeyInputFormat
import org.apache.hadoop.io.NullWritable
import org.apache.commons.lang.StringEscapeUtils.escapeCsv
def copy[D <: RawData](inputPath: String, outputPath: String,
splitsNum: Int = 128, replication: Int = 2)
(implicit sc: SparkContext, ct: ClassTag[D]): Unit = {
// use reflection to
// - get the ctor
// - get the canonical name tagged in annotation
val cls = ct.runtimeClass
val ctor = (args: Array[String]) =>
cls.getConstructor(classOf[Array[String]]).newInstance(args)
val canonicalName = cls.getAnnotation(classOf[CanonicalName]).name()
import dpark
# Word count over ./bible.txt using dpark, read in 4 splits.
# Each line is stripped and split on whitespace; every word becomes a
# (word, 1) pair and the pairs are summed per word.
stripped = dpark.textFile('./bible.txt', numSplits=4).map(lambda text: text.strip())
pairs = stripped.flatMap(lambda text: text.split()).map(lambda word: (word, 1))
counts = pairs.reduceByKey(lambda left, right: left + right)
# Write the (word, count) rows using the 'excel-tab' CSV dialect with compress=True.
counts.saveAsCSVFile('/tmp/dpark_result/', dialect='excel-tab', compress=True)
from cdf.utils.kvstore import LevelDB
import struct
class LevelDBExternalSort(object):
SEP = '\0'
FMT = '>i'
def __init__(self, tmp_dir=None, **configs):
if tmp_dir is None:
import dpark
import logging
import os
import re
logging.basicConfig(level=logging.DEBUG)
def list_files(dirpath, full_path=True, regexp=None):
import java.lang.reflect.ParameterizedType
import com.google.common.reflect.TypeToken
import scala.reflect.ClassTag
import scala.reflect.runtime.universe._
class KV[I, O]