emesday / StringJoinBenchmark.scala
Last active August 28, 2016 02:59
package w20160827
case class Log(s: String, l: Long, f: String, t: String, p: String) {
def usingFixedSize: String = {
// 23 = Long.MaxValue.toString.length + ("\t" * 4).length
val size = 23 + s.length + f.length + t.length + p.length
val sb = new StringBuilder(size, s)
sb append "\t"; sb append l
sb append "\t"; sb append f
def topByKey(key: String, orderBy: String, n: Int): DataFrame = {
val keyIndex = df.schema.fieldIndex(key)
val orderByIndex = df.schema.fieldIndex(orderBy)
val ord = df.schema.fields(orderByIndex).dataType match {
case o: StringType =>[Row, String](_.getString(orderByIndex))
case o: IntegerType =>[Row, Int](_.getInt(orderByIndex))
case o: LongType =>[Row, Long](_.getLong(orderByIndex))
case o: FloatType =>[Row, Float](_.getFloat(orderByIndex))
case o: DoubleType =>[Row, Double](_.getDouble(orderByIndex))
case _ => throw new IllegalArgumentException
protected def typeToTypeTag[T](tpe: Type): TypeTag[T] = {
TypeTag(currentMirror, new TypeCreator {
override def apply[U <: Universe with Singleton](m: api.Mirror[U]): U#Type = {
/** Produces a ClassTag for `tpe` by first obtaining its TypeTag through
  * `typeToTypeTag` and then handing that tag to `typeTagToClassTag`.
  *
  * @param tpe the reflected type to convert
  * @tparam T the static type the resulting tag is declared for
  * @return the ClassTag returned by `typeTagToClassTag`
  */
protected def typeToClassTag[T](tpe: Type): ClassTag[T] = {
  val typeTag = typeToTypeTag[T](tpe)
  typeTagToClassTag(typeTag)
}
emesday / MapAccumulator.scala
Last active March 29, 2017 17:31
MapAccumulator for Spark 2.+
import org.apache.spark.util.AccumulatorV2
class MapAccumulator[K] extends AccumulatorV2[Map[K, Long], Map[K, Long]] {
var underlying = new scala.collection.mutable.HashMap[K, Long]
override def isZero: Boolean = underlying.isEmpty
override def copy(): MapAccumulator[K] = {
val newAcc = new MapAccumulator[K]
newAcc.underlying = this.underlying.clone()
emesday /
Created August 15, 2017 08:51
나무위키:데이터베이스 덤프 변환 - one json object per line
# As of the namuwiki_170327 release, the Namuwiki dump is stored as a single JSON array,
# so it cannot be processed in a distributed fashion.
# For distributed processing it needs to be stored as one JSON object per line.
# input: namuwiki_170327.json
# output: output.json
# How the command works:
#   -n        do not read the input implicitly; the filter pulls it through `inputs`
#   --stream  parse the file incrementally as path/value events instead of loading it whole
#   fromstream(1|truncate_stream(inputs))  drop the outermost (array) path level and
#             rebuild each top-level element as its own value
#   -c        print each rebuilt value compactly on a single line
jq -nc --stream 'fromstream(1|truncate_stream(inputs))' namuwiki_170327.json > output.json
emesday /
Created December 27, 2017 08:46
read htk format
import numpy as np
# Read an HTK-format feature file: two big-endian int32 header fields, two
# big-endian int16 header fields, then the feature data as big-endian float32.
with open('path/to/htk.feat', 'rb') as f:
    # First two header fields, each a big-endian 32-bit integer.
    # NOTE(review): presumably nSamples and sampPeriod per the HTK layout -- confirm.
    a = np.fromfile(f, dtype='>i4', count=2)
    # Next two header fields, each a big-endian 16-bit integer.
    # NOTE(review): presumably sampSize (bytes per frame) and parmKind -- confirm.
    b = np.fromfile(f, dtype='>i2', count=2)
    # Values are read below as 4-byte floats, so b[0] / 4 is the number of
    # values per frame.
    fdim = int(b[0] / 4)
    # Everything after the header: float32 values reshaped to one row per frame.
    fea = np.fromfile(f, dtype='>f4').reshape(-1, fdim)
# Number of frames actually present in the file.
nframes = fea.shape[0]
emesday /
Last active August 6, 2018 07:37
통계청 지역 정보
import pandas as pd
# Load the administrative-district classification workbook and derive two
# string columns ("label" and "text") from its region-name columns.
input = pd.ExcelFile("한국행정구역분류_2018.7.1.기준_최종.xls")

# Parse the sheet at index 4, using the row at index 1 as the header row.
sheet = input.parse(sheet_name=4, header=1)

# label: the "시도" and "시군구" columns joined with an underscore.
sheet["label"] = sheet["시도"] + "_" + sheet["시군구"]

# text: the five region-name columns joined left to right with single spaces.
text = sheet["시도"]
for column in ("시군구", "행정구역명", "행정동", "법정동"):
    text = text + " " + sheet[column]
sheet["text"] = text
emesday /
Last active January 9, 2019 04:55
import math
class op:
def eval(self, values):
def grad(self, values, over):
emesday / Reservoir.scala
Last active August 11, 2020 17:15
Reservoir Sampling for Scala Spark
import scala.reflect.ClassTag
import scala.util.Random
class Reservoir[T: ClassTag](
private val size: Int,
private val seed: Long = Random.nextLong()) extends Serializable {
private val rand = new Random(seed)
private val reservoir = new Array[T](size)
private var count = 0L
emesday / TopByKeyAggregatorProxy.scala
Last active April 19, 2019 00:43
object TopByKeyAggregatorProxy {
import scala.reflect.runtime.universe._
* Works on rows of the form (K1, K2, V) where K1 & K2 are IDs and V is the score value. Finds
* the top `num` K2 items based on the given Ordering.
def asTypedColumn[K1: TypeTag, K2: TypeTag, V: TypeTag]
(num: Int, ord: Ordering[(K2, V)]): TypedColumn[(K1, K2, V), Array[(K2, V)]] = {