Skip to content

Instantly share code, notes, and snippets.

@emesday
emesday / StringJoinBenchmark.scala
Last active August 28, 2016 02:59
StringJoinBenchmark
package w20160827
case class Log(s: String, l: Long, f: String, t: String, p: String) {
def usingFixedSize: String = {
// 23 = Long.MaxValue.toString.length + ("\t" * 4).length
val size = 23 + s.length + f.length + t.length + p.length
val sb = new StringBuilder(size, s)
sb append "\t"; sb append l
sb append "\t"; sb append f
def topByKey(key: String, orderBy: String, n: Int): DataFrame = {
val keyIndex = df.schema.fieldIndex(key)
val orderByIndex = df.schema.fieldIndex(orderBy)
val ord = df.schema.fields(orderByIndex).dataType match {
case o: StringType => Ordering.by[Row, String](_.getString(orderByIndex))
case o: IntegerType => Ordering.by[Row, Int](_.getInt(orderByIndex))
case o: LongType => Ordering.by[Row, Long](_.getLong(orderByIndex))
case o: FloatType => Ordering.by[Row, Float](_.getFloat(orderByIndex))
case o: DoubleType => Ordering.by[Row, Double](_.getDouble(orderByIndex))
case _ => throw new IllegalArgumentException
/**
 * Wraps a runtime reflection `Type` in a `TypeTag[T]` bound to `currentMirror`.
 *
 * The creator ignores the mirror it is handed and always yields `tpe`, cast to
 * the requested universe's `Type`. Nothing verifies that `T` actually
 * corresponds to `tpe`; the caller is responsible for choosing a matching `T`.
 *
 * @param tpe the reflected type to wrap
 * @tparam T  the static type the resulting tag claims to describe
 * @return a `TypeTag[T]` whose underlying type is `tpe`
 */
protected def typeToTypeTag[T](tpe: Type): TypeTag[T] = {
  val fixedTypeCreator = new TypeCreator {
    // The mirror argument is deliberately unused: the same `tpe` is returned for any universe.
    override def apply[U <: Universe with Singleton](m: api.Mirror[U]): U#Type =
      tpe.asInstanceOf[U#Type]
  }
  TypeTag(currentMirror, fixedTypeCreator)
}
/**
 * Produces a `ClassTag[T]` for a runtime reflection `Type`.
 *
 * The type is first wrapped in a `TypeTag` via `typeToTypeTag`, and that tag is
 * then passed to `typeTagToClassTag` (defined outside this view).
 *
 * @param tpe the reflected type to convert
 * @tparam T  the static type the resulting tag claims to describe
 */
protected def typeToClassTag[T](tpe: Type): ClassTag[T] = typeTagToClassTag(typeToTypeTag(tpe))
@emesday
emesday / MapAccumulator.scala
Last active March 29, 2017 17:31
MapAccumulator for Spark 2.+
import org.apache.spark.util.AccumulatorV2
class MapAccumulator[K] extends AccumulatorV2[Map[K, Long], Map[K, Long]] {
var underlying = new scala.collection.mutable.HashMap[K, Long]
override def isZero: Boolean = underlying.isEmpty
override def copy(): MapAccumulator[K] = {
val newAcc = new MapAccumulator[K]
newAcc.underlying = this.underlying.clone()
@emesday
emesday / json-array-to-json-object-per-line.sh
Created August 15, 2017 08:51
나무위키:데이터베이스 덤프 변환 - one json object per line
# As of namuwiki_170327, the Namuwiki dump is stored as a single JSON array, so it cannot be processed in a distributed fashion.
# For distributed processing it needs to be stored as one JSON object per line.
# input: namuwiki_170327.json
# output: output.json
# --stream parses the input incrementally as path/value events instead of loading the whole array;
# 1|truncate_stream(inputs) drops the outer array index from each event path, and fromstream
# reassembles each top-level element. -n lets `inputs` consume every event; -c prints one compact value per line.
jq -nc --stream 'fromstream(1|truncate_stream(inputs))' namuwiki_170327.json > output.json
@emesday
emesday / read_htk_format.py
Created December 27, 2017 08:46
read htk format
import numpy as np
# Read an HTK-format feature file: a 12-byte big-endian header followed by
# big-endian float32 feature values.
with open('path/to/htk.feat', 'rb') as f:
    # First two header fields, read as big-endian 4-byte ints
    # (nSamples and sampPeriod per the HTK file format -- see the HTK Book).
    a = np.fromfile(f, dtype='>i4', count=2)
    # Next two header fields, read as big-endian 2-byte ints
    # (sampSize in bytes and parmKind per the HTK file format).
    b = np.fromfile(f, dtype='>i2', count=2)
    # b[0] is treated as the byte size of one frame; each value is a 4-byte
    # float, so the per-frame dimension is an exact integer division by 4.
    fdim = int(b[0]) // 4
    # The remainder of the file is the feature payload, one row per frame.
    fea = np.fromfile(f, dtype='>f4').reshape(-1, fdim)
    nframes = fea.shape[0]
@emesday
emesday / loc_to_csv.py
Last active August 6, 2018 07:37
통계청 지역 정보
# http://kssc.kostat.go.kr/ksscNew_web/kssc/common/CommonBoardList.do?gubun=1&strCategoryNameCode=019&strBbsId=kascrr&categoryMenu=014
import pandas as pd
# Open the administrative-district classification workbook (file name says "as of 2018-07-01").
# NOTE(review): `input` shadows the Python builtin of the same name for the rest of the script.
input = pd.ExcelFile("한국행정구역분류_2018.7.1.기준_최종.xls")
# Positional arguments are sheet_name=4 (the fifth sheet, zero-based) and
# header=1 (the second row supplies the column labels).
sheet = input.parse(4, 1)
# NOTE(review): the CSV is written before "label" and "text" are added below,
# so loc.csv does not contain those two columns -- confirm this is intended.
sheet.to_csv("loc.csv")
# label: the "시도" and "시군구" columns joined with an underscore.
sheet["label"] = sheet["시도"] + "_" + sheet["시군구"]
# text: the five location columns joined with single spaces.
sheet["text"] = sheet["시도"] + " " + sheet["시군구"] + " " + sheet["행정구역명"] + " " + sheet["행정동"] + " " + sheet["법정동"]
@emesday
emesday / autograd.py
Last active January 9, 2019 04:55
autograd.py
import math
class op:
    """Placeholder operation type with no-op `eval` and `grad` methods.

    NOTE(review): presumably the base class for autograd operations that
    subclasses override -- confirm against the rest of the file.
    """

    def eval(self, values):
        """Do nothing and return None.

        `values` is accepted but unused here; presumably it carries the inputs
        needed to evaluate an operation -- TODO confirm.
        """
        pass

    def grad(self, values, over):
        """Do nothing and return None.

        `values` and `over` are accepted but unused here; presumably `over`
        identifies what to differentiate with respect to -- TODO confirm.
        """
        pass
@emesday
emesday / Reservoir.scala
Last active August 11, 2020 17:15
Reservoir Sampling for Scala Spark
import scala.reflect.ClassTag
import scala.util.Random
class Reservoir[T: ClassTag](
private val size: Int,
private val seed: Long = Random.nextLong()) extends Serializable {
private val rand = new Random(seed)
private val reservoir = new Array[T](size)
private var count = 0L
@emesday
emesday / TopByKeyAggregatorProxy.scala
Last active April 19, 2019 00:43
TopByKeyAggregatorProxy.scala
object TopByKeyAggregatorProxy {
import scala.reflect.runtime.universe._
/**
* Works on rows of the form (K1, K2, V) where K1 & K2 are IDs and V is the score value. Finds
* the top `num` K2 items based on the given Ordering.
*/
def asTypedColumn[K1: TypeTag, K2: TypeTag, V: TypeTag]
(num: Int, ord: Ordering[(K2, V)]): TypedColumn[(K1, K2, V), Array[(K2, V)]] = {
Class.forName("org.apache.spark.ml.recommendation.TopByKeyAggregator")
.getConstructors