emes emesday

## StringJoinBenchmark.scala
package w20160827

case class Log(s: String, l: Long, f: String, t: String, p: String) {

  def usingFixedSize: String = {
    // 23 = Long.MaxValue.toString.length + ("\t" * 4).length
    val size = 23 + s.length + f.length + t.length + p.length
    val sb = new StringBuilder(size, s)
    sb append "\t"; sb append l
    sb append "\t"; sb append f

## topByKey.scala
  def topByKey(key: String, orderBy: String, n: Int): DataFrame = {
    val keyIndex = df.schema.fieldIndex(key)
    val orderByIndex = df.schema.fieldIndex(orderBy)
    val ord = df.schema.fields(orderByIndex).dataType match {
      case o: StringType => Ordering.by[Row, String](_.getString(orderByIndex))
      case o: IntegerType => Ordering.by[Row, Int](_.getInt(orderByIndex))
      case o: LongType => Ordering.by[Row, Long](_.getLong(orderByIndex))
      case o: FloatType => Ordering.by[Row, Float](_.getFloat(orderByIndex))
      case o: DoubleType => Ordering.by[Row, Double](_.getDouble(orderByIndex))
      case _ => throw new IllegalArgumentException

## type.scala
  /**
    * Wraps a reflected `Type` in a `TypeTag[T]` bound to `currentMirror`.
    *
    * The type creator ignores the mirror it is handed and simply casts `tpe`, so nothing
    * checks that `T` actually corresponds to `tpe`; choosing a matching `T` is up to the caller.
    *
    * @param tpe the type the resulting tag should expose
    * @tparam T the static type the resulting tag claims to describe
    * @return a tag whose creator always yields `tpe`
    */
  protected def typeToTypeTag[T](tpe: Type): TypeTag[T] = {
    val fixedTypeCreator = new TypeCreator {
      override def apply[U <: Universe with Singleton](m: api.Mirror[U]): U#Type =
        tpe.asInstanceOf[U#Type]
    }
    TypeTag[T](currentMirror, fixedTypeCreator)
  }

  /**
    * Derives a `ClassTag[T]` for a reflected `Type`: builds a `TypeTag[T]` with
    * `typeToTypeTag` and hands it to `typeTagToClassTag`.
    *
    * @param tpe the type to derive the class tag from
    * @tparam T the static type the resulting tag claims to describe
    */
  protected def typeToClassTag[T](tpe: Type): ClassTag[T] = {
    val tag: TypeTag[T] = typeToTypeTag[T](tpe)
    typeTagToClassTag(tag)
  }

## MapAccumulator.scala
import org.apache.spark.util.AccumulatorV2

class MapAccumulator[K] extends AccumulatorV2[Map[K, Long], Map[K, Long]] {
  var underlying = new scala.collection.mutable.HashMap[K, Long]

  override def isZero: Boolean = underlying.isEmpty

  override def copy(): MapAccumulator[K] = {
    val newAcc = new MapAccumulator[K]
    newAcc.underlying = this.underlying.clone()

## json-array-to-json-object-per-line.sh
# The Namuwiki dump (as of namuwiki_170327) is stored as a single JSON array, which
# cannot be processed in a distributed fashion.
# For distributed processing it has to be stored as one JSON object per line.
# input: namuwiki_170327.json
# output: output.json

input_file=namuwiki_170327.json
output_file=output.json

# --stream parses the array incrementally; truncate_stream at depth 1 strips the outer
# array level and fromstream rebuilds each element; compact output prints one per line.
jq --null-input --compact-output --stream \
  'fromstream(1|truncate_stream(inputs))' \
  "$input_file" > "$output_file"


## read_htk_format.py
import numpy as np

# Read a feature file: a 12-byte big-endian header followed by float32 data.
# NOTE(review): the header field meanings below assume the HTK parameter-file
# layout (nSamples, sampPeriod, sampSize, parmKind) — confirm against the HTK book.
with open('path/to/htk.feat', 'rb') as f:
    # Two big-endian int32 values (presumably nSamples and sampPeriod); unused below.
    a = np.fromfile(f, dtype='>i4', count=2)
    # Two big-endian int16 values (presumably sampSize in bytes and parmKind).
    b = np.fromfile(f, dtype='>i2', count=2)
    # Values per frame: the first int16 divided by 4, the byte width of one float32.
    fdim = int(b[0] / 4)
    # Everything after the header is read as big-endian float32, one row per frame.
    raw = np.fromfile(f, dtype='>f4')
    fea = raw.reshape(-1, fdim)
    nframes = len(fea)

## loc_to_csv.py
# http://kssc.kostat.go.kr/ksscNew_web/kssc/common/CommonBoardList.do?gubun=1&strCategoryNameCode=019&strBbsId=kascrr&categoryMenu=014

import pandas as pd

# Open the administrative-district classification workbook (an .xls file).
# NOTE(review): `input` shadows the Python builtin of the same name.
input = pd.ExcelFile("한국행정구역분류_2018.7.1.기준_최종.xls")
# Positional arguments: sheet at index 4, header taken from row index 1
# (assumes ExcelFile.parse(sheet_name, header, ...) — confirm for the installed pandas).
sheet = input.parse(4, 1)
# NOTE(review): the CSV is written here, before "label" and "text" are added below,
# so loc.csv does not contain those two columns as of this point.
sheet.to_csv("loc.csv")

# "label": the "시도" and "시군구" columns joined with an underscore.
sheet["label"] = sheet["시도"] + "_" + sheet["시군구"]
# "text": the five name columns joined with single spaces.
sheet["text"] = sheet["시도"] + " " + sheet["시군구"] + " " + sheet["행정구역명"] + " " + sheet["행정동"] + " " + sheet["법정동"]

## autograd.py
import math


class op:
    """Operation type whose two hooks do nothing and return ``None``.

    NOTE(review): presumably meant as a base class whose subclasses override
    ``eval`` and ``grad`` — confirm against the rest of the file.
    """

    def eval(self, values):
        """Placeholder evaluation hook; ignores ``values`` and returns ``None``."""
        return None

    def grad(self, values, over):
        """Placeholder gradient hook; ignores its arguments and returns ``None``."""
        return None

## Reservoir.scala
import scala.reflect.ClassTag
import scala.util.Random

class Reservoir[T: ClassTag](
  private val size: Int,
  private val seed: Long = Random.nextLong()) extends Serializable {

  private val rand = new Random(seed)
  private val reservoir = new Array[T](size)
  private var count = 0L

## TopByKeyAggregatorProxy.scala
object TopByKeyAggregatorProxy {
  import scala.reflect.runtime.universe._
  /**
    * Works on rows of the form (K1, K2, V) where K1 & K2 are IDs and V is the score value. Finds
    * the top `num` K2 items based on the given Ordering.
    */
  def asTypedColumn[K1: TypeTag, K2: TypeTag, V: TypeTag]
  (num: Int, ord: Ordering[(K2, V)]): TypedColumn[(K1, K2, V), Array[(K2, V)]] = {
    Class.forName("org.apache.spark.ml.recommendation.TopByKeyAggregator")
      .getConstructors
	package w20160827

	case class Log(s: String, l: Long, f: String, t: String, p: String) {

	def usingFixedSize: String = {
	// 23 = Long.MaxValue.toString.length + ("\t" * 4).length
	val size = 23 + s.length + f.length + t.length + p.length
	val sb = new StringBuilder(size, s)
	sb append "\t"; sb append l
	sb append "\t"; sb append f
	def topByKey(key: String, orderBy: String, n: Int): DataFrame = {
	val keyIndex = df.schema.fieldIndex(key)
	val orderByIndex = df.schema.fieldIndex(orderBy)
	val ord = df.schema.fields(orderByIndex).dataType match {
	case o: StringType => Ordering.by[Row, String](_.getString(orderByIndex))
	case o: IntegerType => Ordering.by[Row, Int](_.getInt(orderByIndex))
	case o: LongType => Ordering.by[Row, Long](_.getLong(orderByIndex))
	case o: FloatType => Ordering.by[Row, Float](_.getFloat(orderByIndex))
	case o: DoubleType => Ordering.by[Row, Double](_.getDouble(orderByIndex))
	case _ => throw new IllegalArgumentException
	/**
	  * Wraps a reflected `Type` in a `TypeTag[T]` bound to `currentMirror`.
	  *
	  * The type creator ignores the mirror it is handed and simply casts `tpe`, so nothing
	  * checks that `T` actually corresponds to `tpe`; choosing a matching `T` is up to the caller.
	  *
	  * @param tpe the type the resulting tag should expose
	  * @tparam T the static type the resulting tag claims to describe
	  * @return a tag whose creator always yields `tpe`
	  */
	protected def typeToTypeTag[T](tpe: Type): TypeTag[T] = {
	  val fixedTypeCreator = new TypeCreator {
	    override def apply[U <: Universe with Singleton](m: api.Mirror[U]): U#Type =
	      tpe.asInstanceOf[U#Type]
	  }
	  TypeTag[T](currentMirror, fixedTypeCreator)
	}

	/**
	  * Derives a `ClassTag[T]` for a reflected `Type`: builds a `TypeTag[T]` with
	  * `typeToTypeTag` and hands it to `typeTagToClassTag`.
	  *
	  * @param tpe the type to derive the class tag from
	  * @tparam T the static type the resulting tag claims to describe
	  */
	protected def typeToClassTag[T](tpe: Type): ClassTag[T] = {
	  val tag: TypeTag[T] = typeToTypeTag[T](tpe)
	  typeTagToClassTag(tag)
	}
	import org.apache.spark.util.AccumulatorV2

	class MapAccumulator[K] extends AccumulatorV2[Map[K, Long], Map[K, Long]] {
	var underlying = new scala.collection.mutable.HashMap[K, Long]

	override def isZero: Boolean = underlying.isEmpty

	override def copy(): MapAccumulator[K] = {
	val newAcc = new MapAccumulator[K]
	newAcc.underlying = this.underlying.clone()
	# The Namuwiki dump (as of namuwiki_170327) is stored as a single JSON array, which
	# cannot be processed in a distributed fashion.
	# For distributed processing it has to be stored as one JSON object per line.
	# input: namuwiki_170327.json
	# output: output.json

	# The filter must use a plain pipe: inside single quotes a backslash before `|`
	# reaches jq literally and is rejected as a syntax error.
	# --stream parses the array incrementally; truncate_stream at depth 1 strips the outer
	# array level and fromstream rebuilds each element; -c prints one object per line.
	jq -nc --stream 'fromstream(1|truncate_stream(inputs))' namuwiki_170327.json > output.json
	import numpy as np

	# Read a feature file: a 12-byte big-endian header followed by float32 data.
	# NOTE(review): the header field meanings below assume the HTK parameter-file
	# layout (nSamples, sampPeriod, sampSize, parmKind) — confirm against the HTK book.
	with open('path/to/htk.feat', 'rb') as f:
	    # The statements below form the body of the `with` block and must be indented
	    # under it so the file is still open while it is read.
	    # Two big-endian int32 values (presumably nSamples and sampPeriod); unused below.
	    a = np.fromfile(f, dtype='>i4', count = 2)
	    # Two big-endian int16 values (presumably sampSize in bytes and parmKind).
	    b = np.fromfile(f, dtype='>i2', count = 2)
	    # Values per frame: the first int16 divided by 4, the byte width of one float32.
	    fdim = int(b[0] / 4)
	    # Everything after the header is read as big-endian float32, one row per frame.
	    fea = np.fromfile(f, dtype='>f4').reshape(-1, fdim)
	    nframes = fea.shape[0]
	# http://kssc.kostat.go.kr/ksscNew_web/kssc/common/CommonBoardList.do?gubun=1&strCategoryNameCode=019&strBbsId=kascrr&categoryMenu=014

	import pandas as pd

	# Open the administrative-district classification workbook (an .xls file).
	# NOTE(review): `input` shadows the Python builtin of the same name.
	input = pd.ExcelFile("한국행정구역분류_2018.7.1.기준_최종.xls")
	# Positional arguments: sheet at index 4, header taken from row index 1
	# (assumes ExcelFile.parse(sheet_name, header, ...) — confirm for the installed pandas).
	sheet = input.parse(4, 1)
	# NOTE(review): the CSV is written here, before "label" and "text" are added below,
	# so loc.csv does not contain those two columns as of this point.
	sheet.to_csv("loc.csv")

	# "label": the "시도" and "시군구" columns joined with an underscore.
	sheet["label"] = sheet["시도"] + "_" + sheet["시군구"]
	# "text": the five name columns joined with single spaces.
	sheet["text"] = sheet["시도"] + " " + sheet["시군구"] + " " + sheet["행정구역명"] + " " + sheet["행정동"] + " " + sheet["법정동"]
	import math


	class op:
	    """Operation type whose two hooks do nothing and return ``None``.

	    NOTE(review): presumably meant as a base class whose subclasses override
	    ``eval`` and ``grad`` — confirm against the rest of the file.
	    """

	    def eval(self, values):
	        """Placeholder evaluation hook; ignores ``values`` and returns ``None``."""
	        pass

	    def grad(self, values, over):
	        """Placeholder gradient hook; ignores its arguments and returns ``None``."""
	        pass
	import scala.reflect.ClassTag
	import scala.util.Random

	class Reservoir[T: ClassTag](
	private val size: Int,
	private val seed: Long = Random.nextLong()) extends Serializable {

	private val rand = new Random(seed)
	private val reservoir = new Array[T](size)
	private var count = 0L
	object TopByKeyAggregatorProxy {
	import scala.reflect.runtime.universe._
	/**
	* Works on rows of the form (K1, K2, V) where K1 & K2 are IDs and V is the score value. Finds
	* the top `num` K2 items based on the given Ordering.
	*/
	def asTypedColumn[K1: TypeTag, K2: TypeTag, V: TypeTag]
	(num: Int, ord: Ordering[(K2, V)]): TypedColumn[(K1, K2, V), Array[(K2, V)]] = {
	Class.forName("org.apache.spark.ml.recommendation.TopByKeyAggregator")
	.getConstructors