emesday / using-pyspark-and-scala-spark-together.md
Created July 20, 2020 06:11
Using PySpark and Scala Spark Together
  1. Prepare the Scala code and package it as a JAR (possibly an assembly/fat JAR).
  2. Launch PySpark with --jars the-above.jar.
  3. Implement the PySpark code and call the Scala code via spark._jvm.your.code or a UDF.
  4. Exchange datasets via createOrReplaceTempView (see the sketch after this list).
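
A minimal sketch of steps 2-4 from the PySpark side. The your.code.Main object and its process method are hypothetical names standing in for the packaged Scala code; only spark._jvm, --jars, and createOrReplaceTempView are the actual mechanisms named above.

# step 2: launch with the packaged Scala code on the classpath
#   pyspark --jars the-above.jar

# step 4: hand a dataset to the Scala side through a shared temp view
spark.range(10).createOrReplaceTempView("shared_input")

# step 3: call the Scala entry point through the Py4J gateway
# (the Scala side must obtain the same SparkSession to see the view)
spark._jvm.your.code.Main.process("shared_input", "shared_output")  # hypothetical

# step 4: read the Scala result back in PySpark
spark.table("shared_output").show()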
emesday / oauth_callback.py
Created September 23, 2019 05:37
OAuth callback
import requests
from flask import Flask, request

app = Flask(__name__)

db = {
    'credentials': {
        '<client_id1>': '<client_secret1>',
        '<client_id2>': '<client_secret2>'
    }
}
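
The preview stops at the credential store. A minimal sketch of the callback route such a gist implies, assuming a standard OAuth 2.0 authorization-code flow; the /callback path, TOKEN_URL, and redirect URI are illustrative placeholders, not taken from the gist.

TOKEN_URL = 'https://provider.example.com/oauth/token'  # placeholder

@app.route('/callback')
def callback():
    # the provider redirects back with ?code=...; exchange it for a token
    code = request.args.get('code')
    client_id = '<client_id1>'
    resp = requests.post(TOKEN_URL, data={
        'grant_type': 'authorization_code',
        'code': code,
        'client_id': client_id,
        'client_secret': db['credentials'][client_id],
        'redirect_uri': 'https://your.app/callback',  # placeholder
    })
    return resp.json()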
emesday / LocalBinaryClassificationMetrics.scala
Created July 18, 2019 03:46
LocalBinaryClassificationMetrics.scala
package org.apache.spark.mllib.evaluation

import org.apache.spark.mllib.evaluation.binary._

import scala.collection.mutable

/**
 * A local (non-RDD) counterpart of BinaryClassificationMetrics,
 * computed over an in-memory Seq of (score, label) pairs.
 */
class LocalBinaryClassificationMetrics(
    val scoreAndLabels: Seq[(Double, Double)],
    val numBins: Int) {
emesday / TopByKeyAggregatorProxy.scala
Last active April 19, 2019 00:43
TopByKeyAggregatorProxy.scala
object TopByKeyAggregatorProxy {
  import scala.reflect.runtime.universe._
  import org.apache.spark.sql.TypedColumn
  import org.apache.spark.sql.expressions.Aggregator

  /**
   * Works on rows of the form (K1, K2, V) where K1 & K2 are IDs and V is the score value. Finds
   * the top `num` K2 items based on the given Ordering.
   */
  def asTypedColumn[K1: TypeTag, K2: TypeTag, V: TypeTag]
      (num: Int, ord: Ordering[(K2, V)]): TypedColumn[(K1, K2, V), Array[(K2, V)]] = {
    // reflective access to the private[recommendation] TopByKeyAggregator;
    // assumes its single constructor is (num, ord, implicit TypeTags) - a sketch
    Class.forName("org.apache.spark.ml.recommendation.TopByKeyAggregator")
      .getConstructors.head
      .newInstance(Int.box(num), ord, typeTag[K1], typeTag[K2], typeTag[V])
      .asInstanceOf[Aggregator[(K1, K2, V), _, Array[(K2, V)]]]
      .toColumn
  }
}
emesday / Reservoir.scala
Last active August 11, 2020 17:15
Reservoir Sampling for Scala Spark
import scala.reflect.ClassTag
import scala.util.Random

class Reservoir[T: ClassTag](
    private val size: Int,
    private val seed: Long = Random.nextLong()) extends Serializable {
  private val rand = new Random(seed)
  private val reservoir = new Array[T](size)
  private var count = 0L

  // Algorithm R update (sketch): fill first, then replace a random slot w.p. size/(count+1)
  def add(item: T): Unit = {
    if (count < size) reservoir(count.toInt) = item
    else { val r = (rand.nextDouble() * (count + 1)).toLong; if (r < size) reservoir(r.toInt) = item }
    count += 1
  }
}
emesday / autograd.py
Last active January 9, 2019 04:55
autograd.py
import math

class op:
    # base node of a tiny expression graph
    def eval(self, values):
        # value of this node, given a dict mapping variable names to numbers
        pass
    def grad(self, values, over):
        # derivative of this node w.r.t. the variable named `over`
        pass
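
The eval/grad signatures suggest nodes evaluated over a dict of variable values. A minimal sketch of how subclasses could plug into this interface; the var and mul node types below are illustrative, not part of the gist.

class var(op):
    # leaf node: looks its value up in `values` by name
    def __init__(self, name):
        self.name = name
    def eval(self, values):
        return values[self.name]
    def grad(self, values, over):
        return 1.0 if self.name == over else 0.0

class mul(op):
    # product node: product rule for the gradient
    def __init__(self, a, b):
        self.a, self.b = a, b
    def eval(self, values):
        return self.a.eval(values) * self.b.eval(values)
    def grad(self, values, over):
        return (self.a.grad(values, over) * self.b.eval(values) +
                self.a.eval(values) * self.b.grad(values, over))

expr = mul(var("x"), var("y"))
print(expr.eval({"x": 2.0, "y": 3.0}))       # 6.0
print(expr.grad({"x": 2.0, "y": 3.0}, "x"))  # 3.0 = d(x*y)/dx = y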
emesday / loc_to_csv.py
Last active August 6, 2018 07:37
Statistics Korea (KOSTAT) region data
# http://kssc.kostat.go.kr/ksscNew_web/kssc/common/CommonBoardList.do?gubun=1&strCategoryNameCode=019&strBbsId=kascrr&categoryMenu=014
import pandas as pd

xls = pd.ExcelFile("한국행정구역분류_2018.7.1.기준_최종.xls")
sheet = xls.parse(4, 1)  # sheet index 4, header on row 1
sheet.to_csv("loc.csv")
# 시도 = province, 시군구 = city/county/district, 행정구역명 = district name,
# 행정동 = administrative dong, 법정동 = legal-status dong
sheet["label"] = sheet["시도"] + "_" + sheet["시군구"]
sheet["text"] = sheet["시도"] + " " + sheet["시군구"] + " " + sheet["행정구역명"] + " " + sheet["행정동"] + " " + sheet["법정동"]
emesday / read_htk_format.py
Created December 27, 2017 08:46
read HTK feature format
import numpy as np

with open('path/to/htk.feat', 'rb') as f:
    # HTK header: nSamples and sampPeriod as big-endian int32,
    # then sampSize (bytes per frame) and parmKind as big-endian int16
    a = np.fromfile(f, dtype='>i4', count=2)
    b = np.fromfile(f, dtype='>i2', count=2)
    fdim = int(b[0] / 4)  # 4 bytes per big-endian float32 coefficient
    fea = np.fromfile(f, dtype='>f4').reshape(-1, fdim)
    nframes = fea.shape[0]
emesday / json-array-to-json-object-per-line.sh
Created August 15, 2017 08:51
Namuwiki database dump conversion - one JSON object per line
# The Namuwiki dump (as of namuwiki_170327) is stored as a single JSON array, which blocks distributed processing
# For distributed processing, it needs to be stored as one JSON object per line
# input: namuwiki_170327.json
# output: output.json
jq -nc --stream 'fromstream(1|truncate_stream(inputs))' namuwiki_170327.json > output.json
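
The same conversion without jq, as a hedged sketch using only Python's standard json module. Unlike the streaming jq command above, this loads the whole array into memory.

import json

with open("namuwiki_170327.json", encoding="utf-8") as f:
    docs = json.load(f)          # the dump is one big JSON array

with open("output.json", "w", encoding="utf-8") as out:
    for doc in docs:             # write one JSON object per line
        out.write(json.dumps(doc, ensure_ascii=False) + "\n")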
emesday / MapAccumulator.scala
Last active March 29, 2017 17:31
MapAccumulator for Spark 2.+
import org.apache.spark.util.AccumulatorV2

class MapAccumulator[K] extends AccumulatorV2[Map[K, Long], Map[K, Long]] {
  var underlying = new scala.collection.mutable.HashMap[K, Long]
  override def isZero: Boolean = underlying.isEmpty
  override def copy(): MapAccumulator[K] = {
    val newAcc = new MapAccumulator[K]
    newAcc.underlying = this.underlying.clone()
    newAcc
  }
  // remaining overrides, sketched to match the declared IN/OUT types
  override def reset(): Unit = underlying.clear()
  override def add(v: Map[K, Long]): Unit =
    v.foreach { case (k, n) => underlying(k) = underlying.getOrElse(k, 0L) + n }
  override def merge(other: AccumulatorV2[Map[K, Long], Map[K, Long]]): Unit = add(other.value)
  override def value: Map[K, Long] = underlying.toMap
}