Örjan Angré (Lundberg) oluies

  • Sweden
  • X @oluies
echo "15;23;35#18;14;89" | tr '#' '\012' | awk -F';' '{for (i=1;i<=NF;i++){print NR, i, $(i)} }'
import java.net.InetAddress

// Convert a dotted-quad IPv4 address (e.g. "192.168.0.1") into its unsigned 32-bit value as a Long.
def IPv4ToLong(dottedIP: String): Long = {
  val addrArray: Array[String] = dottedIP.split("\\.")
  var num: Long = 0
  var i: Int = 0
  while (i < addrArray.length) {
    val power: Int = 3 - i
    num = num + ((addrArray(i).toInt % 256) * Math.pow(256, power)).toLong
    i += 1
  }
  num
}
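A quick sanity check, not part of the gist; the java.net.InetAddress import above also allows deriving the same value from the parsed address bytes.

// Hypothetical usage check:
val asLong = IPv4ToLong("192.168.0.1")                  // 3232235521L

// Same value via InetAddress: fold the four address bytes into one unsigned 32-bit number.
val viaInet = InetAddress.getByName("192.168.0.1").getAddress
  .foldLeft(0L)((acc, b) => (acc << 8) | (b & 0xffL))   // 3232235521L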
import org.apache.spark.sql.types.{BooleanType, DoubleType, IntegerType, LongType, NumericType, ShortType, StringType, StructField, StructType, TimestampType}
import org.apache.hadoop.fs.{FileSystem, Path}

val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._

// Hand-built schema for the input files; the field list is cut off in the gist preview.
def getschemametod(): StructType = {
  StructType(
    Seq(

// Read a delimited text file into a DataFrame, optionally applying an explicit schema.
def csvToDF(file: Path, delimiter: String, charset: String = "UTF8",
            useHeader: Boolean = true, schema: Option[StructType] = None) = {
  val df = schema match {
    case Some(schema) => sqlContext.read
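      // Continuation sketch, not the gist's own code (the preview stops at the line above):
      // it assumes the com.databricks.spark.csv reader commonly paired with SQLContext in Spark 1.x.
      .format("com.databricks.spark.csv")
      .option("header", useHeader.toString)
      .option("delimiter", delimiter)
      .option("charset", charset)
      .schema(schema)
      .load(file.toString)
    case None => sqlContext.read
      .format("com.databricks.spark.csv")
      .option("header", useHeader.toString)
      .option("delimiter", delimiter)
      .option("charset", charset)
      .option("inferSchema", "true")
      .load(file.toString)
  }
  df
}
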
val sqlloggik4_df = """
SELECT *,
       CAST(id AS BIGINT) * 10000 + SUM(new_session)
           OVER (PARTITION BY id ORDER BY starttid) AS session_id
FROM (
  SELECT *,
         unix_timestamp(l.starttid) - LAG(unix_timestamp(l.starttid))
             OVER (PARTITION BY l.id ORDER BY l.starttid) AS timesincelast,
         -- a gap of 30 minutes or more starts a new session
         CASE
           WHEN unix_timestamp(l.starttid) - LAG(unix_timestamp(l.starttid))
                    OVER (PARTITION BY l.id ORDER BY l.starttid) >= 30 * 60
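             -- Sketch continuation, not shown in the gist preview (it stops at the line above);
             -- the source table name "logg" below is hypothetical.
             THEN 1
             ELSE 0
         END AS new_session
  FROM logg l
)
"""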
// Luhn checksum for a Swedish personal number (personnummer).
def checkSEPnr(pnr: String) = {
  val chars = pnr.toList
  val removeMinus = chars.view.filter(_ != '-')   // drop the '-' separator
  val charToInt = removeMinus.view.map(_ - '0')   // characters -> digits
  // Weight the first nine digits 2,1,2,1,... and sum the digits of each product.
  val lunsum: Int = charToInt.take(9).foldLeft((0, 2)) { (r, c) =>
    (r._1 + (c * r._2) / 10 + (c * r._2) % 10, if (r._2 == 2) 1 else 2)
  }._1 % 10
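  // The gist preview ends here. A sketch of the usual finish (an assumption, not shown
  // in the source): the number is valid when its 10th digit equals the Luhn check digit.
  val digits = charToInt.toList
  digits.length == 10 && digits(9) == (10 - lunsum) % 10
}

// e.g. checkSEPnr("811218-9876")  // true: a made-up but checksum-valid example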
oluies / vectorsum.scala
Created September 12, 2016 13:04
Summarize a vector in Spark
import org.apache.spark.sql.Row
import breeze.linalg.DenseVector
import org.apache.spark.mllib.linalg.{Vector, Vectors}

val t_df = sqlContext.read.parquet("/user/s89718/Pivoted_cust_weekday_total_with_Clusters.parquet")
val tm_df = t_df.select("IP_ID", "assembled")

// Zero vector used as the seed for the element-wise sum (one slot per weekday).
val emptyVector = DenseVector(Array.fill(7)(0.0))
val zeVector = tm_df
  .rdd
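  // The gist preview stops at .rdd above. A sketch of one way to finish the element-wise
  // sum (assumption: the "assembled" column holds mllib Vectors of length 7).
  .map { case Row(_, v: Vector) => DenseVector(v.toArray) }  // mllib Vector -> Breeze
  .fold(emptyVector)(_ + _)                                  // sum the vectors across all rows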
oluies / spark_unix_timestamp.scala
Created August 19, 2016 11:57
Spark: parse timestamp text into a timestamp and extract some date parts
val df = Seq((1L, "03JUN2015 19.28.00"), (2L, "#$@#@#")).toDF("id", "dts")

import org.apache.spark.sql.functions.{dayofmonth, month, unix_timestamp, year}

// Parse the text column with the matching pattern, then pull out day-of-month, month and year.
df.withColumn("ts", unix_timestamp($"dts", "ddMMMyyyy HH.mm.ss").cast("timestamp"))
  .withColumn("dom", dayofmonth($"ts"))
  .withColumn("month", month($"ts"))
  .withColumn("year", year($"ts"))
  .show(2, false)
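Not in the original gist: a row whose dts does not match the pattern (the second row above) comes back with null in ts and in every derived column, since unix_timestamp returns null when parsing fails.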