Skip to content

Instantly share code, notes, and snippets.

View rzykov's full-sized avatar

Roman Zykov rzykov

View GitHub Profile
@rzykov
rzykov / XgBoostRankSparkScala.scala
Last active March 31, 2022 22:43
XGBoost Spark - ranking problem
import _root_.ml.dmlc.xgboost4j.scala.spark.XGBoost
import org.apache.spark.ml.feature.LabeledPoint
def encodeFeaturesToLabeledPoint(features: RDD[Feature], relevance: Option[RDD[Relevance]], workers: Int)
(implicit parallel: Int): (RDD[LabeledPoint], Seq[String], Seq[Seq[Int]]) = {
val missingValue = Double.NaN
val names = features
.map { _.name }
package org.deeplearning4j.learn
import java.util.Arrays
import org.nd4j.api.linalg.DSL._
import org.nd4j.linalg.api.ndarray.INDArray
import org.nd4j.linalg.factory.Nd4j
import org.nd4j.linalg.indexing.{NDArrayIndex, BooleanIndexing}
import org.nd4j.linalg.indexing.conditions.Conditions
import org.nd4j.linalg.indexing.functions.Value
@rzykov
rzykov / Nd4j_lstm.scala
Created August 21, 2017 08:51 — forked from harperjiang/Nd4j_lstm.scala
Performance comparison of numpy vs nd4j on LSTM implementation
import org.nd4j.linalg.api.ndarray.INDArray
import org.nd4j.linalg.api.ops.impl.broadcast.BroadcastAddOp
import org.nd4j.linalg.api.rng.distribution.impl.UniformDistribution
import org.nd4j.linalg.factory.Nd4j
import scala.util.Random
object Xavier {
def init(shape: Array[Int]): INDArray = {
var n = shape.dropRight(1).product
@rzykov
rzykov / DataAnalysisIntro.scala
Last active September 9, 2021 12:42
Data analysis using Scala
//CODE:
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.tribbloid.ispark.display.dsl._
import scala.util.Try
// Create the SQLContext used by this notebook. `sc` is not defined in this
// snippet; it must already exist in the surrounding session
// (presumably the notebook's SparkContext — confirm).
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
@rzykov
rzykov / DataAnalysisIntro1.scala
Last active October 6, 2021 11:35
Data Analysis with Scala - medium
//CODE:
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.tribbloid.ispark.display.dsl._
import scala.util.Try
// Create the SQLContext used by this notebook. `sc` is not defined in this
// snippet; it must already exist in the surrounding session
// (presumably the notebook's SparkContext — confirm).
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
@rzykov
rzykov / DataAnalysisIntro2.scala
Last active October 6, 2021 12:41
DataAnalysisIntro2.scala
//CODE:
// Ten most frequent categories, as (categoryId, count) pairs.
dataAov.map(_.categoryId) // keep only the categoryId of every record
.countByValue() // number of occurrences of each distinct categoryId
.toSeq
.sortBy { case (_, count) => -count } // largest count first
.take(10) // keep the first ten entries
//OUT:
//format: (categoryId, count)
@rzykov
rzykov / DataAnalysisIntro3.scala
Last active October 6, 2021 12:48
DataAnalysisIntro3.scala
//CODE:
// OS families that take part in the comparison.
val interestedBrowsers = List("Android", "OS X", "iOS", "Linux", "Windows")
// Average purchase amount per OS family, restricted to category 128.
// Result: collected (osFamily, average) pairs.
val osAov = dataAov.filter { row =>
  // keep rows of a listed OS family that belong to category 128
  interestedBrowsers.contains(row.osFamily) && row.categoryId == 128
}
.map { row => (row.osFamily, (row.aov, 1.0)) } // (amount, 1.0) so sums and counts accumulate together
.reduceByKey { case ((sumA, countA), (sumB, countB)) => (sumA + sumB, countA + countB) }
.map { case (family, (total, count)) => (family, total / count) } // summed amount divided by number of rows
.collect()
//OUT
@rzykov
rzykov / DataAnalysisIntro4.scala
Created October 6, 2021 11:32
DataAnalysisIntro4.scala
//CODE of Highcharts
import com.quantifind.charts.Highcharts._
import ru.retailrocket.ispark.wisp._
// Convert osAov to a List, pass it to `column`, and hand the result to `draw`.
// NOTE(review): `column` and `draw` presumably come from the Highcharts / wisp
// imports above — confirm which import provides each.
draw(column(osAov.toList))
@rzykov
rzykov / DataAnalysisIntro5.scala
Created October 6, 2021 11:33
DataAnalysisIntro5.scala
//CODE
import org.ddahl.rscala._
import ru.retailrocket.ispark._
// Create an R client by calling RClient("R", false).
// NOTE(review): the meaning of the two arguments is defined by rscala's RClient — confirm.
def connect() = RClient("R", false)
// R client used by the plot call below. Marked @transient
// (presumably so it is left out when enclosing state is serialized — confirm).
@transient
val r = connect()
// Run the R `barplot` expression with two bound variables:
//   cs    -> second element of every osAov pair
//   names -> first element of every osAov pair
// Fix: the original read `os.Aov`, which is not a defined name; the value is `osAov`.
R.plot(r, "barplot(cs, names.arg=names, col = 'red' )", Map("cs" -> osAov.map(_._2), "names" -> osAov.map(_._1)) )
@rzykov
rzykov / DataAnalysisIntro6.scala
Created October 6, 2021 11:34
DataAnalysisIntro6.scala
//CODE of MIC
import data.VarPairData
import mine.core.MineParameters
import analysis.Analysis
import analysis.results.BriefResult
import scala.util.Random
//Encode a discrete value by randomly changing the order of the "codes"
def encode(col: Array[String]): Array[Double] = {