K. kaja47

## gist:554f62c61f21b0420720
// min-hash

// NOTE(review): this appears to be a pseudocode sketch rather than compilable
// Scala — `items`, `vectorPairwise`, `min` and `infinity` are not defined in
// this snippet, and `fs` is declared without a value.
val fs: Vector[Int => Int] // hash functions

// Apply every hash function in `fs` to each item, giving one vector of hash
// values per item, then fold those vectors together with vectorPairwise(min),
// starting from a vector filled with `infinity` (presumably keeping the
// element-wise minimum per hash function — confirm against vectorPairwise).
items map { it => fs map { f => f(it) } } fold (vectorPairwise(min), initialValue = Vector.fill(infinity))


// HyperLogLog

## csfd-crawler.php
<?php

use Atrox\Matcher;
use Atrox\Curl;
use Atrox\Async;

$userListMatcher = Matcher::multi('//table[@class="ui-table-list"]//tr', (object) [
  'url'      => Matcher::single('td/a/@href')->map(function ($x) { return "http://www.csfd.cz$x"; }),
  'points'   => Matcher::single('td[3]')->asInt(),
  'ratings'  => Matcher::single('td[4]')->asInt(),

## rachel-riley.scala
/** Base type of the expression tree. Sealed, so direct subtypes must be declared in this file. */
sealed trait Tree {
  /** Evaluates this tree to an optional `Int`; when the result is `None` is up to each subtype. */
  def eval: Option[Int]
}

/** Terminal node of a [[Tree]] that wraps a single integer.
  *
  * @param n the integer held by this leaf
  */
case class Leaf(n: Int) extends Tree {
  /** A leaf always evaluates successfully to its own value. */
  def eval = Some(n)

  /** Renders the leaf as the decimal text of its integer. */
  override def toString = Integer.toString(n)
}

case class Node(op: String, l: Tree, r: Tree) extends Tree {

## combinations.scala
// generate all combinations of integers in range from 0 to `len`-1
// fast as fuck
def combIdxs(len: Int, k: Int): Iterator[Array[Int]] = {
  val arr = Array.range(0, k)
  arr(k-1) -= 1
  val end = k-1

  Iterator.continually {
    arr(end) += 1
    if (arr(end) >= len) {

## change-count.php
<?php

/**
 * Entry point of the change-counting computation.
 *
 * Forwards $amount unchanged to cc() together with 5 as the $kindOfCoins
 * argument and returns cc()'s result as-is.
 *
 * @param mixed $amount value passed to cc() as its first argument
 * @return mixed whatever cc($amount, 5) returns
 */
function countChange($amount) {
  $kindOfCoins = 5;
  return cc($amount, $kindOfCoins);
}

function cc($amount, $kindOfCoins) {
  if ($amount === 0) return 1;
  if ($amount < 0) return 0;
  if ($kindOfCoins === 0) return 0;

## pearson.scala
import breeze.linalg._

def corr(a: DenseVector[Double], b: DenseVector[Double]): Double = {
  if (a.length != b.length)
    sys.error("you fucked up")

  val n = a.length

  val (amean, avar) = meanAndVariance(a)
  val (bmean, bvar) = meanAndVariance(b)

## promises-promises.php
<?php

// generators are weaker than monads and this is therefore not possible
// CPS transformation to state machines sucks balls

$usersWhoCommentedOnTheirPosts = run($seqMonad, function () {
  $user = (yield allUsers());
  $post = (yield $user->posts);
  $comm = (yield $post->comments);
  if ($comm->author === $comm->author)

## gist:5954091

      
              1 file
            
          
              0 forks
            
          
              3 comments
            
          
              0 stars
            
          
                kaja47
                / gist:5954091
            
            
              Last active
              December 19, 2015 12:19
            
              
                Vysvětlení tohoto: https://gist.github.com/kaja47/5944287
              
          
    Můj problém je jednoduchý: mám pole doublů a potřebuji top-k hodnot a jejich indexů.
Top-k vrátí k největších elementů, které jsou větší než nějaké minimum. Funkce topKEstaimateMin velice rychle odhadne tohle minimum (jednou sekvenčně projde pole a ke cache se chová naprosto ideálně). Odhad nebude skutečné minimum, ale bude o něco menší, tedy vybere víc než k elementů. Ale zároveň garantuje, že nebude větší než skutečné minimum. Je to tedy dolní mez pro minimum.
Pracuje tak, že rozhazuje elementy do k bucketů (prakticky je to nejbližší mocnina dvou větší než k, protože pak můžu použít maskování bitů, které je mnohem rychlejší než modulo) a počítá jejich maxima. Z těchto maxim pak vezmu minimum, u kterého je garantováno, že je menší nebo rovno než k (nebo více) čísel ze zdrojového pole a v ideálním případě je poměrně blízko skutečného minima (v mých testech se ukazuje, že pro n=340000 a k=1000 to vrátí minimum pro 6000 čísel, tedy odfiltruje 98 % hodnot).
Dá se to dělat taky obráceně,

  
## top-k-magic.scala
  def topKEstaimateMin(arr: Array[Double], k: Int): Double = {
    import java.lang.Integer
    val upperK = Integer.highestOneBit(k) * 2 // must be higher than k, otherwise can produce wrong minimum
    val bits = Integer.numberOfTrailingZeros(upperK)
    val mask = (1 << bits) - 1
    var i = 0
    val maximums = new Array[Double](upperK)
    while (i < arr.length) {
      val pos = i & mask
      val m = maximums(pos)

## cache-localtiy-son.scala
final class SegmentSumBuffer(val size: Int) {

  private val segmentBits = 12 // 4096 elemets per segmen (n * 8 bytes must fit in L1 cache)
  private val segments = size / (1 << segmentBits) + 1
  private val segmentSize = 8192 // arbitrary number

  private val keySegments = Array.fill(segments) { new Array[Int](segmentSize) }
  private val valSegments = Array.fill(segments) { new Array[Double](segmentSize) }
  private val positions = new Array[Int](segments) // next pos
	// min-hash

	val fs: Vector[Int => Int] // hash funkce

	items map { it => fs map { f => f(it) } } fold (vectorPairwise(min), initialValue = Vector.fill(infinity))



	// HyperLogLog
	<?php

	use Atrox\Matcher;
	use Atrox\Curl;
	use Atrox\Async;

	$userListMatcher = Matcher::multi('//table[@class="ui-table-list"]//tr', (object) [
	'url' => Matcher::single('td/a/@href')->map(function ($x) { return "http://www.csfd.cz$x"; }),
	'points' => Matcher::single('td[3]')->asInt(),
	'ratings' => Matcher::single('td[4]')->asInt(),
	sealed trait Tree {
	def eval: Option[Int]
	}

	case class Leaf(n: Int) extends Tree {
	override def toString = n.toString
	def eval = Some(n)
	}

	case class Node(op: String, l: Tree, r: Tree) extends Tree {
	// genrate all combinations of integers in range from 0 to `len`-1
	// fast as fuck
	def combIdxs(len: Int, k: Int): Iterator[Array[Int]] = {
	val arr = Array.range(0, k)
	arr(k-1) -= 1
	val end = k-1

	Iterator.continually {
	arr(end) += 1
	if (arr(end) >= len) {
	<?php

	function countChange($amount) {
	return cc($amount, 5);
	}

	function cc($amount, $kindOfCoins) {
	if ($amount === 0) return 1;
	if ($amount < 0) return 0;
	if ($kindOfCoins === 0) return 0;
	import breeze.linalg._

	def corr(a: DenseVector[Double], b: DenseVector[Double]): Double = {
	if (a.length != b.length)
	sys.error("you fucked up")

	val n = a.length

	val (amean, avar) = meanAndVariance(a)
	val (bmean, bvar) = meanAndVariance(b)
	<?php

	// generators are weaker than monads and this is therefore not possible
	// CPS transformation to state machines sucks balls

	$usersWhoCommentedOnTheirPosts = run($seqMonad, function () {
	$user = (yield allUsers());
	$post = (yield $user->posts);
	$comm = (yield $post->comments);
	if ($comm->author === $comm->author)
	def topKEstaimateMin(arr: Array[Double], k: Int): Double = {
	import java.lang.Integer
	val upperK = Integer.highestOneBit(k) * 2 // must be higher than k, otherwise can produce wrong minimum
	val bits = Integer.numberOfTrailingZeros(upperK)
	val mask = (1 << bits) - 1
	var i = 0
	val maximums = new Array[Double](upperK)
	while (i < arr.length) {
	val pos = i & mask
	val m = maximums(pos)
	final class SegmentSumBuffer(val size: Int) {

	private val segmentBits = 12 // 4096 elemets per segmen (n * 8 bytes must fit in L1 cache)
	private val segments = size / (1 << segmentBits) + 1
	private val segmentSize = 8192 // arbitrary number

	private val keySegments = Array.fill(segments) { new Array[Int](segmentSize) }
	private val valSegments = Array.fill(segments) { new Array[Double](segmentSize) }
	private val positions = new Array[Int](segments) // next pos