deaktator / type_lambda.scala
Last active January 2, 2020 17:03
Type lambdas in Scala
// Also see: http://ktoso.github.io/scala-types-of-types/#type-lambda-span-style-color-red-span
import scala.language.higherKinds
//
// Notice Monad has one unbound type parameter. A two-parameter type
// constructor like Either[E, A] must be partially applied before it can be
// a Monad, which is where a type lambda such as ({type L[A] = Either[E, A]})#L
// comes in.
//
trait Monad[M[_]] {
  def point[A](a: A): M[A]
  def flatMap[A, B](m: M[A])(f: A => M[B]): M[B]
  def map[A, B](m: M[A])(f: A => B): M[B] = flatMap(m)(f.andThen(point(_)))
}
deaktator / none_count.scala
Created June 27, 2018 22:03
Counting the number of Nones in a Sequence of Product in Scala
// The following is useful for counting data that is not filled in.
// It can be adapted to Spark by changing nonNoneFieldProbs to use Spark's aggregate.
/**
 * Recursively determine which fields are Some.
 *
 * Uses Java reflection to determine field names and Scala product iterators to
 * determine field values.
 * @param value a value that's a product.
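The Scala body is cut off in this preview. As a loose Python analogue of the same idea (field names via reflection, field values via iteration), here is a sketch using dataclasses; the names below are mine, not the gist's:

from dataclasses import dataclass, fields, is_dataclass
from typing import Any, Dict, Iterable

def none_counts(value: Any, prefix: str = "") -> Dict[str, int]:
    """Recursively map each field name to 1 if its value is None, else 0."""
    counts: Dict[str, int] = {}
    for f in fields(value):
        v = getattr(value, f.name)
        name = prefix + f.name
        if is_dataclass(v):
            counts.update(none_counts(v, name + "."))  # recurse into nested records
        else:
            counts[name] = 1 if v is None else 0
    return counts

def total_none_counts(values: Iterable[Any]) -> Dict[str, int]:
    """Sum the per-record None counts across a sequence."""
    totals: Dict[str, int] = {}
    for value in values:
        for k, c in none_counts(value).items():
            totals[k] = totals.get(k, 0) + c
    return totals

@dataclass
class User:
    name: str
    age: Any

print(total_none_counts([User("a", None), User("b", 1)]))  # {'name': 0, 'age': 1}
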
deaktator / cats_eithert_short_circuit_fold.scala
Last active July 10, 2018 06:20
Scala cats EitherT Foldable short-circuiting fold
// Example of a short-circuiting fold using EitherT
//
// I don't just want a simple find; I want to do retry logic given a bunch of candidates.
// Is there something better than the ApplicativeError for EitherT?
// If you can think of a better way to do something like `find`, please tweet @deaktator.
import cats.{Foldable, Id}
import cats.data.{EitherT, NonEmptyList}
import cats.effect.IO
import cats.instances.list.catsStdInstancesForList
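Stripped of the cats machinery, the behavior being sought is roughly the following (a plain-Python sketch of a short-circuiting fold, not the EitherT encoding; attempt and its tagged-tuple protocol are mine):

def short_circuit_fold(candidates, attempt):
    """Fold over candidates, stopping at the first ("right", value) success.

    attempt(c) returns ("right", value) on success or ("left", error) on
    failure, standing in for Either in the cats version.
    """
    errors = []
    for c in candidates:
        tag, payload = attempt(c)
        if tag == "right":
            return ("right", payload)  # short circuit: later candidates never run
        errors.append(payload)         # keep failures for retry bookkeeping
    return ("left", errors)

# short_circuit_fold([1, 2, 3], lambda c: ("right", c) if c > 1 else ("left", c))
# == ("right", 2); candidate 3 is never attempted.
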
deaktator / sklearn_scorer_wt_invariant.py
Last active February 22, 2019 16:00
Probabilistic Proof that all scikit-learn scorers that support sample_weight are invariant to the scale of sample_weight
# Author: Ryan Deak
# Date: Feb 21, 2019
# Purpose: Determine whether all scikit-learn metrics that are listed in
# ``sklearn.metrics.SCORERS`` are scale invariant, meaning the same result
# is produced given ``sample_weight`` or ``sample_weight`` multiplied by
# ANY positive constant. This is done via a probabilistic proof.
#
# This helps to prove a proposed reformulation of importance weighting.
#
# NOTE: According to the test, scorers in ``non_imp_wt_metrics`` may be
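A minimal standalone version of the kind of check described above, for a single scorer; the setup is mine and assumes only that accuracy_score accepts sample_weight:

import numpy as np
from sklearn.metrics import accuracy_score

rng = np.random.RandomState(0)
y_true = rng.randint(0, 2, size=100)
y_pred = rng.randint(0, 2, size=100)
w = rng.uniform(0.1, 10.0, size=100)

base = accuracy_score(y_true, y_pred, sample_weight=w)
for c in (1e-3, 7.0, 1e6):  # arbitrary positive scales
    scaled = accuracy_score(y_true, y_pred, sample_weight=c * w)
    assert np.isclose(base, scaled), (base, scaled, c)
print("accuracy_score is invariant to the scale of sample_weight")
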
deaktator / buggy_key_fn.py
Last active March 1, 2019 22:03
Illustration of correct and buggy key functions used in optimization (min, max, etc).
from typing import Tuple
from numpy import floor

# Implementation note: In Python, use numpy's floor because math.floor doesn't
# gracefully handle NaN: math.floor's codomain is int, so flooring NaN with the
# standard library raises a ValueError. numpy's floor returns a float and simply
# propagates NaN.
def correct_key_fn(f: float) -> Tuple[float, float]:
    # Notice work can be done in the key fn. If it's "pure", it could be memoized.
    # numpy's floor returns a float, hence Tuple[float, float], not Tuple[int, float].
    return (floor(f), f - floor(f))
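The gist's buggy counterpart isn't shown in this preview; a hypothetical buggy_key_fn built on math.floor illustrates the failure mode described above:

import math

def buggy_key_fn(f: float) -> Tuple[int, float]:
    # math.floor must return an int, so it raises ValueError on NaN.
    return (math.floor(f), f - math.floor(f))

values = [1.5, float("nan"), 0.25]
print(min(values, key=correct_key_fn))  # 0.25; NaN comparisons are False, so no crash
try:
    min(values, key=buggy_key_fn)
except ValueError as e:
    print(e)  # cannot convert float NaN to integer
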
deaktator / sklearn_test_sample_weight_cross_validation.py
Last active March 4, 2019 00:04
A test for importance (sample) weighted cross validation in sklearn.
import numpy as np
import pytest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, brier_score_loss, log_loss, precision_score, make_scorer
from sklearn.model_selection import GridSearchCV
@pytest.mark.parametrize("metric,greater_is_better,needs_proba,sample_wt", [
    (accuracy_score, True, False, [1, 999999, 1, 999999]),
    (accuracy_score, True, False, [100000, 200000, 100000, 200000]),
    (accuracy_score, True, False, [100000, 100000, 100000, 100000]),
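The crux of the test: the same pair of prediction vectors can be ranked differently by the unweighted and weighted versions of a metric. A small standalone illustration with made-up data:

from sklearn.metrics import accuracy_score

y      = [0, 1, 0, 1]
pred_a = [0, 0, 0, 1]  # 3/4 correct unweighted
pred_b = [1, 1, 1, 1]  # 2/4 correct unweighted
w      = [1, 9, 1, 9]  # pred_a's mistake falls on a heavy example

print(accuracy_score(y, pred_a), accuracy_score(y, pred_b))  # 0.75 0.5  -> prefers pred_a
print(accuracy_score(y, pred_a, sample_weight=w),
      accuracy_score(y, pred_b, sample_weight=w))            # 0.55 0.9  -> prefers pred_b
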
deaktator / hypergeom_sklearn_imp_wt.py
Created March 5, 2019 03:00
Using the hypergeometric distribution to model the expected accuracy in cross validation.
from scipy.stats import hypergeom
# Run the following to see the graph:
# for i in range(1, 400):
# print(f"{i}\t{exp_test_acc(400, 201, i)}")
#
def exp_test_acc(pop_size, pos_in_pop, fold1_size):
    fold2_size = pop_size - fold1_size
    E_acc = 0.0
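The preview cuts off before the expectation is computed. One plausible completion, under my assumption (not stated in the preview) that a classifier trained on fold 1 predicts fold 1's majority class for everything in fold 2:

from scipy.stats import hypergeom

def exp_test_acc_sketch(pop_size, pos_in_pop, fold1_size):
    fold2_size = pop_size - fold1_size
    neg_in_pop = pop_size - pos_in_pop
    e_acc = 0.0
    # k = number of positives drawn into fold 1 (hypergeometric).
    for k in range(max(0, fold1_size - neg_in_pop), min(fold1_size, pos_in_pop) + 1):
        p_k = hypergeom.pmf(k, pop_size, pos_in_pop, fold1_size)
        pos_in_fold2 = pos_in_pop - k
        # Predict fold 1's majority class for all of fold 2.
        if k >= fold1_size - k:
            acc = pos_in_fold2 / fold2_size
        else:
            acc = (fold2_size - pos_in_fold2) / fold2_size
        e_acc += p_k * acc
    return e_acc
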
deaktator / wt_cv_eval_is_diff_unwt_cv.py
Last active March 11, 2019 03:57
Shows that sklearn cross validation with unweighted metrics can return different optimal param values than when sample weights are used in the metrics.
# ============================================================================
# R.M. Deak wt_cv_eval_is_diff_unwt_cv.py
#
# Runs scikit-learn's cross validation with GridSearchCV and shows that
# different optimal parameter values may be returned by GridSearchCV than
# when using cross validation with sample_weights passed to the scoring
# function.
# ============================================================================
from typing import NamedTuple
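For contrast with GridSearchCV's unweighted scoring, a hand-rolled sketch of cross validation that applies the weights in both fitting and scoring; the helper below is mine, not the gist's:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

def weighted_cv_score(X, y, w, C: float) -> float:
    """Mean weighted accuracy over folds, with sample_weight used in fit AND score."""
    scores = []
    for train, test in KFold(n_splits=3, shuffle=True, random_state=0).split(X):
        model = LogisticRegression(C=C).fit(X[train], y[train], sample_weight=w[train])
        scores.append(accuracy_score(y[test], model.predict(X[test]), sample_weight=w[test]))
    return float(np.mean(scores))

# Choosing C by max weighted_cv_score can disagree with an unweighted GridSearchCV.
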
deaktator / const_approx_1.scala
Created March 19, 2019 07:33
Original function to be approximated
def something(n: Long, k: Long): BigDecimal = {
  @scala.annotation.tailrec
  def h(ni: Long, ki: Long, total: BigDecimal): BigDecimal = {
    if (ki <= 0) total
    else {
      val p = BigDecimal.exact(ki) / ni
      val ap = p / k
      h(ni - 1, ki - 1, total + ap)
    }
  }
  // Kick off the recursion with a zero running total.
  h(n, k, BigDecimal(0))
}
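For reference, my reading of the recursion above (given the restored h(n, k, BigDecimal(0)) call): it computes
something(n, k) = (1/k) * Σ_{i=0}^{k-1} (k - i) / (n - i),
i.e. the mean of k ratios whose numerator and denominator both step down by one.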