tazjin/Algorithm1.hs

## Algorithm1.hs
module Algorithm1 where

import Countries
import Data.List
import System.Random

-- Get random country, weighing the population more
getCountry :: StdGen -> [Country] -> Country
getCountry gen unsorted = helper gen countries
  where
    countries = sort unsorted
    helper _ [c] = c
    helper g (x:xs) =
      let (y, g2) = random g :: (Bool, StdGen)
      in if y then x else helper g2 xs

## Algorithm2.hs
module Algorithm2 where

import Countries (Country)
import Data.List
import System.Random

-- Get random country, weighing the population more
getCountry :: StdGen -> [Country] -> Country
getCountry gen unsorted = helper gen countries
  where
    countries = sort unsorted
    helper :: StdGen -> [Country] -> Country
    helper _ [c] = c
    helper g (a:b:[]) = fst $ ratherFirstThanLast g (a, b)
    helper g l =
      let halves = halveList l
          (pick, g2) = ratherFirstThanLast g halves
      in helper g2 pick

-- Returns either the first or second element of the tuple,
-- but more likely the first than the second
ratherFirstThanLast :: StdGen -> (a, a) -> (a, StdGen)
ratherFirstThanLast gen (a, b) =
  -- Get a number from 0-2...
  let (n, newGen) = randomR (0,2) gen :: (Int, StdGen)
  -- then return first element with 2:1 probability
  in if n == 0 then (b, newGen) else (a, newGen)

-- Split a list in half
halveList :: [a] -> ([a], [a])
halveList l = splitAt (length l `div` 2) l

## Countries.hs
-- List of many countries with according population
module Countries where

newtype Country = Country { unCountry :: (String, Int) }
                deriving (Eq, Show)

-- Ordering by second tuple field
-- Unsound but pragmatic instance :-)
instance Ord Country where
  compare c1 c2 = compare (snd $ unCountry c2)
                          (snd $ unCountry c1)

countries :: [Country]
countries = map Country
  [("China", 1364410000)
  ,("India", 1244010000)
  ,("United States", 318030000)
  ,("Indonesia", 247424598)
  ,("Brazil", 202550000)
  ,("Pakistan", 186435000)
  ,("Nigeria", 173615000)
  ,("Bangladesh", 152518015)
  ,("Russia", 143700000)
  ,("Japan", 127140000)
  ,("Mexico", 119713203)
  ,("Philippines", 99554400)
  ,("Vietnam", 89708900)
  ,("Ethiopia", 86613986)
  ,("Egypt", 86446400)
  ,("Germany", 80716000)
  ,("Iran", 77427000)
  ,("Turkey", 76667864)
  ,("Democratic Republic of the Congo", 67514000)
  ,("France", 65885000)
  ,("Thailand", 64456700)
  ,("United Kingdom", 63705000)
  ,("Italy", 60021955)
  ,("Burma", 53259000)
  ,("South Africa", 52981991)
  ,("South Korea", 50219669)
  ,("Colombia", 47588000)
  ,("Spain", 46609700)
  ,("Ukraine", 45395604)
  ,("Tanzania", 44928923)
  ,("Kenya", 44354000)
  ,("Argentina", 42669500)
  ,("Algeria", 38700000)
  ,("Poland", 38496000)
  ,("Sudan", 37289406)
  ,("Uganda", 35357000)
  ,("Canada", 35344962)
  ,("Iraq", 34035000)
  ,("Morocco", 33257800)
  ,("Peru", 30475144)
  ,("Uzbekistan", 30183400)
  ,("Malaysia", 30113000)
  ,("Saudi Arabia", 29994272)
  ,("Venezuela", 28946101)
  ,("Nepal", 26494504)
  ,("Afghanistan", 25500100)
  ,("Yemen", 25235000)
  ,("North Korea", 24895000)
  ,("Ghana", 24658823)
  ,("Mozambique", 23700715)
  ,("Australia", 23488300)
  ,("Taiwan", 23382948)
  ,("Ivory Coast", 23202000)
  ,("Syria", 21898000)
  ,("Madagascar", 21263403)
  ,("Angola", 20609294)
  ,("Cameroon", 20386799)
  ,("Sri Lanka", 20277597)
  ,("Romania", 20121641)
  ,("Chile", 17620000)
  ,("Burkina Faso", 17322796)
  ,("Kazakhstan", 17221000)
  ,("Niger", 17138707)
  ,("Netherlands", 16850300)
  ,("Malawi", 16363000)
  ,("Guatemala", 15806675)]

## Main.hs
module Main where

import Countries -- source: Wikipedia
import Algorithm2 -- Change algorithm here

import Control.Concurrent
import Control.Concurrent.MVar
import Control.Monad
import System.Random
import System.Environment


-- Run it (assuming 4 cores). Bruteforce MVar concurrent
-- thread waiting
main :: IO ()
main = do
  (n:_) <- fmap (map read) getArgs
  let nc = n `div` 4
  done1 <- newEmptyMVar
  done2 <- newEmptyMVar
  done3 <- newEmptyMVar
  flip forkFinally (\_ -> putMVar done1 ()) $ replicateM_ nc run
  flip forkFinally (\_ -> putMVar done2 ()) $ replicateM_ nc run
  flip forkFinally (\_ -> putMVar done3 ()) $ replicateM_ nc run
  replicateM_ nc run
  takeMVar done1
  takeMVar done2
  takeMVar done3

run :: IO ()
run = do
  gen <- newStdGen
  print $ unCountry $ getCountry gen countries

## results.md

      
    Raw
  

              results.md
            
          
    Weighted "binary-search" randomness

I decided to measure two related algorithms that came up in a job interview. The idea is to randomly pick a country from a list of countries with their population, but with biased randomness towards higher poulations.
Algorithms are in Algorithm1.hs and Algorithm2.hs respectively.
Expectations: Algorithm1 is faster but significantly more biased. (Wrong! See bottom)
Algorithm1

Compiled with ghc -o countries -threaded -O2 Main.hs
[vincent@tazbook countries]$ time ./countries 4000000 +RTS -N4 | sort | uniq -c
  15716 ("Bangladesh",152518015)
 125087 ("Brazil",202550000)
1999377 ("China",1364410000)
      8 ("Democratic Republic of the Congo",67514000)
    135 ("Egypt",86446400)
    220 ("Ethiopia",86613986)
      3 ("France",65885000)
     69 ("Germany",80716000)
 999985 ("India",1244010000)
 250622 ("Indonesia",247424598)
     36 ("Iran",77427000)
   3853 ("Japan",127140000)
   1947 ("Mexico",119713203)
  31223 ("Nigeria",173615000)
  62077 ("Pakistan",186435000)
   1008 ("Philippines",99554400)
   7675 ("Russia",143700000)
      1 ("Thailand",64456700)
     18 ("Turkey",76667864)
      2 ("United Kingdom",63705000)
 500457 ("United States",318030000)
    481 ("Vietnam",89708900)

real	2m15.730s
user	3m0.523s
sys	1m29.130s

I ran it a few more times, yielding these timings:
[vincent@tazbook countries]$ time ./countries 4000000 +RTS -N4 > /dev/null 
real	2m16.750s
user	2m48.253s
sys	1m44.527s

[vincent@tazbook countries]$ time ./countries 4000000 +RTS -N4 > /dev/null
real	2m20.478s
user	3m5.947s
sys	1m35.910s


Algorithm2

Compiled with ghc -o countries -threaded -O2 Main.hs
[vincent@tazbook countries]$ time ./countries 4000000 +RTS -N4 | sort | uniq -c
  44124 ("Afghanistan",25500100)
   3608 ("Algeria",38700000)
  21878 ("Angola",20609294)
   7286 ("Argentina",42669500)
  43784 ("Australia",23488300)
  43850 ("Bangladesh",152518015)
 175573 ("Brazil",202550000)
  10967 ("Burkina Faso",17322796)
  21917 ("Burma",53259000)
  11038 ("Cameroon",20386799)
  44206 ("Canada",35344962)
  22145 ("Chile",17620000)
 350319 ("China",1364410000)
  44089 ("Colombia",47588000)
  88017 ("Democratic Republic of the Congo",67514000)
  43873 ("Egypt",86446400)
  43883 ("Ethiopia",86613986)
  43951 ("France",65885000)
  21904 ("Germany",80716000)
  10922 ("Ghana",24658823)
   1819 ("Guatemala",15806675)
 175428 ("India",1244010000)
  87465 ("Indonesia",247424598)
 175801 ("Iran",77427000)
  87408 ("Iraq",34035000)
  44074 ("Italy",60021955)
  21832 ("Ivory Coast",23202000)
  87582 ("Japan",127140000)
  21834 ("Kazakhstan",17221000)
  21941 ("Kenya",44354000)
  22089 ("Madagascar",21263403)
   3808 ("Malawi",16363000)
  88224 ("Malaysia",30113000)
  88199 ("Mexico",119713203)
  44043 ("Morocco",33257800)
  87250 ("Mozambique",23700715)
  21800 ("Nepal",26494504)
  11070 ("Netherlands",16850300)
  10912 ("Niger",17138707)
  88108 ("Nigeria",173615000)
  22105 ("North Korea",24895000)
  87717 ("Pakistan",186435000)
  43682 ("Peru",30475144)
  44041 ("Philippines",99554400)
 175561 ("Poland",38496000)
  22067 ("Romania",20121641)
 176142 ("Russia",143700000)
  43621 ("Saudi Arabia",29994272)
  87856 ("South Africa",52981991)
  44119 ("South Korea",50219669)
  21972 ("Spain",46609700)
  43970 ("Sri Lanka",20277597)
  88045 ("Sudan",37289406)
  43854 ("Syria",21898000)
  43920 ("Taiwan",23382948)
  21944 ("Tanzania",44928923)
  87904 ("Thailand",64456700)
  87425 ("Turkey",76667864)
  87592 ("Uganda",35357000)
  43696 ("Ukraine",45395604)
  43791 ("United Kingdom",63705000)
 175480 ("United States",318030000)
  21846 ("Uzbekistan",30183400)
  43913 ("Venezuela",28946101)
  87996 ("Vietnam",89708900)
  21720 ("Yemen",25235000)

real	0m51.041s
user	1m32.100s
sys	0m24.013s

Results

As expected, the distribution in the second algorithm is more uniform while preserving a certain bias towards the higher population entries.
The first algorithm has a much stronger bias.
The second algorithm scales reliably with larger datasets but can never complete until the whole tree has been walked, whilst the first algorithm can (50% chance) be 0(1), but also worst-case O(n).
Doing this has brought no new information to the computer science world, but it was fun!
	module Algorithm1 where

	import Countries
	import Data.List
	import System.Random

	-- Get random country, weighing the population more
	getCountry :: StdGen -> [Country] -> Country
	getCountry gen unsorted = helper gen countries
	where
	countries = sort unsorted
	helper _ [c] = c
	helper g (x:xs) =
	let (y, g2) = random g :: (Bool, StdGen)
	in if y then x else helper g2 xs
	module Algorithm2 where

	import Countries (Country)
	import Data.List
	import System.Random

	-- Get random country, weighing the population more
	getCountry :: StdGen -> [Country] -> Country
	getCountry gen unsorted = helper gen countries
	where
	countries = sort unsorted
	helper :: StdGen -> [Country] -> Country
	helper _ [c] = c
	helper g (a:b:[]) = fst $ ratherFirstThanLast g (a, b)
	helper g l =
	let halves = halveList l
	(pick, g2) = ratherFirstThanLast g halves
	in helper g2 pick

	-- Returns either the first or second element of the tuple,
	-- but more likely the first than the second
	ratherFirstThanLast :: StdGen -> (a, a) -> (a, StdGen)
	ratherFirstThanLast gen (a, b) =
	-- Get a number from 0-2...
	let (n, newGen) = randomR (0,2) gen :: (Int, StdGen)
	-- then return first element with 2:1 probability
	in if n == 0 then (b, newGen) else (a, newGen)

	-- Split a list in half
	halveList :: [a] -> ([a], [a])
	halveList l = splitAt (length l `div` 2) l
	-- List of many countries with according population
	module Countries where

	newtype Country = Country { unCountry :: (String, Int) }
	deriving (Eq, Show)

	-- Ordering by second tuple field
	-- Unsound but pragmatic instance :-)
	instance Ord Country where
	compare c1 c2 = compare (snd $ unCountry c2)
	(snd $ unCountry c1)

	countries :: [Country]
	countries = map Country
	[("China", 1364410000)
	,("India", 1244010000)
	,("United States", 318030000)
	,("Indonesia", 247424598)
	,("Brazil", 202550000)
	,("Pakistan", 186435000)
	,("Nigeria", 173615000)
	,("Bangladesh", 152518015)
	,("Russia", 143700000)
	,("Japan", 127140000)
	,("Mexico", 119713203)
	,("Philippines", 99554400)
	,("Vietnam", 89708900)
	,("Ethiopia", 86613986)
	,("Egypt", 86446400)
	,("Germany", 80716000)
	,("Iran", 77427000)
	,("Turkey", 76667864)
	,("Democratic Republic of the Congo", 67514000)
	,("France", 65885000)
	,("Thailand", 64456700)
	,("United Kingdom", 63705000)
	,("Italy", 60021955)
	,("Burma", 53259000)
	,("South Africa", 52981991)
	,("South Korea", 50219669)
	,("Colombia", 47588000)
	,("Spain", 46609700)
	,("Ukraine", 45395604)
	,("Tanzania", 44928923)
	,("Kenya", 44354000)
	,("Argentina", 42669500)
	,("Algeria", 38700000)
	,("Poland", 38496000)
	,("Sudan", 37289406)
	,("Uganda", 35357000)
	,("Canada", 35344962)
	,("Iraq", 34035000)
	,("Morocco", 33257800)
	,("Peru", 30475144)
	,("Uzbekistan", 30183400)
	,("Malaysia", 30113000)
	,("Saudi Arabia", 29994272)
	,("Venezuela", 28946101)
	,("Nepal", 26494504)
	,("Afghanistan", 25500100)
	,("Yemen", 25235000)
	,("North Korea", 24895000)
	,("Ghana", 24658823)
	,("Mozambique", 23700715)
	,("Australia", 23488300)
	,("Taiwan", 23382948)
	,("Ivory Coast", 23202000)
	,("Syria", 21898000)
	,("Madagascar", 21263403)
	,("Angola", 20609294)
	,("Cameroon", 20386799)
	,("Sri Lanka", 20277597)
	,("Romania", 20121641)
	,("Chile", 17620000)
	,("Burkina Faso", 17322796)
	,("Kazakhstan", 17221000)
	,("Niger", 17138707)
	,("Netherlands", 16850300)
	,("Malawi", 16363000)
	,("Guatemala", 15806675)]
	module Main where

	import Countries -- source: Wikipedia
	import Algorithm2 -- Change algorithm here

	import Control.Concurrent
	import Control.Concurrent.MVar
	import Control.Monad
	import System.Random
	import System.Environment


	-- Run it (assuming 4 cores). Bruteforce MVar concurrent
	-- thread waiting
	main :: IO ()
	main = do
	(n:_) <- fmap (map read) getArgs
	let nc = n `div` 4
	done1 <- newEmptyMVar
	done2 <- newEmptyMVar
	done3 <- newEmptyMVar
	flip forkFinally (\_ -> putMVar done1 ()) $ replicateM_ nc run
	flip forkFinally (\_ -> putMVar done2 ()) $ replicateM_ nc run
	flip forkFinally (\_ -> putMVar done3 ()) $ replicateM_ nc run
	replicateM_ nc run
	takeMVar done1
	takeMVar done2
	takeMVar done3

	run :: IO ()
	run = do
	gen <- newStdGen
	print $ unCountry $ getCountry gen countries