Created
January 18, 2013 06:50
-
-
Save esammer/4562848 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* A simple example of user demographic dataset generation in scala. | |
* | |
* This highlights the following scala bits: | |
* - Java integration | |
* - Annotations | |
* - Tail recursion optimization | |
* - Enums | |
* - Vals vs. vars | |
* - Working with singleton objects | |
* - Case classes | |
* - Type inference | |
* - Function values / literals / whatever you want to call them | |
* - For loops and generators | |
* - Pattern matching with: | |
* - Wildcards for members we don't care about | |
* - Projection of members | |
* - Guards on case statements | |
* - Closures | |
* - String interpolation of expressions | |
* - A neat basis for generating data that conforms to rules | |
* | |
* None of these features are used in a contrived or oh-look-how-smart-i-am way,
* so it should be helpful to those looking to play around with scala. | |
* | |
* Run this with: | |
* scala usergen.scala | |
*/ | |
import java.util.Random | |
import java.util.UUID | |
import java.lang.Math | |
import scala.annotation.tailrec | |
/* | |
* Gender enum for users. | |
*/ | |
object Gender extends Enumeration {
  // All four members, declared in one statement in their original order.
  val MALE, FEMALE, OTHER, UNKNOWN = Value

  /*
   * The enumeration's values materialized once as a List[Gender.Value], so
   * that a random member can be picked by index.
   */
  val valuesAsList: List[Value] = values.toList
}
/* | |
* User demographic entity. | |
*/ | |
/**
 * Demographic record for a single generated user.
 *
 * @param id     identifier string for the user
 * @param age    age of the user
 * @param gender one of the Gender enumeration values
 * @param income income of the user
 */
case class User(id: String, age: Int, gender: Gender.Value, income: Int)
/* | |
* Given a Random instance, generate a User with:
* - A random UUID as the id (taken from UUID.randomUUID(), not from the Random)
* - A random age from 0 to 99 (uniform dist)
* - One of the four genders (uniform dist)
* - A random income with a cap of 300K (uniform dist)
* | |
* We do not worry about the significance of the values here. Instead, we'll | |
* wrap this function in a function that applies a predicate to the generated | |
* users and tests them for acceptance. This is CPU-wasteful, but much simpler | |
* to maintain (and get the distributions right for dummies like me). | |
*/ | |
/**
 * Builds one User with uniformly distributed random attributes.
 *
 * The id comes from UUID.randomUUID() rather than from `rand`, so ids differ
 * between runs even when `rand` is seeded. Values are drawn from `rand` in the
 * order age, gender, income.
 *
 * @param rand source of randomness for age, gender and income
 * @return a new User with age in [0, 100) and income in [0, 300000)
 */
def generate(rand: Random): User = {
  // Random.nextInt(bound) already returns a value in [0, bound), so wrapping
  // it in Math.abs adds nothing.
  val age = rand.nextInt(100)
  // Bound the index by the size of the list that is actually being indexed.
  val gender = Gender.valuesAsList(rand.nextInt(Gender.valuesAsList.size))
  val income = rand.nextInt(300000)
  User(id = UUID.randomUUID().toString, age = age, gender = gender, income = income)
}
/* | |
* Given a Random instance, apply a predicate to generated users until one | |
* meets whatever criteria the user chooses. The predicate is just a plain old | |
* function that takes a User and returns true or false. Users will | |
* continuously be generated until the predicate returns true. For highly | |
* selective predicates, this function can take a while. | |
* | |
* (This function is annotated with @tailrec so scala will refuse to compile it
* if tail recursion optimization is not possible, so depth shouldn't be a problem.)
*/ | |
/**
 * Generates users until one satisfies `predicate`, then returns that user.
 *
 * There is no retry limit: if the predicate never returns true, this never
 * returns.
 *
 * @param rand      source of randomness handed to `generate`
 * @param predicate acceptance test applied to each freshly generated user
 * @return the first generated user for which `predicate` returns true
 */
@tailrec
def selectiveGenerate(rand: Random, predicate: User => Boolean): User = {
  val candidate = generate(rand)
  // The recursive call is the last thing this branch does, which keeps it in
  // tail position as @tailrec requires.
  if (!predicate(candidate)) selectiveGenerate(rand, predicate)
  else candidate
}
/*
 * Driver: produce a fixed number of accepted users, print each one, and
 * report how many candidates had to be generated to get them.
 */
val rand = new Random(53) // fixed seed
val userCount = 100       // number of accepted users to produce
var attempts: Int = 0     // incremented by the predicate closure below

/*
 * `until` is exclusive, so the loop body runs exactly userCount times and the
 * number of users printed agrees with the numerator of the match rate below.
 * The loop index itself is not needed, hence the wildcard.
 */
for (_ <- 0 until userCount) {
  val user = selectiveGenerate(
    rand,
    u => { // this is an anonymous function. The type of `u` is inferred.
      attempts += 1 // count the number of attempts to match a user for fun
      /*
       * We use scala's pattern matching, with guards (the ifs), and some
       * simple float value tests to decide which users to take. A lowercase
       * name inside a pattern captures that field and binds it to a variable;
       * `income` below binds the User's income (the fourth argument to the
       * User constructor). This is the short form of `income @ _`.
       *
       * Cases are tried in order; the first match wins.
       */
      u match {
        case User(_, age, _, income) if age < 18 && income > 8000 => false
        case User(_, _, Gender.MALE, income) if income > 14000 && income < 20000 => rand.nextFloat() < 0.2
        case User(_, _, Gender.MALE, income) if income > 14000 && income < 80000 => rand.nextFloat() < 0.4
        case User(_, _, Gender.FEMALE, income) if income > 14000 && income < 40000 => rand.nextFloat() < 0.1
        case User(_, _, Gender.FEMALE, income) if income > 14000 && income < 120000 => rand.nextFloat() < 0.3
        case _ => false
      }
    }
  )
  /*
   * Doing something with this data is left as an exercise to the reader.
   */
  println(s"user:${user}")
}

// Fraction of generated candidates that were accepted.
println(s"attempts:${attempts} matchRate:${userCount.toDouble / attempts}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment