krishnanraman/zipcodes of wealthy elite

## gistfile1.scala
/*
Goal: Use Scalding to datamine the 2010 US Census data (kindly provided by @ElonAzoulay & @hmason), to find
Where do the WEALTHY WELL EDUCATED ELITE live ?
WEALTHY == house value quarter million, household income 150k
WELL EDUCATED == sort by edu, edu = (10 * Phd + 5 * MS + 1 * BS) score
*/
import com.twitter.scalding._
import cascading.tuple.Fields
import cascading.tap.SinkMode

class CensusJob(args : Args) extends Job(args) {

  val filename = "census_2010_acs.csv"

  val columns = Seq("postal_code","state","region","division","statefips","latitude","longitude",
    "land_area_sq_meters","total_population","population_density","housing_unit_count",
    "white_population","black_population","native_population","asian_population","hawaiian_population","other_population",
    "nonhispanic","nonhispanic_white","nonhispanic_black","nonhispanic_native","nonhispanic_asian","nonhispanic_hawaiian","nonhispanic_other",
    "hispanic","hispanic_white","hispanic_black","hispanic_native","hispanic_asian","hispanic_hawaiian","hispanic_other",
    "male_population","female_population",
    "medianage","medianage_male","medianage_female","median_house_value","households","avg_household_size","households_with_minors",
    "median_household_income","median_household_income_white","median_household_income_black","median_household_income_native","median_household_income_asian","median_household_income_hawaiian","median_household_income_other","median_household_income_nonhispanic","median_household_income_hispanic",
    "education_level_noschool_pct","education_level_4thgrade_pct","education_level_6thgrade_pct","education_level_8thgrade_pct","education_level_9thgrade_pct","education_level_10thgrade_pct","education_level_11thgrade_pct","education_level_12thgrade_pct","education_level_highschool_pct","education_level_college_lessthan1year_pct","education_level_college_morethan1year_pct","education_level_college_associates_pct","education_level_college_bachelors_pct","education_level_college_masters_pct","education_level_college_professionaldegree_pct","education_level_college_doctorate_pct","avg_education_years")
  val fields = new Fields(columns: _*)
  val input = Csv(filename,",", fields,false, false, "\"",SinkMode.REPLACE)
  val output = Tsv("output.tsv")

  type X = (String,String,Double, Long, Double)
  type Y = List[X]

  input
    .read
    // keep only those columns I care about
    .project('postal_code, 'state,'medianage, 'median_house_value, 'median_household_income, 'education_level_college_bachelors_pct, 'education_level_college_masters_pct, 'education_level_college_doctorate_pct)

    // houses worth 250k or more
    .filter('median_house_value) {
      x:Double => x > 250000.0d
    }

    // don't want to live in a retirement community
    .filter('medianage) {
      x:Double => x < 50
    }

    // atleast six figure household income, 2 parents * 75K = 150k
    .filter('median_household_income) {
      x:Long => x > 150000
    }

    // place a very high value on academia
    .map(('education_level_college_doctorate_pct, 'education_level_college_masters_pct, 'education_level_college_bachelors_pct) -> 'edu){
      x:(Double,Double,Double) => (10*x._1 + 5*x._2 + x._3 )
    }

    .project('postal_code, 'state, 'edu, 'median_house_value, 'median_household_income)
    .groupAll {

      val init: Y = Nil
      group =>
        group.foldLeft[Y,X] (('postal_code, 'state, 'edu, 'median_house_value, 'median_household_income) -> 'res)(init) {
          (a:Y, b:X) => a ++ List((b._1, b._2, b._3, b._4, b._5))
        }
    }.mapTo('* -> '*) {
      x: Y =>
      x
        .sortBy(y => -y._3)
        .groupBy(y => y._2)
        .map(kv => { val (x,y) = kv; x + " -> " + y.map( z => z._1).mkString(",") })
        .mkString("\n")
    }
    .write(output)
}

## zipcodes of wealthy elite
MA -> 02468,01741,01467,02481,01770,01776,02493,02030
NM -> 88040
IL -> 60043,60022,60521,60029
MO -> 63073,63005
WA -> 98039,98050
NJ -> 08558,08550,07043,08502,07046,07078,07028,07739,08836,07450,07945,07021,07930,07423,07458,07481,07979,07931,07722,07620
TX -> 77094,76092,76034
MD -> 20686,21153,20816,20817,20854,21737,21029,21036,20777,20818,21794,21738,20759,21405
CT -> 06870,06883,06820,06840,06880,06903,06890,06897,06878,06092
FL -> 32461
PA -> 19066,19085,19425
NH -> 03604
CA -> 94022,94024,95070,94027,94563,91108,92657,90272,94506,90077,90274,92145,94507,92861,92678
UT -> 84731
AL -> 36473
VA -> 22027,22101,22181,22182,22207,22066,22124,22039,22308,20124,23129
NY -> 10527,10518,10576,10510,10514,10597,10282,10007,10583,10804,11753,10577,10069,10506,11568,10504,11030,11576,11765,06390,10914
DE -> 19807


Takeaways:
0. Census Data as CSV here: http://tech.gilt.com/post/55205886124/making-cents-of-census-data, thanks to @ElonAzoulay

1. The census data has 66 fields, but Scala supports only upto Tuple22.
Have to use Cascading Fields for the Csv, cannot simply specify column names as a tuple.

2. The 32000+ zipcodes after being subject to filters reduces to ~100, which comfortably fits in a regular Scala List.
So we run a fold in the reducer & aggregate into a single scala list.
This is useful because we can do custom sorting, grouping, output formatting/pretty-printing much more comfortably in straight scala than with the DSL.

3. To find the mapping of zipcode to actual US city, use maps.google.com & type in the zipcode.
	/*
	Goal: Use Scalding to datamine the 2010 US Census data (kindly provided by @ElonAzoulay & @hmason), to find
	Where do the WEALTHY WELL EDUCATED ELITE live ?
	WEALTHY == house value quarter million, household income 150k
	WELL EDUCATED == sort by edu, edu = (10 * Phd + 5 * MS + 1 * BS) score
	*/
	import com.twitter.scalding._
	import cascading.tuple.Fields
	import cascading.tap.SinkMode

	class CensusJob(args : Args) extends Job(args) {

	val filename = "census_2010_acs.csv"

	val columns = Seq("postal_code","state","region","division","statefips","latitude","longitude",
	"land_area_sq_meters","total_population","population_density","housing_unit_count",
	"white_population","black_population","native_population","asian_population","hawaiian_population","other_population",
	"nonhispanic","nonhispanic_white","nonhispanic_black","nonhispanic_native","nonhispanic_asian","nonhispanic_hawaiian","nonhispanic_other",
	"hispanic","hispanic_white","hispanic_black","hispanic_native","hispanic_asian","hispanic_hawaiian","hispanic_other",
	"male_population","female_population",
	"medianage","medianage_male","medianage_female","median_house_value","households","avg_household_size","households_with_minors",
	"median_household_income","median_household_income_white","median_household_income_black","median_household_income_native","median_household_income_asian","median_household_income_hawaiian","median_household_income_other","median_household_income_nonhispanic","median_household_income_hispanic",
	"education_level_noschool_pct","education_level_4thgrade_pct","education_level_6thgrade_pct","education_level_8thgrade_pct","education_level_9thgrade_pct","education_level_10thgrade_pct","education_level_11thgrade_pct","education_level_12thgrade_pct","education_level_highschool_pct","education_level_college_lessthan1year_pct","education_level_college_morethan1year_pct","education_level_college_associates_pct","education_level_college_bachelors_pct","education_level_college_masters_pct","education_level_college_professionaldegree_pct","education_level_college_doctorate_pct","avg_education_years")
	val fields = new Fields(columns: _*)
	val input = Csv(filename,",", fields,false, false, "\"",SinkMode.REPLACE)
	val output = Tsv("output.tsv")

	type X = (String,String,Double, Long, Double)
	type Y = List[X]

	input
	.read
	// keep only those columns I care about
	.project('postal_code, 'state,'medianage, 'median_house_value, 'median_household_income, 'education_level_college_bachelors_pct, 'education_level_college_masters_pct, 'education_level_college_doctorate_pct)

	// houses worth 250k or more
	.filter('median_house_value) {
	x:Double => x > 250000.0d
	}

	// don't want to live in a retirement community
	.filter('medianage) {
	x:Double => x < 50
	}

	// atleast six figure household income, 2 parents * 75K = 150k
	.filter('median_household_income) {
	x:Long => x > 150000
	}

	// place a very high value on academia
	.map(('education_level_college_doctorate_pct, 'education_level_college_masters_pct, 'education_level_college_bachelors_pct) -> 'edu){
	x:(Double,Double,Double) => (10x._1 + 5x._2 + x._3 )
	}

	.project('postal_code, 'state, 'edu, 'median_house_value, 'median_household_income)
	.groupAll {

	val init: Y = Nil
	group =>
	group.foldLeft[Y,X] (('postal_code, 'state, 'edu, 'median_house_value, 'median_household_income) -> 'res)(init) {
	(a:Y, b:X) => a ++ List((b._1, b._2, b._3, b._4, b._5))
	}
	}.mapTo('* -> '*) {
	x: Y =>
	x
	.sortBy(y => -y._3)
	.groupBy(y => y._2)
	.map(kv => { val (x,y) = kv; x + " -> " + y.map( z => z._1).mkString(",") })
	.mkString("\n")
	}
	.write(output)
	}
	MA -> 02468,01741,01467,02481,01770,01776,02493,02030
	NM -> 88040
	IL -> 60043,60022,60521,60029
	MO -> 63073,63005
	WA -> 98039,98050
	NJ -> 08558,08550,07043,08502,07046,07078,07028,07739,08836,07450,07945,07021,07930,07423,07458,07481,07979,07931,07722,07620
	TX -> 77094,76092,76034
	MD -> 20686,21153,20816,20817,20854,21737,21029,21036,20777,20818,21794,21738,20759,21405
	CT -> 06870,06883,06820,06840,06880,06903,06890,06897,06878,06092
	FL -> 32461
	PA -> 19066,19085,19425
	NH -> 03604
	CA -> 94022,94024,95070,94027,94563,91108,92657,90272,94506,90077,90274,92145,94507,92861,92678
	UT -> 84731
	AL -> 36473
	VA -> 22027,22101,22181,22182,22207,22066,22124,22039,22308,20124,23129
	NY -> 10527,10518,10576,10510,10514,10597,10282,10007,10583,10804,11753,10577,10069,10506,11568,10504,11030,11576,11765,06390,10914
	DE -> 19807



	Takeaways:
	0. Census Data as CSV here: http://tech.gilt.com/post/55205886124/making-cents-of-census-data, thanks to @ElonAzoulay

	1. The census data has 66 fields, but Scala supports only upto Tuple22.
	Have to use Cascading Fields for the Csv, cannot simply specify column names as a tuple.

	2. The 32000+ zipcodes after being subject to filters reduces to ~100, which comfortably fits in a regular Scala List.
	So we run a fold in the reducer & aggregate into a single scala list.
	This is useful because we can do custom sorting, grouping, output formatting/pretty-printing much more comfortably in straight scala than with the DSL.

	3. To find the mapping of zipcode to actual US city, use maps.google.com & type in the zipcode.