Last active
June 13, 2017 03:47
-
-
Save krishnanraman/4696053 to your computer and use it in GitHub Desktop.
Goal: Find THE FASTEST GROWING COUNTY IN THE UNITED STATES over the 1969-2011 timeframe.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Goal: Process the 12 million plus records | |
from: http://seer.cancer.gov/popdata/download.html | |
using: a Scala API atop Cascading, aka SCALDING ( Inventors: Avi Bryant, Oscar Boykin, Argyris ) | |
to find: | |
THE FASTEST GROWING COUNTY IN THE UNITED STATES over the 1969-2011 timeframe. | |
----------------------------------------------------------------------------- | |
RESULTS: Scroll to the very bottom. | |
First, the scalding source... | |
--- | |
import com.twitter.scalding.{Dsl,Source,TextLine,Job,Args,Tsv,RichPipe} | |
import cascading.pipe.Pipe | |
/**
 * Text-line source that parses each fixed-width record of the population file
 * at path `p` into typed fields.
 *
 * Every input line is sliced by character offset and emitted as the tuple
 * ('year, 'state, 'fips, 'isWhite, 'isBlack, 'isHispanic, 'isMale, 'age, 'population).
 * A line whose numeric columns do not parse as integers makes `toInt` throw.
 */
case class USPopulationSource(override val p:String) extends TextLine(p) {

  /** Replaces the raw 'line field with the parsed, typed fields listed above. */
  override def transformForRead(pipe : Pipe) = {
    import Dsl._

    RichPipe(pipe).mapTo('line->('year,'state,'fips,'isWhite,'isBlack, 'isHispanic,'isMale,'age,'population)) {
      record:String =>
        // Fixed-width column starting at `offset` and spanning `width` characters.
        def field(offset: Int, width: Int): String = record.slice(offset, offset + width)
        // Single-character numeric code at `offset`.
        def code(offset: Int): Int = field(offset, 1).toInt

        val year: Int = field(0, 4).toInt
        val state: String = field(4, 2)
        val fips: String = field(6, 5)

        // One column drives both race flags: 1 sets isWhite, 2 sets isBlack.
        val race: Int = code(13)
        val isWhite: Boolean = race == 1
        val isBlack: Boolean = race == 2

        val isHispanic: Boolean = code(14) == 1
        val isMale: Boolean = code(15) == 1

        // Two-digit age-group code mapped to 5 * (code - 1);
        // presumably the lower bound of a 5-year age bracket -- confirm against the data dictionary.
        val age: Int = 5 * (field(16, 2).toInt - 1)
        val population: Int = field(18, 8).toInt

        (year, state, fips, isWhite, isBlack, isHispanic, isMale, age, population)
    }
  }
}
/**
 * Scalding job that ranks counties by relative population growth between 1969 and 2011.
 *
 * Steps:
 *  1. Read the parsed population records from "pop.txt".
 *  2. Sum 'population per ('year, 'fips).
 *  3. Per 'fips, fold the yearly totals in ascending year order into a growth ratio
 *     (population2011 - baseline) / baseline.
 *  4. Join with the FIPS lookup built from "US_FIPS_Codes.csv" and write the rows,
 *     sorted by 'growth, to "growth.txt" as TSV.
 */
class PopulationStats(args:Args) extends Job(args) {

  val people = USPopulationSource("pop.txt").read

  // Lookup of ('state, 'county, 'fips); 'fips is columns 2 and 3 of the CSV concatenated.
  // NOTE(review): assumes every CSV line has at least four comma-separated columns.
  val fipspipe = TextLine("US_FIPS_Codes.csv").read.mapTo('line->('state,'county,'fips)) {
    line:String =>
      val arr = line.split(",")
      (arr(0), arr(1), arr(2) + arr(3))
  }

  people
    .groupBy('year, 'fips) {
      group => group.plus[Int]('population->'population)
    }
    .groupBy('fips) {
      // Accumulator: (baseline population, growth ratio).
      type X = (Int,Double)
      // Input per record: (population, year).
      type T = (Int,Int)
      val init: X = (0, 0.0d)

      // foldLeft[X,T](fieldDef : (Fields,Fields))(init : X)(fn : (X,T) => X)
      // The fold is order-dependent: the baseline year must be seen first and 2011 last,
      // and the catch-all case resets growth to 0.0. Sorting by 'year makes that order
      // explicit instead of relying on whatever order the records happen to arrive in.
      group => group.sortBy('year).foldLeft[X,T]( ('population,'year) -> ('dummy,'growth))(init) {
        (x:X, t:T) =>
          val (population, year) = t
          val (dummy, _) = x
          year match {
            // 1969 establishes the baseline.
            case 1969 => (population, 0.0d)
            // 2011 closes the window: growth relative to the baseline.
            // NOTE(review): a baseline of 0 yields Infinity/NaN here, as in the original.
            case 2011 => (population, (population - dummy) / (dummy + 0.0d))
            // Any other year: adopt it as baseline only if none has been recorded yet.
            case _    => if (dummy == 0) (population, 0.0d) else (dummy, 0.0d)
          }
      }
    }
    .project('fips,'growth)
    .joinWithSmaller(('fips-> 'fips), fipspipe)
    .groupAll(_.sortBy('growth))
    .write(Tsv("growth.txt"))
}
------ | |
RESULTS: | |
13053 -0.5959766162310867 Georgia Chattahoochee | |
38083 -0.5825892857142857 North Dakota Sheridan | |
38013 -0.575839766325892 North Dakota Burke | |
54047 -0.5703608502224419 West Virginia McDowell | |
48101 -0.5474033816425121 Texas Cottle | |
38047 -0.5436781609195402 North Dakota Logan | |
38023 -0.5362287210824968 North Dakota Divide | |
38087 -0.5334632878492528 North Dakota Slope | |
38037 -0.5310054184226369 North Dakota Grant | |
48301 -0.5276381909547738 Texas Loving | |
---- some 3000 counties here ---- | |
13117 10.637015231025215 Georgia Forsyth | |
12097 10.880533448053345 Florida Osceola | |
48397 10.907133440749963 Texas Rockwall | |
13135 11.063892016788289 Georgia Gwinnett | |
48157 11.168507788849015 Texas Fort Bend | |
48491 11.323119312014695 Texas Williamson | |
08117 11.39344262295082 Colorado Summit | |
48085 11.507137247655564 Texas Collin | |
12035 21.135939986360537 Florida Flagler | |
08035 36.49576488706366 Colorado Douglas | |
CONCLUSION: | |
Over the 1969-2011 timeframe, Douglas County, Colorado experienced the highest population growth ( roughly 3650% ) | |
The top-10 counties with the most pop growth are shown above - they are in Texas, Georgia, Florida & Colorado. | |
The bottom-10 counties are also shown above - North Dakota expectedly hosts 6 of the bottom 10; the rest are in Georgia, West Virginia and Texas. | |
Execution Time: 29 seconds, in Scalding local mode, on my powermac | |
Scalding FTW! |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment