James Allen jlln

## AnimalRescue.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                jlln
                / AnimalRescue.ipynb
            
            
              Last active
              June 27, 2016 05:55
            
              
                Exploratory data analysis for the Kaggle Shelter Animal Outcome Project (https://www.kaggle.com/c/shelter-animal-outcomes). Amongst other things, I tried using the classifications of dogs according to The Kennel Club to predict outcomes.
              
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## RHCPropensity.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                jlln
                / RHCPropensity.ipynb
            
            
              Last active
              September 20, 2016 04:18
            
              
                RHC Propensity Analysis
              
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Tigers.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                jlln
                / Tigers.ipynb
            
            
              Created
              May 10, 2016 05:35
            
              
                Hypergeometric sampling of Tigers.
              
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## group_fractions_pandas.py
def groupCountFractionals(dataframe,target,outer):
    '''
    dataframe: a pandas dataframe
    target: a string corresponding to the column of interest in the dataframe
    outer: a list of the columns by which the counts should be conditioned

    Returns the fraction of target_criteria_group / outer_criteria_group counts.

    Be mindful to take group sizes (Outer Count) into consideration.
    As outer count gets smaller, the fraction value

## spark_group_fraction.scala
def groupOutcomeFractions(df:DataFrame,outcome:String,outer_group_criteria:Seq[String]):DataFrame = {
    df.registerTempTable("df")
    val count_variable:String = outer_group_criteria.head
    val inner_group_criteria = outer_group_criteria :+ outcome
    val outer_group_query = "SELECT "+ outer_group_criteria.mkString(" , ") +s", COUNT($count_variable) AS outer_count FROM df GROUP BY  " + outer_group_criteria.mkString(" , ")
    val outer_count = sqlContext.sql(outer_group_query)
    val inner_count_query = "SELECT "+ inner_group_criteria.mkString(" , ") +s", COUNT($count_variable) AS inner_count FROM df GROUP BY  " + inner_group_criteria.mkString(" , ")
    val inner_count = sqlContext.sql(inner_count_query)
    val combined_counts = inner_count.join(outer_count,outer_group_criteria)


## spark_df_pivot.scala
// Group the rows by (Country, ElapsedMonths) and, for every group, build a
// Map from "CAMEO Code" to the row's "count" value.
val cameo_maps = event_data_ag1.rdd
    .groupBy { row =>
        val country = row.getAs[String]("Country")
        val months = row.getAs[Int]("ElapsedMonths")
        (country, months)
    }
    .map { case (groupKey, groupRows) =>
        val countsByCode = groupRows.map { r =>
            (r.getAs[Int]("CAMEO Code"), r.getAs[Long]("count"))
        }.toMap
        (groupKey, countsByCode)
    }
// Broadcast the sorted array of every CAMEO code that appears as a key in any
// of the per-group maps.
val cameos = sc.broadcast(
    cameo_maps
        .map { case (_, countsByCode) => countsByCode.keySet }
        .reduce((left, right) => left union right)
        .toArray
        .sorted
)

val cameo_arrays = cameo_maps.map{
    case ((country,total_months),cameo_map) => (country,total_months) -> cameos.value.map(cameo_map.getOrElse(_,0L))

## spark OneHot encoder.scala
import scala.collection.JavaConverters._
import org.apache.spark.sql.types.{StructType,StructField,StringType}
import org.apache.spark.sql.Row


/** Builds an n-by-n matrix of strings holding "1" on the main diagonal and "0" everywhere else. */
def identityMatrix(n:Int):Array[Array[String]]={
    Array.tabulate(n) { row =>
        Array.tabulate(n) { col =>
            if (row == col) "1" else "0"
        }
    }
}
def encodeStringOneHot(table:org.apache.spark.sql.DataFrame,column:String) = {
    //Accepts the dataframe and the target column name. Returns a new dataframe in which the target column has been replaced with a one-hot/dummy encoding.
    table.registerTempTable("temp")

## separator.py
def splitDataFrameList(df,target_column,separator):
    ''' df = dataframe to split,
    target_column = the column containing the values to split
    separator = the symbol used to perform the split

    returns: a dataframe with each entry for the target column separated, with each element moved into a new row.
    The values in the other columns are duplicated across the newly divided rows.
    '''
    def splitListToRows(row,row_accumulator,target_column,separator):
        split_row = row[target_column].split(separator)
	def groupCountFractionals(dataframe,target,outer):
	'''
	dataframe: a pandas dataframe
	target: a string corresponding to the column of interest in the dataframe
	outer: a list of the columns by which the counts should be conditioned

	Returns the fraction of target_criteria_group / outer_criteria_group counts.

	Be mindful to take group sizes (Outer Count) into consideration.
	As outer count gets smaller, the fraction value
	def groupOutcomeFractions(df:DataFrame,outcome:String,outer_group_criteria:Seq[String]):DataFrame = {
	df.registerTempTable("df")
	val count_variable:String = outer_group_criteria.head
	val inner_group_criteria = outer_group_criteria :+ outcome
	val outer_group_query = "SELECT "+ outer_group_criteria.mkString(" , ") +s", COUNT($count_variable) AS outer_count FROM df GROUP BY " + outer_group_criteria.mkString(" , ")
	val outer_count = sqlContext.sql(outer_group_query)
	val inner_count_query = "SELECT "+ inner_group_criteria.mkString(" , ") +s", COUNT($count_variable) AS inner_count FROM df GROUP BY " + inner_group_criteria.mkString(" , ")
	val inner_count = sqlContext.sql(inner_count_query)
	val combined_counts = inner_count.join(outer_count,outer_group_criteria)
	val cameo_maps = event_data_ag1.rdd
	.groupBy(x=> (x.getAs[String]("Country"),x.getAs[Int]("ElapsedMonths")))
	.map { case (group_features,codes) => group_features -> codes
	.map {code => code.getAs[Int]("CAMEO Code") -> code.getAs[Long]("count") }
	.toMap
	}
	val cameos = sc.broadcast(cameo_maps.map(_._2.keySet).reduce(_ union _).toArray.sorted)

	val cameo_arrays = cameo_maps.map{
	case ((country,total_months),cameo_map) => (country,total_months) -> cameos.value.map(cameo_map.getOrElse(_,0L))
	import scala.collection.JavaConverters._
	import org.apache.spark.sql.types.{StructType,StructField,StringType}
	import org.apache.spark.sql.Row



	def identityMatrix(n:Int):Array[Array[String]]=Array.tabulate(n,n)((x,y) => if(x==y) "1" else "0")
	def encodeStringOneHot(table:org.apache.spark.sql.DataFrame,column:String) = {
	//Accepts the dataframe and the target column name. Returns a new dataframe in which the target column has been replaced with a one-hot/dummy encoding.
	table.registerTempTable("temp")
	def splitDataFrameList(df,target_column,separator):
	''' df = dataframe to split,
	target_column = the column containing the values to split
	separator = the symbol used to perform the split

	returns: a dataframe with each entry for the target column separated, with each element moved into a new row.
	The values in the other columns are duplicated across the newly divided rows.
	'''
	def splitListToRows(row,row_accumulator,target_column,separator):
	split_row = row[target_column].split(separator)