Michael Reynolds reynoldsm88

  • Two Six Labs
  • New York City
import java.util.HashMap;
import java.util.Map;

public class MySuperClass {
    private Map<String, String> values;

    // instance initializer block in the parent class
    {
        values = new HashMap<String, String>();
        values.put( "value", "super" );
    }
}

Spark internals through code

Nothing gives you more detail about Spark internals than actually reading its source code. In addition, you get to learn many design techniques and improve your Scala coding skills. These are the random notes I take while reading the Spark code. The best way to follow them is to load the Spark source into an IDE, e.g. IntelliJ, and navigate the code alongside the notes.

Genesis - creation of a Spark cluster

The scripts for creating a standalone Spark cluster are start-master.sh and start-slave.sh. Read them carefully and you can see that the two scripts are nearly identical except for the value of the $CLASS variable. For start-master.sh the value is CLASS="org.apache.spark.deploy.master.Master", while the value for start-slave.sh is shown below with more context.

# NOTE: This exact class name is matched downstream by SparkSubmit.
# Any changes need to be reflected there.
CLASS="org.apache.spark.deploy.worker.Worker"
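
For context, once the Master and Worker daemons started by these scripts are up, an application attaches to the standalone cluster through the master's spark:// URL. A minimal sketch follows; the host name and app name are placeholders (not from these notes), and 7077 is the default standalone master port.

import org.apache.spark.{SparkConf, SparkContext}

// Attach a small application to the standalone cluster started above.
object ConnectToStandalone {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("genesis-example")                  // placeholder app name
      .setMaster("spark://spark-master-host:7077")    // placeholder host, default port
    val sc = new SparkContext(conf)
    println(sc.parallelize(1 to 10).sum())            // trivial job to confirm the cluster works
    sc.stop()
  }
}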
@reynoldsm88
reynoldsm88 / build.sbt
Created June 22, 2017 18:30 — forked from seratch/build.sbt
Scala School - Testing with specs2 examples
organization := "net.seratch"
name := "sandbox"
version := "0.1"
scalaVersion := "2.9.1"
libraryDependencies ++= Seq(
  "junit" % "junit" % "4.9" withSources()
)
@reynoldsm88
reynoldsm88 / gist:94e7244554f3d59877fec10eb26a5f59
Created September 6, 2017 21:03
Separate certain modules into different profiles
<build>
  <defaultGoal>install</defaultGoal>
  <plugins>
    <plugin>
      <artifactId>maven-compiler-plugin</artifactId>
      <configuration>
        <source>${java.version}</source>
        <target>${java.version}</target>
      </configuration>
    </plugin>
@reynoldsm88
reynoldsm88 / get_job_status.sh
Created December 4, 2017 21:38 — forked from arturmkrtchyan/get_job_status.sh
Apache Spark Hidden REST API
curl http://spark-cluster-ip:6066/v1/submissions/status/driver-20151008145126-0000
<pluginManagement>
  <plugins>
    <!-- This plugin's configuration is used to store Eclipse m2e settings only.
         It has no influence on the Maven build itself. -->
    <plugin>
      <groupId>org.eclipse.m2e</groupId>
      <artifactId>lifecycle-mapping</artifactId>
      <version>1.0.0</version>
      <configuration>
        <lifecycleMappingMetadata>
# find the pod created by the deployment config (the <xyz> placeholder is the pod name prefix)
POD=$(oc get pod | grep <xyz> | awk '{print $1}')
# attach a 1G persistent volume claim to the deployment config, mounted at /remote/data
oc set volume <DC> --add --claim-name=<name> --type pvc --claim-size=1G --mount-path /remote/data
# roll out the latest deployment so the pod picks up the new volume
oc rollout latest <DC>
# copy local data into the mounted volume on the pod
oc rsync /local/data/ $POD:/remote/data
package org.apache.spark.countSerDe
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.expressions.MutableAggregationBuffer
import org.apache.spark.sql.expressions.UserDefinedAggregateFunction
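
The imports above point at a UserDefinedAggregateFunction. As a rough sketch of that API, reusing those imports, a count-style UDAF could look like the following; the class name and schemas are illustrative, not the gist's actual countSerDe implementation.

// Counts input rows: the buffer holds a single Long that each update increments.
class CountUDAF extends UserDefinedAggregateFunction {
  override def inputSchema: StructType = new StructType().add( "value", StringType )
  override def bufferSchema: StructType = new StructType().add( "count", LongType )
  override def dataType: DataType = LongType
  override def deterministic: Boolean = true

  override def initialize( buffer: MutableAggregationBuffer ): Unit = { buffer( 0 ) = 0L }
  override def update( buffer: MutableAggregationBuffer, input: Row ): Unit = {
    buffer( 0 ) = buffer.getLong( 0 ) + 1L
  }
  override def merge( buffer1: MutableAggregationBuffer, buffer2: Row ): Unit = {
    buffer1( 0 ) = buffer1.getLong( 0 ) + buffer2.getLong( 0 )
  }
  override def evaluate( buffer: Row ): Any = buffer.getLong( 0 )
}

A UDAF like this would typically be registered with spark.udf.register and then used in SQL or DataFrame aggregations.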
lazy val root = ( project in file( "." ) ).settings(
  libraryDependencies ++= elastic4s
    ++ scalaTest
    ++ betterFiles
    ++ commonsTestBase,
  excludeDependencies ++= Seq( ExclusionRule( "org.slf4j", "slf4j-log4j12" ) )
)
lazy val root = ( project in file( "." ) ).settings(
  libraryDependencies ++= clulabProcessors
    ++ kafka
    ++ logging
    ++ scalaTest
    ++ embeddedKafka
    ++ scalaMock,
  // the mess below is to resolve conflicting versions of various dependencies
  excludeDependencies ++= Seq(
    ExclusionRule( "org.slf4j", "slf4j-log4j12" ),
    ExclusionRule( "javax.ws.rs", "javax.ws.rs-api" ), // out of date because of oracle jee debacle
    ExclusionRule(