Skip to content

Instantly share code, notes, and snippets.

// run with "--conf spark.cleaner.referenceTracking=false"
// spin up our full set of executors
// (100 tasks of ~1s each forces the scheduler to request the full executor set before we poke at internals)
sc.parallelize(1 to 100, 100).map { x => Thread.sleep(1000); x}.collect()
// Reflectively invokes the private static helper
// org.apache.spark.util.Utils.getConfiguredLocalDirs(conf) to read the
// configured local scratch directories on this node.
// Uses reflection because the method is private[spark] and not callable directly.
def getLocalDirs(): Array[String] = {
// look up the Utils object's class by name (it is private to the spark package)
val clz = Class.forName("org.apache.spark.util.Utils")
val conf = org.apache.spark.SparkEnv.get.conf
val method = clz.getMethod("getConfiguredLocalDirs", conf.getClass())
method.setAccessible(true)
// null receiver: presumably resolves to a static-style method on the Utils object — confirm against the full gist
method.invoke(null, conf).asInstanceOf[Array[String]]
// NOTE(review): the closing brace of getLocalDirs is missing here — the gist preview is truncated
@squito
squito / AccumulatorListener.scala
Last active March 15, 2019 06:34
Accumulator Examples
import scala.collection.mutable.Map
import org.apache.spark.{Accumulator, AccumulatorParam, SparkContext}
import org.apache.spark.scheduler.{SparkListenerStageCompleted, SparkListener}
import org.apache.spark.SparkContext._
/**
 * Just print out the values for all accumulators from the stage.
 * Note: you will only get updates from *named* accumulators.
@squito
squito / getPaths.scala
Last active March 15, 2019 06:31
get paths of a spark-sql query
import java.lang.reflect.Method
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.sources.{HadoopFsRelation, BaseRelation}
import org.apache.spark.sql.DataFrame
// Extracts the underlying filesystem paths backing a Spark SQL relation,
// e.g. to find which files a DataFrame reads from.
def getPaths(relation: BaseRelation): Iterator[String] = {
relation match {
// file-based relations expose their input paths directly
case hr: HadoopFsRelation =>
hr.paths.toIterator
// NOTE(review): remaining match cases and closing braces are cut off — the gist preview is truncated
@squito
squito / SlowIterator.scala
Last active September 14, 2018 20:41
SlowIterationLogger
// This is an example iterator that runs slowly, to demonstrate how SlowLoggingIterator works.
// It just iterates over a range of ints, but puts in occasional delays, to simulate an iterator that is
// actually doing something more complex, e.g. fetching records from a DB which is occasionally slow.
// Presumably sleeps `delay` ms once every `every` elements — next() body is truncated below, confirm against the full gist.
class SlowIterator(start: Int, end: Int, delay: Long, every: Int) extends java.util.Iterator[Integer] {
// the real data source being wrapped: a plain Int range
val underlying = (start until end).toIterator
def hasNext(): Boolean = underlying.hasNext
def next(): Integer = {
// NOTE(review): body of next() is cut off here — the gist preview is truncated
@squito
squito / tl.out
Last active May 1, 2018 16:13
InheritableThreadLocals
creating a new thread pool thread 0 : tl = null
creating a new thread pool thread 1 : tl = null
creating a new thread pool thread 2 : tl = null
creating a new thread pool thread 3 : tl = null
creating a new thread pool thread 4 : tl = null
creating a new thread pool thread 5 : tl = null
creating a new thread pool thread 6 : tl = null
creating a new thread pool thread 7 : tl = null
creating a new thread pool thread 8 : tl = null
creating a new thread pool thread 9 : tl = null
# filters out "apachespark" choice now
# notice what happens with
# * bad input ("floop")
# * real user that can't be assigned a jira ("fakeimran")
# * selection from list ("imran")
# * arbitrary user that can be assigned ("vanzin")
In [1]: from merge_spark_pr import *
@squito
squito / equality_parsing_psql_timestamp.md
Last active September 11, 2017 16:27
example of how postgres treats different timestamp types, both parsing and as the timezone is changed
@squito
squito / gist:ccd56fefefe4dfef808dc21196a89385
Created August 28, 2017 18:05
random example of exploring spark internals w/ reflection while debugging cluster config
// paste in
// https://gist.githubusercontent.com/squito/329d9cd82a21f645d592/raw/ca9217708293d6fe69ed6638f4feeb3038f8fd9c/reflector.scala
// REPL snippet: uses the reflector helpers (the `get`/`reflectMethod` calls below
// come from the gist linked above) to inspect the Hive metastore client config.
// grab the external catalog from the active SparkSession's internal state
val xCat = spark.sessionState.catalog.externalCatalog
// `get` is the reflection helper from reflector.scala; pulls the private `client` field
val catClient = get(xCat, "client")
// reflectively read the effective hive.metastore.uris setting ("" is the default if unset)
catClient.reflectMethod("getConf", Seq("hive.metastore.uris", ""))
import org.apache.hadoop.fs.{FileSystem, Path}
// also resolve the default FileSystem for this cluster's hadoop configuration
val fs = FileSystem.get(sc.hadoopConfiguration)
@squito
squito / LA_output.txt
Created August 24, 2017 16:47
Java timestamp mechanics
> scala -Duser.timezone=America/Los_Angeles timestamp.scala
Defaul TZ: America/Los_Angeles
hours in UTC: 8
TZ offset in hours: -8
@squito
squito / macro compiler error.scala
Last active August 23, 2017 05:45
The horrors of compiler errors when working with macros ...
scala> def classExpandMacroImpl(c: Context)(s: c.Expr[Any]) : c.Expr[Any] = {
| import c.universe._
|
| val cdef = s.tree match {
| case Block(List(x:ClassDef), _) => x
| case _ => c.abort(c.enclosingPosition, "Was expecting a block w/ a ClassDef")
| }
|
| val q"class $name { ..$body }" = cdef
| val newdefs = List[c.universe.Tree](q"def x: Int = z + 7", q"def y: Float = z + 3.2f")