kittipatkampa

## convert_to_rdd_then_df.py
# Rebuild train_df from its own underlying RDD and schema, then fit the
# pipeline on the rebuilt DataFrame.
# NOTE(review): presumably the round trip through the RDD is meant to give
# the DataFrame a fresh, short query plan before fit() -- confirm.
train_df = spark.createDataFrame(
    train_df.rdd,
    schema=train_df.schema,
)
model_pred = pipeline_pred.fit(
    train_df
)

## put_cache_near_fit_solve_stackoverflow.py
### Calling train_df.checkpoint() without keeping its return value leaves train_df with its
### full query plan, which may cause
### Py4JJavaError: An error occurred while calling o1019.fit.: java.lang.StackOverflowError
train_df = train_df.select(cols)
train_df.cache()
# checkpoint() does not modify train_df in place: it returns a new,
# checkpointed DataFrame. Reassign it so that the DataFrame handed to fit()
# below is the checkpointed one. A checkpoint directory must already have
# been configured on the SparkContext (setCheckpointDir) for this call.
train_df = train_df.checkpoint()
train_df.show(n=3, truncate=False, vertical=True)

#... many cache() and .checkpoint() thingies in between, but not relevant to train_df at all

model_pred = pipeline_pred.fit(train_df)

## using_checkpoint_in_spark.py
# Get (or create) the Spark session for the "Confidence Model" application,
# with Hive support enabled.
spark = (
    SparkSession.builder
    .appName("Confidence Model")
    .enableHiveSupport()
    .getOrCreate()
)

# Tell Spark to store checkpoints in the directory named `checkpoint`.
sc = spark.sparkContext
sc.setCheckpointDir('checkpoint')

## spark_udf_example.ipynb

      
        
          
            
              
              1 file
            
          
          
            
              
              0 forks
            
          
          
            
              
              0 comments
            
          
          
            
              
              0 stars
            
          
        
        
          
              
          
          
            
                kittipatkampa
                / spark_udf_example.ipynb
            
            
              Last active
              December 1, 2018 22:45
            
          
        
      
        
  
    
    

          
    
      
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
	# Rebuild train_df from its own underlying RDD and schema, then fit the
	# pipeline on the rebuilt DataFrame.
	# NOTE(review): presumably the round trip through the RDD is meant to give
	# the DataFrame a fresh, short query plan before fit() -- confirm.
	train_df = spark.createDataFrame(
		train_df.rdd,
		schema=train_df.schema,
	)
	model_pred = pipeline_pred.fit(
		train_df
	)
	### Calling train_df.checkpoint() without keeping its return value leaves train_df with its
	### full query plan, which may cause
	### Py4JJavaError: An error occurred while calling o1019.fit.: java.lang.StackOverflowError
	train_df = train_df.select(cols)
	train_df.cache()
	# checkpoint() does not modify train_df in place: it returns a new,
	# checkpointed DataFrame. Reassign it so that the DataFrame handed to fit()
	# below is the checkpointed one. A checkpoint directory must already have
	# been configured on the SparkContext (setCheckpointDir) for this call.
	train_df = train_df.checkpoint()
	train_df.show(n=3, truncate=False, vertical=True)

	#... many cache() and .checkpoint() thingies in between, but not relevant to train_df at all

	model_pred = pipeline_pred.fit(train_df)
	# Get (or create) the Spark session for the "Confidence Model" application,
	# with Hive support enabled.
	spark = (
		SparkSession.builder
		.appName("Confidence Model")
		.enableHiveSupport()
		.getOrCreate()
	)

	# Tell Spark to store checkpoints in the directory named `checkpoint`.
	sc = spark.sparkContext
	sc.setCheckpointDir('checkpoint')