kittipatkampa/put_cache_near_fit_solve_stackoverflow.py

## put_cache_near_fit_solve_stackoverflow.py
### Problematic ordering: cache()/checkpoint()/show() are called early, long before fit().
### This may cause Py4JJavaError: An error occurred while calling o1019.fit.: java.lang.StackOverflowError
# Keep only the columns listed in `cols`.
train_df = train_df.select(cols)
# Mark train_df for caching; cache() is lazy, so nothing is computed yet.
train_df.cache()
# NOTE(review): DataFrame.checkpoint() returns a new, checkpointed DataFrame and
# its return value is discarded here, so train_df itself presumably keeps its
# full lineage -- TODO confirm whether `train_df = train_df.checkpoint()` was intended.
train_df.checkpoint()
# Action: print the first 3 rows vertically (one field per line), values untruncated.
train_df.show(n=3, truncate=False, vertical=True)

#... many cache() and .checkpoint() calls in between, but not relevant to train_df at all

# Fit the pipeline on train_df; this is the .fit() call named in the error above.
model_pred = pipeline_pred.fit(train_df)

### However, the problem above can be resolved by just moving
### cache() and show() right before .fit() like this:

# Keep only the columns listed in `cols`.
train_df = train_df.select(cols)

#... many cache() and .checkpoint() calls in between, but not relevant to train_df at all

# cache() now sits immediately before the show()/fit() calls that consume train_df.
train_df.cache()
# Note that .checkpoint() is not even used here:
# show() is the action executed right before fit(); it prints the first 3 rows
# vertically (one field per line) without truncating values.
train_df.show(n=3, truncate=False, vertical=True)
# Fit the pipeline on train_df; with this ordering the author reports the
# StackOverflowError no longer occurs.
model_pred = pipeline_pred.fit(train_df)
	### Problematic ordering: cache()/checkpoint()/show() are called early, long before fit().
	### This may cause Py4JJavaError: An error occurred while calling o1019.fit.: java.lang.StackOverflowError
	# Keep only the columns listed in `cols`.
	train_df = train_df.select(cols)
	# Mark train_df for caching; cache() is lazy, so nothing is computed yet.
	train_df.cache()
	# NOTE(review): DataFrame.checkpoint() returns a new, checkpointed DataFrame and
	# its return value is discarded here, so train_df itself presumably keeps its
	# full lineage -- TODO confirm whether `train_df = train_df.checkpoint()` was intended.
	train_df.checkpoint()
	# Action: print the first 3 rows vertically (one field per line), values untruncated.
	train_df.show(n=3, truncate=False, vertical=True)

	#... many cache() and .checkpoint() calls in between, but not relevant to train_df at all

	# Fit the pipeline on train_df; this is the .fit() call named in the error above.
	model_pred = pipeline_pred.fit(train_df)

	### However, the problem above can be resolved by just moving
	### cache() and show() right before .fit() like this:

	# Keep only the columns listed in `cols`.
	train_df = train_df.select(cols)

	#... many cache() and .checkpoint() calls in between, but not relevant to train_df at all

	# cache() now sits immediately before the show()/fit() calls that consume train_df.
	train_df.cache()
	# Note that .checkpoint() is not even used here:
	# show() is the action executed right before fit(); it prints the first 3 rows
	# vertically (one field per line) without truncating values.
	train_df.show(n=3, truncate=False, vertical=True)
	# Fit the pipeline on train_df; with this ordering the author reports the
	# StackOverflowError no longer occurs.
	model_pred = pipeline_pred.fit(train_df)