
@jovianlin
Created April 17, 2017 06:27
How to run multiple jobs in one SparkContext from separate threads in PySpark?
# Source: http://stackoverflow.com/questions/30214474/how-to-run-multiple-jobs-in-one-sparkcontext-from-separate-threads-in-pyspark
# Prereqs: set
#   spark.dynamicAllocation.enabled true
#   spark.shuffle.service.enabled true
# in spark-defaults.conf
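# Note (an assumption, not part of the original gist): instead of editing spark-defaults.conf,
# the same prerequisites could be set programmatically on the SparkConf built below, e.g.
#   conf.set('spark.dynamicAllocation.enabled', 'true')
#   conf.set('spark.shuffle.service.enabled', 'true')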
import threading

from pyspark import SparkContext, SparkConf


def task(sc, i):
    # Each call triggers an independent Spark job on the shared SparkContext
    print(sc.parallelize(range(i * 10000)).count())


def run_multiple_jobs():
    conf = SparkConf().setMaster('local[*]').setAppName('appname')
    # Set scheduler to FAIR: http://spark.apache.org/docs/latest/job-scheduling.html#scheduling-within-an-application
    conf.set('spark.scheduler.mode', 'FAIR')
    sc = SparkContext(conf=conf)
    for i in range(4):
        t = threading.Thread(target=task, args=(sc, i))
        t.start()
        print('spark task {} has started'.format(i))


run_multiple_jobs()
# OUTPUT:
# spark task 0 has started
# spark task 1 has started
# spark task 2 has started
# spark task 3 has started
# 30000
# 0
# 10000
# 20000
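
# A possible extension (a sketch, not part of the original gist): with spark.scheduler.mode
# set to FAIR, each thread can additionally be pinned to its own scheduler pool via
# SparkContext.setLocalProperty, and the threads can be joined so the driver waits for
# every concurrent job to finish before stopping the context. The pool names 'pool_<i>'
# and the function names below are purely illustrative.
def task_in_pool(sc, i):
    # setLocalProperty is thread-local, so each thread's job lands in its own FAIR pool
    sc.setLocalProperty('spark.scheduler.pool', 'pool_{}'.format(i))
    print(sc.parallelize(range(i * 10000)).count())


def run_multiple_jobs_with_pools():
    conf = SparkConf().setMaster('local[*]').setAppName('appname')
    conf.set('spark.scheduler.mode', 'FAIR')
    sc = SparkContext(conf=conf)
    threads = [threading.Thread(target=task_in_pool, args=(sc, i)) for i in range(4)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # block until all concurrent jobs have completed
    sc.stop()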