# jose-goncabel/multiple-gpu-example.py

## multiple-gpu-example.py
#####
# IMPORTS
#####
from pyspark import TaskContext
import os

#####
# PATHS
#####
# File passed to load_model() inside predict_for_partition.
path_model = "/path/to/pretrained/model.h5"
# Input location read with sc.textFile() to build input_rdd.
path_rdd = "/path/to/rdd"

#####
# STATIC VARIABLES
#####
# Number of partitions input_rdd is repartitioned into before prediction;
# each partition id is used to select a GPU in predict_for_partition.
gpus_available = 4

#####
# RDD Creation - input_rdd
#####
# NOTE(review): `sc` is not defined in the visible part of this file --
# presumably the SparkContext supplied by the pyspark shell or notebook;
# confirm before running this as a standalone script.
input_rdd = sc.textFile(path_rdd)

#####
# Predict for partition function
#####
def predict_for_partition(partition):
  """Run the pretrained model over every row of one RDD partition.

  Meant to be passed to ``RDD.mapPartitions``. The task is pinned to a
  single GPU derived from its partition id, the model stored at
  ``path_model`` is loaded once, and one prediction is yielded per row.

  Args:
    partition: Iterator over the rows of the current partition.

  Yields:
    The value returned by ``model.predict(row)`` for each row, in the
    order the rows are iterated.
  """
  # Map the partition id onto a GPU index. The modulo keeps the index
  # inside [0, gpus_available) even if the RDD ends up with more partitions
  # than GPUs; for ids below gpus_available the result is unchanged.
  ctx = TaskContext.get()
  gpu_index = ctx.partitionId() % gpus_available

  # Restrict this process to the selected GPU. The variables are set before
  # the model is loaded so the framework only ever sees that one device.
  os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
  os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_index)

  # Load the model once per partition instead of once per row.
  # NOTE(review): `load_model` is not imported in the visible part of this
  # file -- presumably keras.models.load_model; confirm it is in scope.
  model = load_model(path_model)

  # Stream the predictions back one row at a time.
  for row in partition:
    yield model.predict(row)

# Redistribute the rows into one partition per GPU and hand every partition
# to predict_for_partition; input_rdd is rebound to the resulting RDD of
# predictions.
input_rdd = input_rdd.repartition(gpus_available).mapPartitions(
  predict_for_partition
)
	#####
	# IMPORTS
	#####
	from pyspark import TaskContext
	import os

	#####
	# PATHS
	#####
	# File passed to load_model() inside predict_for_partition.
	path_model = "/path/to/pretrained/model.h5"
	# Input location read with sc.textFile() to build input_rdd.
	path_rdd = "/path/to/rdd"

	#####
	# STATIC VARIABLES
	#####
	# Number of partitions input_rdd is repartitioned into before prediction;
	# each partition id is used to select a GPU in predict_for_partition.
	gpus_available = 4

	#####
	# RDD Creation - input_rdd
	#####
	# NOTE(review): `sc` is not defined in the visible part of this file --
	# presumably the SparkContext supplied by the pyspark shell or notebook;
	# confirm before running this as a standalone script.
	input_rdd = sc.textFile(path_rdd)

	#####
	# Predict for partition function
	#####
	def predict_for_partition(partition):
		"""Run the pretrained model over every row of one RDD partition.

		Meant to be passed to ``RDD.mapPartitions``. The task is pinned to a
		single GPU derived from its partition id, the model stored at
		``path_model`` is loaded once, and one prediction is yielded per row.

		Args:
			partition: Iterator over the rows of the current partition.

		Yields:
			The value returned by ``model.predict(row)`` for each row, in the
			order the rows are iterated.
		"""
		# Map the partition id onto a GPU index. The modulo keeps the index
		# inside [0, gpus_available) even if the RDD ends up with more
		# partitions than GPUs; for ids below gpus_available the result is
		# unchanged.
		ctx = TaskContext.get()
		gpu_index = ctx.partitionId() % gpus_available

		# Restrict this process to the selected GPU. The variables are set
		# before the model is loaded so the framework only ever sees that
		# one device.
		os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
		os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_index)

		# Load the model once per partition instead of once per row.
		# NOTE(review): `load_model` is not imported in the visible part of
		# this file -- presumably keras.models.load_model; confirm it is in
		# scope.
		model = load_model(path_model)

		# Stream the predictions back one row at a time.
		for row in partition:
			yield model.predict(row)

	# Redistribute the rows into one partition per GPU and hand every partition
	# to predict_for_partition; input_rdd is rebound to the resulting RDD of
	# predictions.
	input_rdd = input_rdd.repartition(gpus_available).mapPartitions(
		predict_for_partition
	)