@joshlk
Created November 20, 2018 17:45
Minimal working example of a pySpark memory leak
from pyspark import SparkContext
from pyspark.sql import SQLContext
import numpy as np

sc = SparkContext()
sqlContext = SQLContext(sc)

# Create a dummy pySpark DataFrame with 1e5 rows and 16 partitions
df = sqlContext.range(0, int(1e5), numPartitions=16)

def toy_example(rdd):
    # Read in the pySpark DataFrame partition
    data = list(rdd)

    # Generate random data using NumPy
    rand_data = np.random.random(int(1e7))

    # Apply the `int` function to each element of `rand_data`
    for i in range(len(rand_data)):
        e = rand_data[i]
        int(e)

    # Return a single `0` value
    return [[0]]

# Execute the above function on each partition (16 partitions)
result = df.rdd.mapPartitions(toy_example)
result = result.collect()
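One commonly suggested workaround (not part of the original gist, and only a sketch) is to disable Python worker reuse via the spark.python.worker.reuse configuration, so that each task runs in a fresh Python worker process whose memory is released when the task finishes. This trades extra worker startup cost for a bounded memory footprint. The sketch below assumes the same `toy_example` function defined above:

# Sketch of a possible mitigation: start Spark with Python worker reuse
# disabled so leaked memory is reclaimed when each task's worker exits.
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

conf = SparkConf().set("spark.python.worker.reuse", "false")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Same dummy DataFrame and `toy_example` function as in the gist above
df = sqlContext.range(0, int(1e5), numPartitions=16)
result = df.rdd.mapPartitions(toy_example).collect()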
joshlk (author) commented Jan 3, 2019

@vikasgandham

Were you able to resolve this? I suspect I have a similar issue when running a machine
learning model inside Spark.
