Kolin Ohi kjohi

## hash_rdd.py
from pyspark.shuffle import ExternalSorter
from pyspark.rdd import _parse_memory

def hash_rdd(rdd, id_func=lambda el: repr(el), hash_function='sha256', num_partitions=200):
    """
    This function returns a unique hash representing all records in the rdd. Order of items doesn't affect the hash.

    params:
    id_func - a function that gets a unique string identifier from an rdd element. If none is specified, repr(element) is used.
    hash_function - the name of the hashlib algorithm to use (default: 'sha256').
	from pyspark.shuffle import ExternalSorter
	from pyspark.rdd import _parse_memory

	def hash_rdd(rdd, id_func=lambda el: repr(el), hash_function='sha256', num_partitions=200):
	"""
	This function returns a unique hash representing all records in the rdd. Order of items doesn't affect the hash.

	params:
	id_func - a function that gets a unique string identifier from an rdd element. If none is specified, repr(element) is used.
	hash_function - the name of the hashlib algorithm to use (default: 'sha256').