Get Timing Stats on Standalone vs Rapids
# Generate Tuples and Count in Spark
import string
import random
from time import time
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
if 'sc' not in globals():
    sc = SparkContext('local')
spark = SparkSession(sc)
S = 3 # number of characters in the string.
charCounts = []
# Generate 10 million random (string, count) tuples.
for i in range(0, 10000000):
    # Build a random S-character uppercase string.
    ranString = ''.join(random.choices(string.ascii_uppercase, k=S))
    ranCount = random.randint(1, 10)
    charCounts.append((ranString, ranCount))
# Create a pair RDD of (string, count) key-value pairs and time the aggregation.
start = time()
# Distribute the list across 8 partitions.
x = sc.parallelize(charCounts, 8)
# Sum the counts for each key with reduceByKey.
y = x.reduceByKey(lambda accum, n: accum + n)
# Collect the aggregated results to the driver node.
output = y.collect()
print(f'Time taken to run: {time() - start} seconds')
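# A minimal DataFrame-based sketch of the same count aggregation, added
# for comparison. Assumption: the "Rapids" side of this benchmark uses the
# RAPIDS Accelerator for Apache Spark, which accelerates SQL/DataFrame
# operations on the GPU; RDD transformations like reduceByKey above are
# not GPU-accelerated by that plugin.
start = time()
df = spark.createDataFrame(charCounts, schema=['char', 'cnt'])
# groupBy + sum is a GPU-eligible aggregation when the RAPIDS plugin is
# active (e.g. spark.rapids.sql.enabled=true on a GPU-enabled cluster).
result = df.groupBy('char').sum('cnt').collect()
print(f'Time taken (DataFrame groupBy): {time() - start} seconds')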