Last active
June 14, 2021 08:43
-
-
Save AdroitAnandAI/d9ee14053b5a4ff3b38cc883e222fea9 to your computer and use it in GitHub Desktop.
Get Timing Stats on Standalone vs Rapids
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Generate random (key, count) tuples and aggregate them with Spark's
# reduceByKey, timing the distributed aggregation step.  Used to compare
# timing stats between a standalone run and a RAPIDS-accelerated run.
import string
import random
from time import time

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

# Reuse an existing SparkContext/SparkSession if one is already live
# (e.g. inside a notebook); otherwise start a local one.
if 'sc' not in globals():
    sc = SparkContext('local')
    spark = SparkSession(sc)

S = 3            # number of characters in each random key string
N = 10_000_000   # number of (key, count) tuples to generate
NUM_PARTITIONS = 8  # partitions to spread the RDD across

# Build the full list of (random uppercase string, random count in [1, 10])
# pairs on the driver.  A comprehension replaces the original append loop.
charCounts = [
    (''.join(random.choices(string.ascii_uppercase, k=S)),
     random.randint(1, 10))
    for _ in range(N)
]

# Time only the distributed part: parallelize -> reduceByKey -> collect.
start = time()

# Distribute the huge list across NUM_PARTITIONS partitions as a pair RDD.
x = sc.parallelize(charCounts, NUM_PARTITIONS)

# Sum the counts per key.
y = x.reduceByKey(lambda accum, n: accum + n)

# Pull the aggregated (key, total) pairs back to the driver node.
output = y.collect()

print(f'Time taken to run: {time() - start} seconds')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment