@dharma6872
Created January 13, 2021 05:51
[Create First RDD] Creating an RDD #pyspark #pyspark101
# Importing Spark-related packages
from pyspark.sql import SparkSession

if __name__ == "__main__":
    print("PySpark 101 Tutorial")
    print("Part 2. Create First RDD (Resilient Distributed Dataset) in PySpark using PyCharm IDE")

    # Create a SparkSession running locally with Hive support enabled
    spark = SparkSession \
        .builder \
        .appName("Part 2. Create First RDD (Resilient Distributed Dataset) in PySpark using PyCharm IDE") \
        .master("local[*]") \
        .enableHiveSupport() \
        .getOrCreate()

    py_number_list = [1, 2, 3, 4, 5]
    print("Printing Python Number List: ")
    print(py_number_list)
    print(type(py_number_list))

    print("Creating First RDD from Python Number List")
    # The second argument (3) is the number of partitions
    number_rdd = spark.sparkContext.parallelize(py_number_list, 3)
    print(type(number_rdd))

    py_str_list = ["Arun", "Arvind", "Arjun", "Anna"]
    print(py_str_list)
    str_rdd = spark.sparkContext.parallelize(py_str_list, 2)
    print(type(str_rdd))

    # collect() returns the distributed elements to the driver as a Python list
    str_rdd_output = str_rdd.collect()
    print("Printing Output str_rdd: ")
    print(str_rdd_output)

    print("Stopping the SparkSession object")
    spark.stop()
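As an optional check (not part of the original gist), you can confirm how parallelize() split the elements across partitions using the standard RDD methods getNumPartitions() and glom(). The sketch below assumes it runs while the SparkSession above is still active, i.e. before spark.stop(); the exact grouping shown in the comment is only an example and may differ by Spark version.

    # Optional sanity check: run before spark.stop(), while number_rdd is still usable.
    # getNumPartitions() returns the partition count requested in parallelize(),
    # and glom() collects the elements of each partition into its own list.
    print(number_rdd.getNumPartitions())   # expected: 3
    print(number_rdd.glom().collect())     # e.g. [[1], [2, 3], [4, 5]]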