The Big Data Bravura: Introducing Apache Spark
from pyspark import SparkConf, SparkContext

# Configure Spark with the "local" master (runs on this machine, no cluster)
# and register the job under the application name "SubjectsByClass".
conf = SparkConf().setMaster("local").setAppName("SubjectsByClass")
sc = SparkContext(conf=conf)
def get_class_and_subject(entry):
    """Parse one CSV record into a ``(class_id, number_of_subjects)`` pair.

    Args:
        entry: A comma-separated line whose second field is the class
            identifier and whose third field is an integer subject count.
            The first field is not used.

    Returns:
        A tuple ``(class_id, number_of_subjects)``: the class id as a string
        with surrounding whitespace removed, and the count as an ``int``.

    Raises:
        IndexError: If the line has fewer than three fields.
        ValueError: If the third field cannot be parsed as an integer.
    """
    fields = entry.split(',')
    # Strip padding so that "c201" and " c201" compare equal when the class
    # id is later used as a grouping key.
    class_id = fields[1].strip()
    # int() already tolerates surrounding whitespace and a trailing newline.
    number_of_subjects = int(fields[2])
    return (class_id, number_of_subjects)
# Load the raw records, one text line per element.
student_subject_rdd = sc.textFile("./student_subject.csv")

# (class_id, number_of_subjects) for every record.
class_subject_rdd = student_subject_rdd.map(get_class_and_subject)

# Pair each count with 1 -> (class_id, (subjects, 1)) so that a single reduce
# can accumulate the subject total and the record count side by side.
modified_class_subject_rdd = class_subject_rdd.mapValues(
    lambda value: (value, 1))

# (class_id, (total_subjects, number_of_records)).
total_by_class_rdd = modified_class_subject_rdd.reduceByKey(
    lambda current_value, next_value: (
        current_value[0] + next_value[0],
        current_value[1] + next_value[1]))

# (class_id, total_subjects / number_of_records).
average_by_class_rdd = total_by_class_rdd.mapValues(
    lambda value: value[0] / value[1])

# collect() brings the per-class results back to the driver as a list.
average_by_class_array = average_by_class_rdd.collect()
for class_id, average in average_by_class_array:
    # A single f-string avoids the doubled spaces that print()'s default
    # separator produced next to the padded string pieces.
    print(
        f'The average number of subjects for class {class_id} '
        f'is {average:.2f}')

# Release the Spark resources now that the job has finished.
sc.stop()
s101 | c201 | 10 | |
---|---|---|---|
s102 | c202 | 8 | |
s103 | c201 | 7 | |
s104 | c201 | 9 | |
s105 | c202 | 10 | |
s106 | c201 | 8 | |
s107 | c202 | 7 | |
s108 | c201 | 9 | |
s201 | c203 | 7 | |
s202 | c203 | 8 | |
s203 | c205 | 8 | |
s204 | c203 | 8 | |
s205 | c205 | 7 | |
s206 | c203 | 10 | |
s207 | c203 | 8 | |
s208 | c205 | 9 | |
s302 | c203 | 8 | |
s304 | c205 | 8 | |
s905 | c205 | 9 | |
s904 | c203 | 6 | |
s801 | c201 | 7 | |
s400 | c204 | 10 | |
s807 | c204 | 8 | |
s803 | c201 | 9 | |
s802 | c201 | 9 | |
s709 | c202 | 7 | |
s605 | c205 | 10 | |
s603 | c203 | 8 | |
s602 | c205 | 7 | |
s509 | c204 | 7 | |
s507 | c204 | 6 | |
s505 | c205 | 7 | |
s504 | c205 | 8 | |
s503 | c205 | 10 | |
s502 | c202 | 9 | |
s501 | c203 | 9 | |
s409 | c203 | 8 | |
s408 | c205 | 11 | |
s407 | c202 | 10 | |
s406 | c204 | 7 | |
s405 | c205 | 8 | |
s404 | c203 | 9 | |
s403 | c205 | 6 | |
s402 | c201 | 5 | |
s401 | c202 | 6 | |
s309 | c201 | 8 | |
s308 | c202 | 7 | |
s306 | c205 | 10 | |
s307 | c201 | 11 | |
s305 | c203 | 11 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment