Skip to content

Instantly share code, notes, and snippets.

@asimihsan
Created November 29, 2015 23:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save asimihsan/5fdb080139b1bc36bdf5 to your computer and use it in GitHub Desktop.
Save asimihsan/5fdb080139b1bc36bdf5 to your computer and use it in GitHub Desktop.
import math
# Sample input: one row per student. Index 2 holds the grade and index 3 the
# degree (the two fields read by extract_degree_grade_count); the first two
# columns look like a student id and a name.
# NOTE(review): `sc` is not defined in this file -- presumably the SparkContext
# supplied by the PySpark shell/notebook; confirm before running standalone.
students = sc.parallelize([
[100, "Alice", 8.5, "Computer Science"],
[101, "Bob", 7.1, "Engineering"],
[102, "Carl", 6.2, "Engineering"],
])
def extract_degree_grade_count(row):
    """Map one student row to a ``(degree, (grade, grade ** 2, 1))`` pair.

    The value triple is a per-record partial aggregate (grade, squared grade,
    count of one), so triples that share a degree can be added element-wise.

    Args:
        row: Indexable record with the grade at index 2 and the degree at
            index 3.

    Returns:
        A 2-tuple ``(row[3], (row[2], row[2] ** 2, 1))``.
    """
    degree = row[3]
    grade = row[2]
    return (degree, (grade, grade ** 2, 1))
# Key every student row by degree, carrying (grade, grade ** 2, 1) as the value.
degree_grade_count_RDD = students.map(extract_degree_grade_count)
# Debugging aid: uncomment the print to inspect the mapped pairs; the line
# after it is the output recorded for the sample rows.
#print(degree_grade_count_RDD.collect())
#[('Computer Science', (8.5, 72.25, 1)), ('Engineering', (7.1, 50.41, 1)), ('Engineering', (6.2, 38.440000000000005, 1))]
def reducer(row1, row2):
    """Add two ``(sum, sum_of_squares, count)`` triples element-wise.

    Args:
        row1: First triple; only indexes 0, 1 and 2 are read.
        row2: Second triple; only indexes 0, 1 and 2 are read.

    Returns:
        A 3-tuple holding the pairwise sums of the three positions.
    """
    return (
        row1[0] + row2[0],
        row1[1] + row2[1],
        row1[2] + row2[2],
    )
# Combine the per-student triples of each degree into one
# (sum of grades, sum of squared grades, count) triple per key.
reduced_RDD = degree_grade_count_RDD.reduceByKey(reducer)
# Debugging aid: uncomment the print to inspect the per-degree totals; the
# line after it is the output recorded for the sample rows.
#print(reduced_RDD.collect())
#[('Engineering', (13.3, 88.85, 2)), ('Computer Science', (8.5, 72.25, 1))]
def mean_stddev(row):
    """Turn an aggregated ``(key, (sum, sum_of_squares, count))`` row into
    ``(key, mean, population standard deviation)``.

    Args:
        row: 2-item record whose first item is the key and whose second item
            is a triple of the summed values, the summed squared values and
            the number of values.

    Returns:
        A 3-tuple ``(key, mean, stddev)`` where ``stddev`` is the population
        (divide-by-n) standard deviation.

    Raises:
        ZeroDivisionError: If the count is zero.
    """
    aggregates = row[1]
    total = float(aggregates[0])
    sum_squares = float(aggregates[1])
    n = float(aggregates[2])
    mean = total / n
    # Population variance as E[x^2] - E[x]^2.
    variance = sum_squares / n - mean ** 2
    # Floating-point cancellation can leave a true-zero variance a hair below
    # zero, which would make math.sqrt raise "math domain error". Clamp only
    # genuinely negative values so NaN inputs still propagate as NaN.
    if variance < 0.0:
        variance = 0.0
    stddev = math.sqrt(variance)
    return (row[0], mean, stddev)
# Convert each degree's aggregated totals into (degree, mean, stddev).
mean_stddev_RDD = reduced_RDD.map(mean_stddev)
# Print the collected results; the line after the print is the output
# recorded for the sample rows.
print(mean_stddev_RDD.collect())
#[('Engineering', 6.65, 0.44999999999999274), ('Computer Science', 8.5, 0.0)]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment