Skip to content

Instantly share code, notes, and snippets.

@NitinKumar94
Created November 20, 2015 11:08
Show Gist options
  • Save NitinKumar94/072a4c33b65a916024c7 to your computer and use it in GitHub Desktop.
Save NitinKumar94/072a4c33b65a916024c7 to your computer and use it in GitHub Desktop.
Sample code for computing the difference between consecutive values of a Spark DataFrame column
def calc_diff():
    """Print the differences between consecutive values of column ``x3``.

    Builds a three-row sample DataFrame on a local Spark context, pairs each
    ``x3`` value with the value of the following row, and shows
    ``current - next`` for every pair as a one-column DataFrame named
    ``difference``.  The element count, the pairs themselves and their sums
    are printed along the way.

    The Spark context is stopped on exit, including when a step fails.
    """
    # NOTE(review): SPARK_HOME conventionally names the Spark install root,
    # not its bin/ directory -- confirm this path for the target cluster.
    os.environ['SPARK_HOME'] = "/usr/hdp/current/spark-client/bin"
    conf = SparkConf().setMaster("local[*]").setAppName("TestApp")
    sc = SparkContext(conf=conf)
    try:
        sqlcontext = HiveContext(sc)
        df = sqlcontext.createDataFrame(
            [(1, "a", 23.0), (2, "B", 23.0), (3, "C", 33.0)],
            ("x1", "x2", "x3"))

        # (row index, value) pairs for column x3.
        indexed = (df.select("x3").rdd
                   .map(lambda row: float(row.x3))
                   .zipWithIndex()
                   .map(lambda pair: (pair[1], pair[0])))

        count = indexed.count()
        # Single-argument print() behaves the same on Python 2 and 3.
        print("No of elements:  %d" % count)
        df.show()

        # Re-key every value by the index of the row before it; an inner join
        # with the original then lines up (value[i], value[i + 1]) in a single
        # job instead of two lookups and a union per element.  The unmatched
        # keys (-1 and count - 1) drop out, leaving count - 1 pairs.
        successors = indexed.map(lambda kv: (kv[0] - 1, kv[1]))
        adjacent = (indexed.join(successors)
                    .sortByKey()  # restore row order after the shuffle
                    .values()
                    .map(lambda pair: [pair[0], pair[1]]))

        print(adjacent.collect())
        print(adjacent.map(lambda x: x[0] + x[1]).collect())
        result_rdd = adjacent.map(lambda x: x[0] - x[1])
        print("")
        parts = result_rdd.map(lambda p: Row(difference=p))
        result_df = sqlcontext.createDataFrame(parts)
        result_df.show()
        print("")
    finally:
        # Release the context even if one of the Spark actions above raised.
        sc.stop()
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    calc_diff()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment