# koushikmln/OrderItemsSpark.py

## OrderItemsSpark.py
# Build an RDD of (order_id, sub_total) tuples from the order_items CSV.
# Each line is split on "," once; field 1 is parsed as the integer
# order_id and field 4 as the float sub_total.
# NOTE: `sc` is not defined in this file; presumably it is the
# SparkContext supplied by the pyspark shell -- confirm before running
# this as a standalone script.
rdd = sc.textFile("/public/retail_db/order_items/part-00000")
orderItemTuple = rdd.map(lambda x: x.split(",")).map(lambda f: (int(f[1]), float(f[4])))
orderItemTuple.take(10)

# Get the total for one particular order_id (here 2) as an
# (order_id, total) tuple.
# NOTE: RDD.reduce raises ValueError on an empty RDD, i.e. when no
# record carries the requested order_id.
orderItemTuple.filter(lambda x: x[0] == 2).reduce(lambda x, y: (x[0], x[1] + y[1]))

# Get (order_id, total) tuples for every order by summing the
# sub_totals per key, then fetch the first 10.
orderItemTuple.reduceByKey(lambda x, y: x + y).take(10)
	# Build an RDD of (order_id, sub_total) tuples from the order_items CSV.
	# Each line is split on "," once; field 1 is parsed as the integer
	# order_id and field 4 as the float sub_total.
	# NOTE: `sc` is not defined in this file; presumably it is the
	# SparkContext supplied by the pyspark shell -- confirm before running
	# this as a standalone script.
	rdd = sc.textFile("/public/retail_db/order_items/part-00000")
	orderItemTuple = rdd.map(lambda x: x.split(",")).map(lambda f: (int(f[1]), float(f[4])))
	orderItemTuple.take(10)

	# Get the total for one particular order_id (here 2) as an
	# (order_id, total) tuple.
	# NOTE: RDD.reduce raises ValueError on an empty RDD, i.e. when no
	# record carries the requested order_id.
	orderItemTuple.filter(lambda x: x[0] == 2).reduce(lambda x, y: (x[0], x[1] + y[1]))

	# Get (order_id, total) tuples for every order by summing the
	# sub_totals per key, then fetch the first 10.
	orderItemTuple.reduceByKey(lambda x, y: x + y).take(10)