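# NOTE: the text below is web-page boilerplate that was captured together with
# the snippet; it is kept as comments so that the file parses as Python.
# Skip to content
#
# Instantly share code, notes, and snippets.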

# Load the order_items data set as an RDD of text lines and explore it.
# `sc` is not defined in this file and must already exist in the session
# (NOTE(review): presumably the SparkContext of a pyspark shell - confirm).
orderItems = sc.textFile("/public/retail_db/order_items")
type(orderItems)
help(orderItems)
orderItems.first()

# Preview the first ten records, one per line.
for record in orderItems.take(10):
    print(record)

# Build an RDD from an in-memory collection: the integers 1 through 9999.
l = range(1, 10000)
lRDD = sc.parallelize(l)
# Read the products file from the local file system into a list of lines and
# distribute that list as an RDD.
# The context manager closes the file handle as soon as the content has been
# read, instead of leaving it open until the garbage collector reclaims it.
with open("/data/retail_db/products/part-00000") as productsFile:
    productsRaw = productsFile.read().splitlines()
type(productsRaw)
productsRDD = sc.parallelize(productsRaw)
type(productsRDD)
productsRDD.first()
# Preview the first ten product records, then count all of them.
for i in productsRDD.take(10): print(i)
productsRDD.count()
# String manipulation, demonstrated on the first record of the orders data set
orders = sc.textFile("/public/retail_db/orders")
s = orders.first()

# Indexing with 0 yields the first character of the string
s[0]

# Slicing from 0 up to (not including) 10 yields the first 10 characters
s[0:10]
# Read the JSON order_items data into a DataFrame and print its leading rows,
# once through the generic reader (format + load) and once through the json()
# shortcut.
# SQLContext.load(path, source) has been deprecated since Spark 1.4 and is not
# available in Spark 2.x; DataFrameReader (sqlContext.read) is its replacement
# and is already used by the second statement.
sqlContext.read.format("json").load("/public/retail_db_json/order_items").show()
sqlContext.read.json("/public/retail_db_json/order_items").show()
#flatMap
# Word count: split every line into words, pair each word with 1, then count
# the number of pairs per word.
linesList = ["How are you", "let us perform", "word count using flatMap", "to understand flatMap in detail"]
lines = sc.parallelize(linesList)
# flatMap flattens the per-line word lists into a single RDD of words.
words = lines.flatMap(lambda l: l.split(" "))
tuples = words.map(lambda word: (word, 1))
# countByKey() returns a dictionary of word -> count on the driver. Iterate
# over items() so the counts are printed as well; iterating the dictionary
# itself would yield only the words.
for i in tuples.countByKey().items(): print(i)
#joins
orders = sc.textFile("/public/retail_db/orders")
orderItems = sc.textFile("/public/retail_db/order_items")

# Turn both data sets into (key, value) pairs. Every record is split once and
# the resulting fields are reused, instead of splitting once per field.
#   ordersMap:     (int(field 0), field 1) of each orders record
#   orderItemsMap: (int(field 1), float(field 4)) of each order_items record
# NOTE(review): presumably both keys are the order id - confirm against the
# data set's schema.
ordersMap = orders. \
    map(lambda o: o.split(",")). \
    map(lambda o: (int(o[0]), o[1]))
orderItemsMap = orderItems. \
    map(lambda oi: oi.split(",")). \
    map(lambda oi: (int(oi[1]), float(oi[4])))

# Perform the inner join this section is about: keys present in both RDDs are
# kept and each result has the shape (key, (ordersValue, orderItemsValue)).
ordersJoin = ordersMap.join(orderItemsMap)
for i in ordersJoin.take(10): print(i)
#outer join
orders = sc.textFile("/public/retail_db/orders")
orderItems = sc.textFile("/public/retail_db/order_items")

# Turn both data sets into (key, value) pairs. Every record is split once and
# the resulting fields are reused, instead of splitting once per field.
#   ordersMap:     (int(field 0), field 3) of each orders record
#   orderItemsMap: (int(field 1), float(field 4)) of each order_items record
# NOTE(review): presumably both keys are the order id - confirm against the
# data set's schema.
ordersMap = orders. \
    map(lambda o: o.split(",")). \
    map(lambda o: (int(o[0]), o[3]))
orderItemsMap = orderItems. \
    map(lambda oi: oi.split(",")). \
    map(lambda oi: (int(oi[1]), float(oi[4])))

# Perform the outer join this section is about. leftOuterJoin keeps every
# element of ordersMap; a key without a match in orderItemsMap yields
# (key, (ordersValue, None)).
# NOTE(review): the left variant is an assumption - switch to rightOuterJoin
# or fullOuterJoin if a different side must be preserved.
ordersLeftOuterJoin = ordersMap.leftOuterJoin(orderItemsMap)
for i in ordersLeftOuterJoin.take(10): print(i)
#Aggregations - total
# Total number of records in the order_items data set.
orderItems = sc.textFile("/public/retail_db/order_items")
orderItems.count()
#Aggregations - total - Get revenue for given order_id
orderItems = sc.textFile("/public/retail_db/order_items")
# Keep only the records whose field 1 (the order id being filtered on) is 2.
orderItemsFiltered = orderItems. \
    filter(lambda oi: int(oi.split(",")[1]) == 2)
# Extract field 4 of each remaining record as a float (the per-item amount
# that makes up the revenue).
orderItemsSubtotals = orderItemsFiltered. \
    map(lambda oi: float(oi.split(",")[4]))
# Add the amounts up to obtain the revenue; without this step the section only
# prepares the values and never aggregates them. sum() yields 0 when no record
# matches, whereas reduce() would raise on an empty RDD.
orderRevenue = orderItemsSubtotals.sum()
print(orderRevenue)
# Find the order item record with the smallest subtotal (field 4) among the
# records whose order id (field 1) is 2.
orderItems = sc.textFile("/public/retail_db/order_items")
orderItemsFiltered = orderItems.filter(
    lambda oi: int(oi.split(",")[1]) == 2
)


def _subtotalOf(orderItem):
    """Return field 4 of a comma-separated order_items record as a float."""
    return float(orderItem.split(",")[4])


# Pairwise reduction: keep the left record only when its subtotal is strictly
# smaller; otherwise (including ties) keep the right record.
orderItemsFiltered.reduce(
    lambda x, y: x if _subtotalOf(x) < _subtotalOf(y) else y
)
#Get count by status - countByKey
orders = sc.textFile("/public/retail_db/orders")
# Pair field 3 of every orders record (the status) with 1 so that countByKey
# can count the records per status.
ordersStatus = orders. \
    map(lambda o: (o.split(",")[3], 1))
# countByKey() returns a dictionary of status -> count on the driver.
countByStatus = ordersStatus.countByKey()
# Iterate over items() so each status is printed together with its count;
# iterating the dictionary itself would print only the status names.
for i in countByStatus.items(): print(i)