# Word count over a plain-text file: emit (word, count) pairs.
from operator import add

# NOTE(review): renamed from `input`, which shadowed the builtin.
lines = spark.sparkContext.textFile("file:///home/cluster/user27/data/20417-8.txt")
# Split each line on single spaces; empty tokens from repeated spaces
# are kept, matching the original behavior.
words = lines.flatMap(lambda line: line.split(' '))
# Classic map/reduce: pair every word with 1, then sum per key.
word_counts = words.map(lambda word: (word, 1)).reduceByKey(add)
word_counts.collect()
# Average load time per URL. Input lines look like "<url>\t <time>".
lines = spark.sparkContext.textFile("file:///home/cluster/user27/data/in.txt")


def toUrlTime(line):
    """Parse one input line into a (url, time) pair with time as float."""
    # NOTE(review): the delimiter is tab-followed-by-space, as in the
    # original — confirm against the real file format (plain "\t" is
    # far more common).
    url, time = line.split("\t ")
    return (url, float(time))


# Parenthesize the chain so the multi-line expression is valid Python
# (the original spread the chain across lines with no continuation,
# which is a SyntaxError; it was also missing a closing paren).
(lines.map(toUrlTime)
      .map(lambda p: (p[0], (p[1], 1.0)))  # (url, (time, count=1.0))
      # Sum times and counts per URL. Keep the times as floats: the
      # original cast each partial sum to int, truncating fractional
      # seconds and corrupting the average.
      .reduceByKey(lambda p1, p2: (p1[0] + p2[0], p1[1] + p2[1]))
      .mapValues(lambda v: v[0] / v[1])    # total_time / count = mean
      .collect())
# SQL being translated: SELECT DISTINCT name FROM Customer WHERE month(startDate)=7
# Distinct names of customers whose start date falls in July.
lines = spark.sparkContext.textFile("file:///home/cluster/user27/data/Customer.txt")


def toNameMonth(line):
    """Parse "id,date,name" into (name, month); date is slash-separated
    with the month in the second field."""
    cust_id, date, name = line.split(",")  # cust_id: avoid shadowing builtin `id`
    return (name, date.split("/")[1])


# Parenthesize the chain so the multi-line expression is valid Python
# (the original continued across lines with no continuation -> SyntaxError).
(lines.map(toNameMonth)
      .filter(lambda p: p[1] == '07')  # keep July only
      .map(lambda p: p[0])             # project to the name
      .distinct()
      .collect())
# SQL being translated: SELECT C.cid, O.total FROM Customer C, Order O WHERE C.name LIKE 'A%' AND C.cid=O.cid
# (cid, total) for customers whose name starts with "A", joined with
# their orders on cid.
inputCustomer = spark.sparkContext.textFile("file:///home/cluster/user27/data/Customer.txt")
inputOrder = spark.sparkContext.textFile("file:///home/cluster/user27/data/Order.txt")


def _toCidName(line):
    """Customer line "cid,date,name" -> (cid, name); split once per line."""
    fields = line.split(",")
    return (fields[0], fields[2])


# Keep customers whose name starts with "A" — the original tested the
# letter "O", which does not implement the intended LIKE 'A%' predicate.
# Chains are parenthesized: the original's bare multi-line continuation
# is a SyntaxError.
filteredCustomers = (inputCustomer.map(_toCidName)
                                  .filter(lambda p: p[1].startswith("A")))
# Order line -> (cid, total); cid is column 0, total column 1.
mappedOrders = inputOrder.map(lambda line: (line.split(",")[0], line.split(",")[1]))
# Inner join on cid gives (cid, (name, total)); project to (cid, total).
(filteredCustomers.join(mappedOrders)
                  .map(lambda p: (p[0], p[1][1]))
                  .collect())