Skip to content

Instantly share code, notes, and snippets.

@tjaskula
Last active November 1, 2018 22:10
Show Gist options
  • Save tjaskula/2739682fbe2f34a3ecf2285264291d7f to your computer and use it in GitHub Desktop.
Save tjaskula/2739682fbe2f34a3ecf2285264291d7f to your computer and use it in GitHub Desktop.
PySpark

Word Count

from operator import add

# Word count: produce (word, occurrences) pairs for every word in the text file.
# The RDD is named `lines` rather than `input` so the builtin input() is not shadowed.
lines = spark.sparkContext.textFile("file:///home/cluster/user27/data/20417-8.txt")
# str.split() with no argument splits on any run of whitespace and never yields
# empty strings, so consecutive spaces or tabs do not get counted as "" words.
grouped = lines.flatMap(lambda line: line.split())
# Pair each word with 1, then sum the ones per distinct word.
result = grouped.map(lambda word: (word, 1)).reduceByKey(add)
result.collect()

Average time per URL

# Load in.txt as an RDD of text lines (parsed below by toUrlTime).
input = spark.sparkContext.textFile(
    "file:///home/cluster/user27/data/in.txt"
)
def toUrlTime(line):
    """Parse one ``url<TAB>time`` record into a ``(url, time)`` pair.

    The line must contain exactly two tab-separated fields; the second is
    converted to ``float``. A ``ValueError`` is raised if the field count is
    not two or the time is not a valid float literal.
    """
    url, raw_time = line.split("\t")
    elapsed = float(raw_time)
    return url, elapsed

# Average time per URL.
# The chain is wrapped in parentheses so the leading-dot continuation lines are
# valid Python (a bare line starting with "." is a syntax error).
(
    input.map(toUrlTime)
    # (url, time) -> (url, (time, 1.0)): carry a running sum and a running count.
    .map(lambda p: (p[0], (p[1], 1.0)))
    # Add sums and counts component-wise. The times are kept as floats: casting
    # them to int here would truncate fractional values and skew the average.
    .reduceByKey(lambda p1, p2: (p1[0] + p2[0], p1[1] + p2[1]))
    # (sum, count) -> mean
    .mapValues(lambda v: v[0] / v[1])
    .collect()
)

SELECT DISTINCT name FROM Customer WHERE month(startDate)=7

# Load Customer.txt as an RDD of text lines (parsed below by toNameMonth).
input = spark.sparkContext.textFile(
    "file:///home/cluster/user27/data/Customer.txt"
)
def toNameMonth(line):
    """Parse one comma-separated customer record into ``(name, month)``.

    The line must have exactly three comma-separated fields; the third is the
    name and the second is a "/"-separated date whose second component is
    returned, as a string, as the month. A ``ValueError`` is raised if the
    field count is not three, and an ``IndexError`` if the date has no "/".
    """
    # NOTE(review): presumably the fields are id, start date, name and the
    # date is day/month/year -- confirm against the data file.
    _cid, start_date, name = line.split(",")
    month = start_date.split("/")[1]
    return name, month

# SELECT DISTINCT name FROM Customer WHERE month(startDate) = 7
# The chain is wrapped in parentheses so the leading-dot continuation lines are
# valid Python (a bare line starting with "." is a syntax error).
(
    input.map(toNameMonth)
    # The month is compared as a string, so only the exact text '07' matches.
    # NOTE(review): assumes months are zero-padded in the data -- confirm.
    .filter(lambda p: p[1] == '07')
    # Keep only the name, then de-duplicate.
    .map(lambda p: p[0])
    .distinct()
    .collect()
)

SELECT C.cid, O.total FROM Customer C, Order O WHERE C.name LIKE 'A%' AND C.cid = O.cid

# Join customers whose name starts with "A" to their orders and return
# (cid, total) pairs, i.e.
#   SELECT C.cid, O.total FROM Customer C, Order O
#   WHERE C.name LIKE 'A%' AND C.cid = O.cid
inputCustomer = spark.sparkContext.textFile("file:///home/cluster/user27/data/Customer.txt")
inputOrder = spark.sparkContext.textFile("file:///home/cluster/user27/data/Order.txt")

# Each multi-line chain is wrapped in parentheses so the leading-dot
# continuation lines are valid Python.
# Customers as (field 0, field 2) pairs, keeping only those whose field 2
# starts with "A" (the LIKE 'A%' predicate). Each line is split only once.
# NOTE(review): presumably field 0 is cid and field 2 is name -- confirm.
filteredCustomers = (
    inputCustomer.map(lambda line: line.split(","))
    .map(lambda fields: (fields[0], fields[2]))
    .filter(lambda p: p[1].startswith("A"))
)

# Orders as (field 0, field 1) pairs, keyed for the join.
# NOTE(review): assumes Order.txt has cid in field 0 and total in field 1 -- confirm.
mappedOrders = (
    inputOrder.map(lambda line: line.split(","))
    .map(lambda fields: (fields[0], fields[1]))
)

# join() yields (key, (customer_value, order_value)); keep the key and the order value.
(
    filteredCustomers.join(mappedOrders)
    .map(lambda p: (p[0], p[1][1]))
    .collect()
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment