Skip to content

Instantly share code, notes, and snippets.

@tabjy
Created March 15, 2019 08:22
Show Gist options
  • Save tabjy/d9dfb834bb0b95946a727f98d67bba59 to your computer and use it in GitHub Desktop.
Save tabjy/d9dfb834bb0b95946a727f98d67bba59 to your computer and use it in GitHub Desktop.
from pyspark import SparkConf, SparkContext
import sys
assert sys.version_info >= (3, 5) # make sure we have Python 3.5+
# add more functions as necessary
def toTupples(line):
    """Split one space-delimited record into a ``(key, values)`` pair.

    The first field becomes the key and the following four fields are kept,
    in order, as a 4-tuple of strings; any further fields are ignored.
    Presumably the fields are timestamp, language/project code, page title,
    view count and byte count -- confirm against the input data set.

    Raises:
        IndexError: if the line has fewer than five space-separated fields.
    """
    fields = line.split(" ")
    key = fields[0]
    values = (fields[1], fields[2], fields[3], fields[4])
    return (key, values)
def filterNonEn(entry):
    """Return True only when the record's first value field is exactly "en".

    ``entry`` is a ``(key, values)`` pair as produced by ``toTupples``;
    ``values[0]`` is compared against the string ``"en"``.

    An exact comparison is used on purpose: a substring test would also
    accept codes such as ``"en.b"`` or any other code that merely contains
    the letters "en".
    """
    return entry[1][0] == "en"
def filterMainPage(entry):
    """Return True unless the record's second value field is "Main_Page".

    ``entry`` is a ``(key, values)`` pair; ``values[1]`` is the field that
    is compared (presumably the page title -- confirm against the input).
    """
    title = entry[1][1]
    return title != "Main_Page"
def filtetrSpecials(entry):
    """Return True unless the record's second value field starts with "Special:".

    ``entry`` is a ``(key, values)`` pair; only ``values[1]`` is inspected.
    """
    title = entry[1][1]
    return not title.startswith("Special:")
def findMax(input):
    """Pick, for one key, the record whose third field is numerically largest.

    Args:
        input: a ``(key, records)`` pair. ``records`` may be any non-empty
            iterable (a list, or a grouped-values iterable) of tuples whose
            element at index 2 is parseable by ``int``. The parameter name
            shadows the ``input`` builtin but is kept so existing callers
            are unaffected.

    Returns:
        ``(key, record)`` where ``record`` has the greatest ``int(record[2])``.
        On ties the earliest such record is returned.

    Raises:
        ValueError: if ``records`` is empty or a third field is not an integer.
    """
    # The builtin max() with a key keeps the first maximal element on ties
    # and, unlike indexing with [0], works on iterables that are not lists.
    best = max(input[1], key=lambda record: int(record[2]))
    return (input[0], best)
def toOutputStr(entry):
    """Format a ``(key, record)`` pair as one output line.

    The line is the key and the record's third field, separated by a single
    space. Both parts must already be strings.
    """
    key = entry[0]
    third_field = entry[1][2]
    return " ".join((key, third_field))
def main(inputs, output):
    """Run the job: read records, filter them, keep one per key, and save.

    Reads text from ``inputs`` through the module-level SparkContext ``sc``,
    parses each line with ``toTupples``, applies the three filter functions,
    keeps for every key the record whose third value field is numerically
    largest, and writes one ``toOutputStr`` line per key to ``output``.

    Args:
        inputs: path passed to ``sc.textFile``.
        output: path passed to ``saveAsTextFile``.
    """
    def more_viewed(left, right):
        # Compare numerically (the fields are strings); keep `left` on ties.
        return left if int(left[2]) >= int(right[2]) else right

    records = sc.textFile(inputs).map(toTupples)
    kept = (
        records
        .filter(filterNonEn)
        .filter(filterMainPage)
        .filter(filtetrSpecials)
    )
    # reduceByKey combines records on each partition before the shuffle, so
    # the full list of records per key is never materialized the way
    # groupByKey().mapValues(list) would require.
    busiest = kept.reduceByKey(more_viewed)
    # saveAsTextFile returns nothing, so its result is not bound to a name.
    busiest.map(toOutputStr).saveAsTextFile(output)
if __name__ == '__main__':
    # `sc` is intentionally a module-level name: main() reads it as a global.
    conf = SparkConf().setAppName('Wikipedia Popular')
    sc = SparkContext(conf=conf)
    # sc.setLogLevel('WARN')
    # Compare the version numerically: as strings, '2.10' and '10.0' would
    # both sort below '2.3' and wrongly fail the check.
    spark_version = tuple(int(part) for part in sc.version.split('.')[:2])
    assert spark_version >= (2, 3)  # make sure we have Spark 2.3+
    # Command line: <input path> <output path>
    inputs = sys.argv[1]
    output = sys.argv[2]
    main(inputs, output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment