Created
March 15, 2019 08:22
-
-
Save tabjy/d9dfb834bb0b95946a727f98d67bba59 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark import SparkConf, SparkContext | |
import sys | |
assert sys.version_info >= (3, 5) # make sure we have Python 3.5+ | |
# add more functions as necessary | |
def toTupples(line):
    """Split one space-separated line into a ``(key, values)`` pair.

    The first field becomes the key; the following four fields are kept,
    in order and still as strings, in a 4-tuple.  Any further fields are
    ignored.

    NOTE(review): the fields are presumably (timestamp, language/project
    code, page title, view count, bytes) -- confirm against the input data.

    Raises:
        IndexError: if the line has fewer than five space-separated fields.
    """
    fields = line.split(" ")
    key = fields[0]
    values = (fields[1], fields[2], fields[3], fields[4])
    return (key, values)
def filterNonEn(entry):
    """Return True when the first value field of *entry* contains ``"en"``.

    *entry* is a ``(key, values)`` pair; only ``values[0]`` is inspected.

    NOTE(review): this is a substring test, so any code that merely
    contains "en" (e.g. "en.m") also passes -- confirm whether an exact
    match was intended.
    """
    code = entry[1][0]
    return "en" in code
def filterMainPage(entry):
    """Return True unless the second value field of *entry* is ``"Main_Page"``.

    *entry* is a ``(key, values)`` pair; only ``values[1]`` is inspected and
    the comparison is exact (case-sensitive, whole string).
    """
    title = entry[1][1]
    return title != "Main_Page"
def filtetrSpecials(entry):
    """Return True unless the second value field starts with ``"Special:"``.

    *entry* is a ``(key, values)`` pair; only ``values[1]`` is inspected.
    The prefix check is case-sensitive.
    """
    title = entry[1][1]
    return not title.startswith("Special:")
def findMax(input):
    """Pick, from one group, the record whose field ``[2]`` is numerically largest.

    Args:
        input: a ``(key, records)`` pair.  ``records`` may be any non-empty
            iterable (list, generator, ...) of indexable records whose
            element ``[2]`` is accepted by ``int()``.  The parameter name
            shadows the builtin ``input``; it is kept so existing callers
            are unaffected.

    Returns:
        ``(key, record)`` where ``record`` has the largest ``int(record[2])``.
        When several records tie, the one that appears first wins.

    Raises:
        ValueError: if ``records`` is empty or a field ``[2]`` is not a
            valid integer.
    """
    key, records = input[0], input[1]
    # Compare as integers, not strings: lexically "9" > "10".
    # Builtin max() returns the first maximal element, which matches a
    # left-to-right scan that only replaces the winner on a strict ">".
    best = max(records, key=lambda record: int(record[2]))
    return (key, best)
def toOutputStr(entry):
    """Render a ``(key, record)`` pair as ``"<key> <record[2]>"``.

    Both pieces must already be strings; they are joined with a single
    space and nothing else from the record is emitted.

    NOTE(review): ``record[2]`` is presumably the view count -- confirm
    that the page title is not also expected in the output.
    """
    key = entry[0]
    value = entry[1][2]
    return " ".join((key, value))
def main(inputs, output):
    """Run the whole job: read, parse, filter, keep the per-key maximum, write.

    Args:
        inputs: path handed to ``sc.textFile``.
        output: path handed to ``saveAsTextFile``.

    Uses the module-level ``sc`` that the ``__main__`` guard creates, so it
    must be called after the SparkContext exists.
    """
    lines = sc.textFile(inputs)
    parsed = lines.map(toTupples)

    # Apply the three record filters in the same order as before.
    kept = parsed.filter(filterNonEn)
    kept = kept.filter(filterMainPage)
    kept = kept.filter(filtetrSpecials)

    # Collect every record that shares a key into a plain list, then keep
    # only the record findMax selects for that key.
    grouped = kept.groupByKey().mapValues(list)
    winners = grouped.map(findMax)

    winners.map(toOutputStr).saveAsTextFile(output)
# Script entry point: build the SparkContext, check the Spark version, then
# run the job on the input and output paths given on the command line.
if __name__ == '__main__':
    conf = SparkConf().setAppName('Wikipedia Popular')
    # `sc` is deliberately module-level: main() reads it as a global.
    sc = SparkContext(conf=conf)
    # sc.setLogLevel('WARN')
    # Compare the version numerically. As plain strings the check is
    # lexicographic, so e.g. '2.10' >= '2.3' and '10.0' >= '2.3' are False.
    spark_version = tuple(int(part) for part in sc.version.split('.')[:2])
    assert spark_version >= (2, 3)  # make sure we have Spark 2.3+
    inputs = sys.argv[1]  # first CLI argument: input path
    output = sys.argv[2]  # second CLI argument: output path
    main(inputs, output)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment