Created
June 29, 2018 00:46
-
-
Save fernando-mc/598ab90a9d0efac12c8f299d98de82ed to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ./bin/pyspark
# Assumes headers are not present in the files that are processed.
# This is meant to run locally. To easily verify that it works you can:
# 1. `pip install pyspark`
# 2. Start up pyspark: ./bin/pyspark
# 3. Copy data similar to the data provided above in the document
# 4. Copy and paste the code into the REPL
def cleaner(s):
    """Return ``s`` as text with double quotes removed and outer spaces stripped.

    The value is first converted with ``str()``, so non-string input is
    accepted. Every ``"`` character is removed (not only leading/trailing
    ones), and only the space character is stripped from the ends; tabs and
    newlines are left in place.
    """
    # Quotes are dropped before stripping so that spaces which sat just
    # inside the quotes (e.g. '" value "') are removed as well.
    return str(s).replace('"', '').strip(' ')
def to_csv(d):
    """Format a ``((first, second), value)`` record as one comma-separated line.

    ``d[0][0]``, ``d[0][1]`` and ``d[1]`` are each converted with ``str()``
    and joined with commas, in that order. No quoting or escaping is applied,
    so a field that itself contains a comma is written out as-is.
    """
    key, value = d[0], d[1]
    return ','.join(str(field) for field in (key[0], key[1], value))
# Get raw data from the data file.
# `sc` is not defined here; it is expected to be the SparkContext that the
# pyspark shell provides (see the run instructions at the top of the file).
rawAdPubData = sc.textFile('./data.csv')
# rawAdPubData = sc.textFile('./databig.csv')
# rawAdPubData = sc.textFile('./final.csv')
rawAdPubData.take(2)  # peek at the first records when pasted into the REPL

# Split lines up.
# This is a plain split on every comma: it is not quote-aware, so a quoted
# field that contains a comma would be broken into several columns.
cleanAdPubData = rawAdPubData.map(lambda line: line.split(","))
cleanAdPubData.take(2)

# Remove the advertiser dimension and get a key value tuple in the form:
# ((publisher, datestring), sale_amount)
# Column 0 is not used; columns 1 and 2 go through cleaner() to drop quotes
# and surrounding spaces.
# NOTE(review): column 3 is passed to float() without cleaner() -- confirm
# that sale amounts are never quoted in the input files.
CleanPubData = cleanAdPubData.map(
    lambda row: ((cleaner(row[1]), cleaner(row[2])), float(row[3]))
)
CleanPubData.take(2)

# Calculate publisher earnings based on 25% of goods sold.
# The 25% share is applied to each sale as it is folded into the running
# total (second argument); the third argument only adds partial totals, so
# the share is never applied twice.
publisherDailyPayments = CleanPubData.aggregateByKey(
    0,
    lambda a, b: a + b * .25,
    lambda c, d: c + d
)
publisherDailyPayments.take(2)

# Should be 2.5 because 25% of 10 is 2.5
# publisherDailyPayments.filter(lambda x: x[0][0]=='columntwo').collect()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment