
@fernando-mc
Created June 29, 2018 00:46
# ./bin/pyspark
# Assumes the input files have no header row.
# This is meant to run locally. To verify it works you can:
# 1. `pip install pyspark`
# 2. Start the PySpark shell: ./bin/pyspark
# 3. Save sample data in the expected format to ./data.csv
# 4. Copy and paste the code into the REPL
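The sample data this gist refers to is not included here; a hypothetical `./data.csv` consistent with how `row[1]`..`row[3]` are indexed below (advertiser, publisher, date, sale amount; quoted fields, no header) could be generated like this:

```python
# Hypothetical sample rows -- the column order (advertiser, publisher,
# date, sale_amount) is inferred from how row[1..3] are used below.
sample_rows = [
    'ad_one, "pub_one", "2018-06-01", 10.00',
    'ad_two, "pub_one", "2018-06-01", 4.00',
    'ad_one, "pub_two", "2018-06-02", 8.00',
]

with open('./data.csv', 'w') as f:
    f.write('\n'.join(sample_rows))
```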
def cleaner(s):
    # Drop surrounding quotes and padding spaces from a field
    return str(s).replace('"', '').strip(' ')
def to_csv(d):
    # Flatten ((publisher, datestring), amount) back into a CSV line
    return str(d[0][0]) + ',' + str(d[0][1]) + ',' + str(d[1])
# Read the raw CSV lines into an RDD
rawAdPubData = sc.textFile('./data.csv')
# rawAdPubData = sc.textFile('./databig.csv')
# rawAdPubData = sc.textFile('./final.csv')
rawAdPubData.take(2)
# Split lines up
cleanAdPubData = rawAdPubData.map(lambda line: line.split(","))
cleanAdPubData.take(2)
# Remove the advertiser dimension and get a key value tuple in the form:
# ((publisher, datestring), sale_amount)
CleanPubData = cleanAdPubData.map(
lambda row: ((cleaner(row[1]), cleaner(row[2])), float(row[3]))
)
CleanPubData.take(2)
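To see what each mapped element looks like without a Spark session, the same transformation can be traced on a single hypothetical row (as produced by `line.split(",")`):

```python
def cleaner(s):
    # Same helper as above: drop quotes and padding spaces
    return str(s).replace('"', '').strip(' ')

# A hypothetical split row: advertiser, publisher, date, sale_amount
row = ['ad_one', ' "pub_one"', ' "2018-06-01"', ' 10.00']
pair = ((cleaner(row[1]), cleaner(row[2])), float(row[3]))
print(pair)  # (('pub_one', '2018-06-01'), 10.0)
```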
# Calculate publisher earnings: publishers keep 25% of each sale
publisherDailyPayments = CleanPubData.aggregateByKey(
    0,
    lambda acc, sale: acc + sale * 0.25,  # within a partition: add 25% of each sale
    lambda a, b: a + b                    # across partitions: sum the partial totals
)
publisherDailyPayments.take(2)
# Should be 2.5 because 25% of 10 is 2.5
# publisherDailyPayments.filter(lambda x: x[0][0]=='columntwo').collect()
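The payout math can be checked without Spark; this plain-Python sketch over made-up `((publisher, date), sale_amount)` pairs mirrors what `aggregateByKey` computes within a single partition:

```python
from collections import defaultdict

# Hypothetical ((publisher, date), sale_amount) pairs
pairs = [
    (('pub_one', '2018-06-01'), 10.0),
    (('pub_one', '2018-06-01'), 4.0),
    (('pub_two', '2018-06-02'), 8.0),
]

payments = defaultdict(float)
for key, sale in pairs:
    payments[key] += sale * 0.25  # publishers keep 25% of each sale

print(dict(payments))
# {('pub_one', '2018-06-01'): 3.5, ('pub_two', '2018-06-02'): 2.0}
```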