# anbento0490/pyspark_aggregation_0.py

## pyspark_aggregation_0.py
from pyspark.sql.functions import *
from pyspark.sql import functions as f
from pyspark.sql import Window

# Load the raw sales CSV, treating the first line as column names. No
# inferSchema option is set, so 'Total Revenue' is cast explicitly below.
dataframe = spark.read.option("header", "true").csv("/FileStore/sales_5000000.csv")
# dataframe.show()  # uncomment to inspect the raw input

# Build the revenue-by-date view: drop the columns that are not needed, parse
# the order date, express revenue in millions rounded to two decimals, and
# sort the result by order date.
df = (
    dataframe.drop(
        "Country", "Sales Channel", "Order ID", "Ship Date", "Units Sold",
        "Unit Price", "Unit Cost", "Total Cost", "Total Profit", "Order Priority",
    )
    # 'M/d/yyyy' (not 'M/dd/yyyy'): a single 'd' accepts both one- and
    # two-digit days, so values such as 5/2/2014 still parse under the
    # stricter datetime parser used by Spark 3.
    .withColumn("Order Date", f.to_date(f.col("Order Date"), "M/d/yyyy"))
    # Cast, scale to millions and round in one expression. f.round is spelled
    # out because the wildcard import of pyspark.sql.functions shadows the
    # builtin round().
    .withColumn(
        "Total Revenue",
        f.round(f.col("Total Revenue").cast("double") / 1000000, 2),
    )
    .withColumnRenamed("Total Revenue", "Total Revenue (£M)")
    .orderBy("Order Date")
)


df.show(truncate=False)
	from pyspark.sql.functions import *
	from pyspark.sql import functions as f
	from pyspark.sql import Window

	# Load the raw sales CSV, treating the first line as column names. No
	# inferSchema option is set, so 'Total Revenue' is cast explicitly below.
	dataframe = spark.read.option("header", "true").csv("/FileStore/sales_5000000.csv")
	# dataframe.show()  # uncomment to inspect the raw input

	# Build the revenue-by-date view: drop the columns that are not needed,
	# parse the order date, express revenue in millions rounded to two
	# decimals, and sort the result by order date.
	df = (
		dataframe.drop(
			"Country", "Sales Channel", "Order ID", "Ship Date", "Units Sold",
			"Unit Price", "Unit Cost", "Total Cost", "Total Profit", "Order Priority",
		)
		# 'M/d/yyyy' (not 'M/dd/yyyy'): a single 'd' accepts both one- and
		# two-digit days, so values such as 5/2/2014 still parse under the
		# stricter datetime parser used by Spark 3.
		.withColumn("Order Date", f.to_date(f.col("Order Date"), "M/d/yyyy"))
		# Cast, scale to millions and round in one expression. f.round is
		# spelled out because the wildcard import of pyspark.sql.functions
		# shadows the builtin round().
		.withColumn(
			"Total Revenue",
			f.round(f.col("Total Revenue").cast("double") / 1000000, 2),
		)
		.withColumnRenamed("Total Revenue", "Total Revenue (£M)")
		.orderBy("Order Date")
	)


	df.show(truncate=False)