Skip to content

Instantly share code, notes, and snippets.

@anbento0490
Last active December 26, 2022 13:00
Show Gist options
  • Save anbento0490/49313e47d4e56c7aad35f2baaefb1a8e to your computer and use it in GitHub Desktop.
Save anbento0490/49313e47d4e56c7aad35f2baaefb1a8e to your computer and use it in GitHub Desktop.
from pyspark.sql.functions import *
from pyspark.sql import functions as f
from pyspark.sql import Window
# Load the raw sales CSV. The first row is used as the header; no inferSchema
# option is set, so columns arrive as strings (hence the explicit cast below).
dataframe = spark.read.option("header", "true").csv("/FileStore/sales_5000000.csv")

# Columns that are dropped before building the revenue-by-date view.
unused_columns = [
    "Country",
    "Sales Channel",
    "Order ID",
    "Ship Date",
    "Units Sold",
    "Unit Price",
    "Unit Cost",
    "Total Cost",
    "Total Profit",
    "Order Priority",
]

# Total Revenue as a double, scaled to millions and rounded to 2 decimals.
# `f.round` is used explicitly so the Spark function is never confused with
# Python's builtin `round`.
revenue_in_millions = f.round(f.col("Total Revenue").cast("double") / 1000000, 2)

df = (
    dataframe.drop(*unused_columns)
    # 'M/d/yyyy' accepts both one- and two-digit months and days. The previous
    # 'M/dd/yyyy' pattern required a two-digit day, which Spark 3's datetime
    # parser enforces strictly, so a value like '5/2/2014' would not parse.
    .withColumn("Order Date", f.to_date(f.col("Order Date"), "M/d/yyyy"))
    .withColumn("Total Revenue", revenue_in_millions)
    .withColumnRenamed("Total Revenue", "Total Revenue (£M)")
    .orderBy("Order Date")
)
df.show(truncate=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment