Skip to content

Instantly share code, notes, and snippets.

@thanoojgithub
Last active January 21, 2022 05:01
Show Gist options
  • Save thanoojgithub/cc38e45eab0bf8f9e0c400bd1def24f0 to your computer and use it in GitHub Desktop.
Save thanoojgithub/cc38e45eab0bf8f9e0c400bd1def24f0 to your computer and use it in GitHub Desktop.
PySpark Example One
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import *
spark = SparkSession \
.builder \
.master('local') \
.appName('pyspark-test-run') \
.getOrCreate()
df = spark.read.format("csv").options(header='True', inferSchema='True', delimiter=',') \
.load("gs://bucket_27122021/data/travalData/airLineData/Airline_Details.csv")
df.write.mode("overwrite").parquet("gs://bucket_27122021/data/travalData/airLineData/parquet/airLineData.parquet")
df1 = spark.read.parquet("gs://bucket_27122021/data/travalData/airLineData/parquet/airLineData.parquet") \
.drop('CreatedTimestamp', 'UpdatedTimestamp')
print("df1 : ***********")
print("before cleansing airLine row count : ")
print(df1.count())
df1.show()
# AirlineID,AirlineName,Carrier,TailNum
df2 = df1.filter(col("AirlineID").isNotNull() & col("AirlineName").isNotNull() & col("Carrier").isNotNull() &
col("TailNum").isNotNull())
print("df2 : ***********")
df2.show()
# df3 = df2.dropDuplicates(["AirlineID", "AirlineName", "Carrier", "TailNum"])
# when, considering all columns in the table for distinct then, dropDuplicates() or specific columns then, above line
df31 = df2.dropDuplicates()
print("df31 : ***********")
df31.show()
# DeDuping using row_number if we have deciding column to pick one
win1 = Window.partitionBy('AirlineID', 'AirlineName', 'Carrier').orderBy(col('TailNum').desc())
df3 = df2.withColumn('rn', row_number().over(win1)).where('rn = 1').drop('rn')
print("df3 : ***********")
df3.show()
# df3 = df2.withColumn('startDate', date_format(current_timestamp(), 'yyyy-MM-dd'))
# current_date()
df4 = df31.withColumn("startDate", current_timestamp()) \
.withColumn("endDate", (unix_timestamp() + 999999999).cast('timestamp'))
print("df4 : ***********")
df4.show(truncate=False)
print("after cleansing airLine row count : ")
print(df4.count())
df1 : ***********
before cleansing airLine row count :
9878
+---------+---------------+-------+-------+
|AirlineID| AirlineName|Carrier|TailNum|
+---------+---------------+-------+-------+
| 19393|Alaska Airlines| WN| N288WN|
| 19393|Alaska Airlines| WN| N238WN|
| 19393|Alaska Airlines| WN| N8325D|
| 19393|Alaska Airlines| WN| N7719A|
| 19393|Alaska Airlines| WN| N775SW|
| 19393|Alaska Airlines| WN| N761RR|
| 19393|Alaska Airlines| WN| N8652B|
| 19393|Alaska Airlines| WN| N357SW|
| 19393|Alaska Airlines| WN| N290WN|
| 19393|Alaska Airlines| WN| N421LV|
| 19393|Alaska Airlines| WN| N665WN|
| 19393|Alaska Airlines| WN| N555LV|
| 19393|Alaska Airlines| WN| N633SW|
| 19393|Alaska Airlines| WN| N7711N|
| 19393|Alaska Airlines| WN| N439WN|
| 19393|Alaska Airlines| WN| N283WN|
| 19393|Alaska Airlines| WN| N246LV|
| 19393|Alaska Airlines| WN| N427WN|
| 19393|Alaska Airlines| WN| N399WN|
| 19393|Alaska Airlines| WN| N652SW|
+---------+---------------+-------+-------+
only showing top 20 rows
df2 : ***********
+---------+---------------+-------+-------+
|AirlineID| AirlineName|Carrier|TailNum|
+---------+---------------+-------+-------+
| 19393|Alaska Airlines| WN| N288WN|
| 19393|Alaska Airlines| WN| N238WN|
| 19393|Alaska Airlines| WN| N8325D|
| 19393|Alaska Airlines| WN| N7719A|
| 19393|Alaska Airlines| WN| N775SW|
| 19393|Alaska Airlines| WN| N761RR|
| 19393|Alaska Airlines| WN| N8652B|
| 19393|Alaska Airlines| WN| N357SW|
| 19393|Alaska Airlines| WN| N290WN|
| 19393|Alaska Airlines| WN| N421LV|
| 19393|Alaska Airlines| WN| N665WN|
| 19393|Alaska Airlines| WN| N555LV|
| 19393|Alaska Airlines| WN| N633SW|
| 19393|Alaska Airlines| WN| N7711N|
| 19393|Alaska Airlines| WN| N439WN|
| 19393|Alaska Airlines| WN| N283WN|
| 19393|Alaska Airlines| WN| N246LV|
| 19393|Alaska Airlines| WN| N427WN|
| 19393|Alaska Airlines| WN| N399WN|
| 19393|Alaska Airlines| WN| N652SW|
+---------+---------------+-------+-------+
only showing top 20 rows
df31 : ***********
+---------+-----------------+-------+-------+
|AirlineID| AirlineName|Carrier|TailNum|
+---------+-----------------+-------+-------+
| 19393| Alaska Airlines| WN| N784SW|
| 19790|American Airlines| DL| N971DL|
| 19790|American Airlines| DL| N911DA|
| 19790|American Airlines| DL| N991DL|
| 19790|American Airlines| DL| N306DN|
| 19790|American Airlines| DL| N863DN|
| 19805| Delta AirLines| AA| N438AA|
| 19805| Delta AirLines| AA| N3BGAA|
| 19805| Delta AirLines| AA| N7LNAA|
| 19805| Delta AirLines| AA| N8AFAA|
| 19805| Delta AirLines| AA| N521UW|
| 19805| Delta AirLines| AA| N8LLAA|
| 19977|Hawaiian Airlines| UA| N420UA|
| 19977|Hawaiian Airlines| UA| N15751|
| 19977|Hawaiian Airlines| UA| N69816|
| 19977|Hawaiian Airlines| UA| N76062|
| 19977|Hawaiian Airlines| UA| N220UA|
| 19977|Hawaiian Airlines| UA| N128UA|
| 20304| JetBlue Airlines| OO| N692CA|
| 20304| JetBlue Airlines| OO| N641CA|
+---------+-----------------+-------+-------+
only showing top 20 rows
df3 : ***********
+---------+------------------+-------+-------+
|AirlineID| AirlineName|Carrier|TailNum|
+---------+------------------+-------+-------+
| 19393| Alaska Airlines| WN| N969WN|
| 19690|Allegiant Airlines| HA| N594HA|
| 19790| American Airlines| DL| N999DN|
| 19805| Delta AirLines| AA| N998AA|
| 19930| Frontier Airlines| AS| N799AS|
| 19977| Hawaiian Airlines| UA| N896UA|
| 20304| JetBlue Airlines| OO| N988CA|
| 20366|Southwest Airlines| EV| N981EV|
| 20409| Spirit Airlines| B6| N990JB|
| 20416| United Airlines| NK| N905NK|
| 20436| ABC Airlines| F9| PLANET|
| 21171| XYZ Airlines| VX| N925VA|
+---------+------------------+-------+-------+
df4 : ***********
+---------+-----------------+-------+-------+-----------------------+-------------------+
|AirlineID|AirlineName |Carrier|TailNum|startDate |endDate |
+---------+-----------------+-------+-------+-----------------------+-------------------+
|19393 |Alaska Airlines |WN |N784SW |2022-01-21 04:56:36.842|2053-09-29 06:43:15|
|19790 |American Airlines|DL |N971DL |2022-01-21 04:56:36.842|2053-09-29 06:43:15|
|19790 |American Airlines|DL |N911DA |2022-01-21 04:56:36.842|2053-09-29 06:43:15|
|19790 |American Airlines|DL |N991DL |2022-01-21 04:56:36.842|2053-09-29 06:43:15|
|19790 |American Airlines|DL |N306DN |2022-01-21 04:56:36.842|2053-09-29 06:43:15|
|19790 |American Airlines|DL |N863DN |2022-01-21 04:56:36.842|2053-09-29 06:43:15|
|19805 |Delta AirLines |AA |N438AA |2022-01-21 04:56:36.842|2053-09-29 06:43:15|
|19805 |Delta AirLines |AA |N3BGAA |2022-01-21 04:56:36.842|2053-09-29 06:43:15|
|19805 |Delta AirLines |AA |N7LNAA |2022-01-21 04:56:36.842|2053-09-29 06:43:15|
|19805 |Delta AirLines |AA |N8AFAA |2022-01-21 04:56:36.842|2053-09-29 06:43:15|
|19805 |Delta AirLines |AA |N521UW |2022-01-21 04:56:36.842|2053-09-29 06:43:15|
|19805 |Delta AirLines |AA |N8LLAA |2022-01-21 04:56:36.842|2053-09-29 06:43:15|
|19977 |Hawaiian Airlines|UA |N420UA |2022-01-21 04:56:36.842|2053-09-29 06:43:15|
|19977 |Hawaiian Airlines|UA |N15751 |2022-01-21 04:56:36.842|2053-09-29 06:43:15|
|19977 |Hawaiian Airlines|UA |N69816 |2022-01-21 04:56:36.842|2053-09-29 06:43:15|
|19977 |Hawaiian Airlines|UA |N76062 |2022-01-21 04:56:36.842|2053-09-29 06:43:15|
|19977 |Hawaiian Airlines|UA |N220UA |2022-01-21 04:56:36.842|2053-09-29 06:43:15|
|19977 |Hawaiian Airlines|UA |N128UA |2022-01-21 04:56:36.842|2053-09-29 06:43:15|
|20304 |JetBlue Airlines |OO |N692CA |2022-01-21 04:56:36.842|2053-09-29 06:43:15|
|20304 |JetBlue Airlines |OO |N641CA |2022-01-21 04:56:36.842|2053-09-29 06:43:15|
+---------+-----------------+-------+-------+-----------------------+-------------------+
only showing top 20 rows
after cleansing airLine row count :
5383
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment