Last active
June 26, 2024 20:46
-
-
Save kishanpython/807c65b1ebfa9fdde6d735d0395e8dcd to your computer and use it in GitHub Desktop.
Uses of expr in pyspark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "pyspark_expr.ipynb", | |
"provenance": [] | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Use of ๐๐ฑ๐ฉ๐ซ() in #pyspark โ๏ธ\n", | |
"\n", | |
"๐๐ฑ๐ฉ๐ซ():-\n", | |
"====\n", | |
"โ๏ธ It is a SQL function in pyspark to ๐๐ฑ๐๐๐ฎ๐ญ๐ ๐๐๐-๐ฅ๐ข๐ค๐ ๐๐ฑ๐ฉ๐ซ๐๐ฌ๐ฌ๐ข๐จ๐ง๐ฌ.\n", | |
"\n", | |
"๐ต ๐๐ฒ๐ง๐ญ๐๐ฑ:- ๐๐ฑ๐ฉ๐ซ(๐ฌ๐ญ๐ซ)\n", | |
"\n", | |
"โ๏ธ It will take SQL expression as a ๐ฌ๐ญ๐ซ๐ข๐ง๐ ๐๐ซ๐ ๐ฎ๐ฆ๐๐ง๐ญ and performs the operations within the expression.\n", | |
"\n", | |
"โ๏ธ It allows using SQL-like functions that are not present in PySpark Column type & pyspark.sql.functions API. Ex:- ๐๐๐๐ ๐๐๐๐\n", | |
"\n", | |
"โ๏ธ We are allowed to use ๐๐๐ญ๐๐ ๐ซ๐๐ฆ๐ ๐๐จ๐ฅ๐ฎ๐ฆ๐ง๐ฌ in the expression.\n", | |
"\n", | |
"\n", | |
"Follow for more:- \n", | |
"https://www.linkedin.com/in/kishanyadav/\n", | |
"\n", | |
"\n", | |
"############...... H@@py Learning!!! .......##########" | |
], | |
"metadata": { | |
"id": "55i2WzI53ceu" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# install pyspark in google colab\n", | |
"!pip install pyspark" | |
], | |
"metadata": { | |
"id": "eq0lZ5biTCFe" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# importing neccessary libs\n", | |
"from pyspark.sql import SparkSession\n", | |
"from pyspark.sql.functions import expr\n", | |
"\n", | |
"# creating session\n", | |
"spark = SparkSession.builder.appName(\"practice\").getOrCreate()\n", | |
"\n", | |
"# create dataframe\n", | |
"data = [(\"Prashant\",\"Banglore\",25, 58, \"2022-08-01\", 1), (\"Ankit\",\"Banglore\",26, 54, \"2021-05-02\", 2),\n", | |
" (\"Ramakant\",\"Gurugram\",24, 60, \"2022-06-02\", 3), (\"Brijesh\",\"Gazipur\", 26, 75, \"2022-07-04\", 4),\n", | |
" (\"Devendra\",\"Gurugram\", 27, 62, \"2022-04-03\", 5), (\"Ajay\",\"Chandigarh\", 25, 72, \"2022-02-01\", 6)]\n", | |
"columns= [\"friends_name\",\"location\", \"age\", \"weight\", \"meetup_date\", \"offset\"]\n", | |
"df_friends = spark.createDataFrame(data = data, schema = columns)\n", | |
"df_friends.show()\n", | |
"\n", | |
"# Concatenating One or More Columns\n", | |
"# concate friends name, age and location columns using expr()\n", | |
"df_concat = df_friends.withColumn(\"name-age-location\", expr(\"friends_name|| '-'|| age || '-' || location\"))\n", | |
"df_concat.show()\n", | |
"\n", | |
"# Add a New Column Based on Conditions\n", | |
"# check if exercise needed based on weight\n", | |
"# if weight is more or equal to 60 -- Yes\n", | |
"# if weight is less than 55 -- No\n", | |
"# else \"Enjoy\"\n", | |
"df_condition = df_friends.withColumn(\"Exercise_Need\", expr(\"CASE WHEN weight >= 60 THEN 'Yes' \" + \"WHEN weight < 55 THEN 'No' ELSE 'Enjoy' END\"))\n", | |
"df_condition.show()\n", | |
"\n", | |
"# Creating a new column using the existing column value inside the expression\n", | |
"# let increment the meetup month by number of offset\n", | |
"df_meetup = df_friends.withColumn(\"new_meetup_date\", expr(\"add_months(meetup_date,offset)\"))\n", | |
"df_meetup.show()" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "6pSMJtnR2aJF", | |
"outputId": "3f875bde-d216-4883-9e87-f3137ecf5da7" | |
}, | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"+----------+----------------+\n", | |
"|state_name|state_population|\n", | |
"+----------+----------------+\n", | |
"| Delhi| 30000|\n", | |
"| Mumbai| 50000|\n", | |
"| Gujrat| 80000|\n", | |
"+----------+----------------+\n", | |
"\n", | |
"+----------+----------------+-------------------+\n", | |
"|state_name|state_population|states_name_leftpad|\n", | |
"+----------+----------------+-------------------+\n", | |
"|Delhi |30000 |#####Delhi |\n", | |
"|Mumbai |50000 |####Mumbai |\n", | |
"|Gujrat |80000 |####Gujrat |\n", | |
"+----------+----------------+-------------------+\n", | |
"\n", | |
"+----------+----------------+-------------------+--------------------+\n", | |
"|state_name|state_population|states_name_leftpad|states_name_rightpad|\n", | |
"+----------+----------------+-------------------+--------------------+\n", | |
"|Delhi |30000 |#####Delhi |Delhi##### |\n", | |
"|Mumbai |50000 |####Mumbai |Mumbai#### |\n", | |
"|Gujrat |80000 |####Gujrat |Gujrat#### |\n", | |
"+----------+----------------+-------------------+--------------------+\n", | |
"\n", | |
"+----------+----------------+-------------------+--------------------+---------------------+\n", | |
"|state_name|state_population|states_name_leftpad|states_name_rightpad|states_name_condition|\n", | |
"+----------+----------------+-------------------+--------------------+---------------------+\n", | |
"|Delhi |30000 |#####Delhi |Delhi##### |Del |\n", | |
"|Mumbai |50000 |####Mumbai |Mumbai#### |Mum |\n", | |
"|Gujrat |80000 |####Gujrat |Gujrat#### |Guj |\n", | |
"+----------+----------------+-------------------+--------------------+---------------------+\n", | |
"\n" | |
] | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment