@kishanpython
Last active June 26, 2024 20:46
Uses of expr in pyspark
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "pyspark_expr.ipynb",
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# Use of expr() in PySpark\n",
"\n",
"expr()\n",
"====\n",
"☑️ It is a SQL function in PySpark that executes SQL-like expressions.\n",
"\n",
"🔵 Syntax: expr(str)\n",
"\n",
"☑️ It takes a SQL expression as a string argument and performs the operations defined in that expression.\n",
"\n",
"☑️ It allows using SQL functions that have no direct equivalent in the PySpark Column type or the pyspark.sql.functions API, e.g. CASE WHEN.\n",
"\n",
"☑️ DataFrame columns can be referenced inside the expression.\n",
"\n",
"\n",
"Follow for more:\n",
"https://www.linkedin.com/in/kishanyadav/\n",
"\n",
"\n",
"Happy Learning!"
],
"metadata": {
"id": "55i2WzI53ceu"
}
},
{
"cell_type": "code",
"source": [
"# install pyspark in google colab\n",
"!pip install pyspark"
],
"metadata": {
"id": "eq0lZ5biTCFe"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# importing necessary libs\n",
"from pyspark.sql import SparkSession\n",
"from pyspark.sql.functions import expr\n",
"\n",
"# creating session\n",
"spark = SparkSession.builder.appName(\"practice\").getOrCreate()\n",
"\n",
"# create dataframe\n",
"data = [(\"Prashant\",\"Bangalore\",25, 58, \"2022-08-01\", 1), (\"Ankit\",\"Bangalore\",26, 54, \"2021-05-02\", 2),\n",
" (\"Ramakant\",\"Gurugram\",24, 60, \"2022-06-02\", 3), (\"Brijesh\",\"Ghazipur\", 26, 75, \"2022-07-04\", 4),\n",
" (\"Devendra\",\"Gurugram\", 27, 62, \"2022-04-03\", 5), (\"Ajay\",\"Chandigarh\", 25, 72, \"2022-02-01\", 6)]\n",
"columns= [\"friends_name\",\"location\", \"age\", \"weight\", \"meetup_date\", \"offset\"]\n",
"df_friends = spark.createDataFrame(data = data, schema = columns)\n",
"df_friends.show()\n",
"\n",
"# Concatenating One or More Columns\n",
"# concatenate the friends_name, age and location columns using expr()\n",
"df_concat = df_friends.withColumn(\"name-age-location\", expr(\"friends_name || '-' || age || '-' || location\"))\n",
"df_concat.show()\n",
"\n",
"# Add a New Column Based on Conditions\n",
"# check if exercise needed based on weight\n",
"# if weight is more or equal to 60 -- Yes\n",
"# if weight is less than 55 -- No\n",
"# else \"Enjoy\"\n",
"df_condition = df_friends.withColumn(\"Exercise_Need\", expr(\"CASE WHEN weight >= 60 THEN 'Yes' \" + \"WHEN weight < 55 THEN 'No' ELSE 'Enjoy' END\"))\n",
"df_condition.show()\n",
"\n",
"# Creating a new column using the existing column value inside the expression\n",
"# increment the meetup month by the offset value\n",
"df_meetup = df_friends.withColumn(\"new_meetup_date\", expr(\"add_months(meetup_date,offset)\"))\n",
"df_meetup.show()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "6pSMJtnR2aJF",
"outputId": "3f875bde-d216-4883-9e87-f3137ecf5da7"
},
"execution_count": null,
"outputs": []
}
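,
{
"cell_type": "markdown",
"source": [
"A few more expr() patterns (an illustrative sketch, reusing the df_friends DataFrame created above): a SQL alias with AS, a cast, and arithmetic, all inside the expression string."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# more expr() examples (sketch; reuses df_friends from the cell above)\n",
"df_more = df_friends.select(\n",
"    expr(\"friends_name AS name\"),                 # rename via SQL alias\n",
"    expr(\"cast(weight AS double) AS weight_kg\"),  # cast inside the expression\n",
"    expr(\"age + 1 AS age_next_year\")              # arithmetic on a column\n",
")\n",
"df_more.show()"
],
"metadata": {},
"execution_count": null,
"outputs": []
}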
]
}