kishanpython/pyspark_paddding.ipynb

## pyspark_paddding.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "pyspark_paddding.ipynb",
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# Functions to add padding to a column in #PySpark ✍️\n",
        "\n",
        "𝐥𝐩𝐚𝐝() -\n",
        "====\n",
        "👉 To add padding to the left side of a column, use the 𝐥𝐩𝐚𝐝() function.\n",
        "\n",
        "👉 This function takes the column (or column name), the target length, and the padding string as arguments.\n",
        "\n",
        "𝐫𝐩𝐚𝐝() -\n",
        "=====\n",
        "👉 To add padding to the right side of a column, use the 𝐫𝐩𝐚𝐝() function.\n",
        "\n",
        "👉 This function takes the same arguments as 𝐥𝐩𝐚𝐝(): the column (or column name), the target length, and the padding string.\n",
        "\n",
        "==============================\n",
        "\n",
        "𝐋𝐞𝐟𝐭 𝐩𝐚𝐝 𝐨𝐟 𝐭𝐡𝐞 𝐜𝐨𝐥𝐮𝐦𝐧 𝐢𝐧 𝐩𝐲𝐬𝐩𝐚𝐫𝐤 – 𝐥𝐩𝐚𝐝()\n",
        "\n",
        "𝐑𝐢𝐠𝐡𝐭 𝐩𝐚𝐝 𝐨𝐟 𝐭𝐡𝐞 𝐜𝐨𝐥𝐮𝐦𝐧 𝐢𝐧 𝐩𝐲𝐬𝐩𝐚𝐫𝐤 – 𝐫𝐩𝐚𝐝()\n",
        "\n",
        "==============================\n",
        "\n",
        "\n",
        "Note:-\n",
        "====\n",
        "⚠️ If the column value is longer than the specified length, the return value is truncated to that many characters (or bytes); for example, `Delhi` with length 3 becomes `Del`.\n",
        "\n",
        "⚠️ If the padding value is not specified, the column value is padded on the left or right (depending on the function you are using) with space characters if it is a character string, and with zeros if it is a byte sequence. This describes the Spark SQL form `lpad(str, len)`; the example below always passes an explicit padding string.\n",
        "\n",
        "\n",
        "\n",
        "Follow for more:- \n",
        "https://www.linkedin.com/in/kishanyadav/\n",
        "\n",
        "############......  H@@py Learning!!! .......##########"
      ],
      "metadata": {
        "id": "55i2WzI53ceu"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# install pyspark in google colab\n",
        "!pip install pyspark"
      ],
      "metadata": {
        "id": "eq0lZ5biTCFe"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# importing necessary libs\n",
        "from pyspark.sql import SparkSession\n",
        "from pyspark.sql.functions import col, lpad, rpad\n",
        "\n",
        "# creating session\n",
        "spark = SparkSession.builder.appName(\"practice\").getOrCreate()\n",
        "\n",
        "# creating dataframe\n",
        "data = [(\"Delhi\",30000),(\"Mumbai\",50000),(\"Gujrat\",80000)]\n",
        "columns= [\"state_name\",\"state_population\"]\n",
        "df_states = spark.createDataFrame(data = data, schema = columns)\n",
        "df_states.show()\n",
        "\n",
        "# left padding\n",
        "df_states = df_states.withColumn('states_name_leftpad', lpad(col(\"state_name\"), 10, '#'))\n",
        "df_states.show(truncate =False)\n",
        "\n",
        "# right padding\n",
        "df_states = df_states.withColumn('states_name_rightpad', rpad(col(\"state_name\"), 10, '#'))\n",
        "df_states.show(truncate =False)\n",
        "\n",
        "# when the column string is longer than the padded string length  \n",
        "df_states = df_states.withColumn('states_name_condition', lpad(col(\"state_name\"), 3, '#'))\n",
        "df_states.show(truncate =False)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "6pSMJtnR2aJF",
        "outputId": "3f875bde-d216-4883-9e87-f3137ecf5da7"
      },
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "+----------+----------------+\n",
            "|state_name|state_population|\n",
            "+----------+----------------+\n",
            "|     Delhi|           30000|\n",
            "|    Mumbai|           50000|\n",
            "|    Gujrat|           80000|\n",
            "+----------+----------------+\n",
            "\n",
            "+----------+----------------+-------------------+\n",
            "|state_name|state_population|states_name_leftpad|\n",
            "+----------+----------------+-------------------+\n",
            "|Delhi     |30000           |#####Delhi         |\n",
            "|Mumbai    |50000           |####Mumbai         |\n",
            "|Gujrat    |80000           |####Gujrat         |\n",
            "+----------+----------------+-------------------+\n",
            "\n",
            "+----------+----------------+-------------------+--------------------+\n",
            "|state_name|state_population|states_name_leftpad|states_name_rightpad|\n",
            "+----------+----------------+-------------------+--------------------+\n",
            "|Delhi     |30000           |#####Delhi         |Delhi#####          |\n",
            "|Mumbai    |50000           |####Mumbai         |Mumbai####          |\n",
            "|Gujrat    |80000           |####Gujrat         |Gujrat####          |\n",
            "+----------+----------------+-------------------+--------------------+\n",
            "\n",
            "+----------+----------------+-------------------+--------------------+---------------------+\n",
            "|state_name|state_population|states_name_leftpad|states_name_rightpad|states_name_condition|\n",
            "+----------+----------------+-------------------+--------------------+---------------------+\n",
            "|Delhi     |30000           |#####Delhi         |Delhi#####          |Del                  |\n",
            "|Mumbai    |50000           |####Mumbai         |Mumbai####          |Mum                  |\n",
            "|Gujrat    |80000           |####Gujrat         |Gujrat####          |Guj                  |\n",
            "+----------+----------------+-------------------+--------------------+---------------------+\n",
            "\n"
          ]
        }
      ]
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "pyspark_paddding.ipynb",
	"provenance": []
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"source": [
	"# Functions to add padding to a column in #PySpark ✍️\n",
	"\n",
	"𝐥𝐩𝐚𝐝() -\n",
	"====\n",
	"👉 To add padding to the left side of a column, use the 𝐥𝐩𝐚𝐝() function.\n",
	"\n",
	"👉 This function takes the column (or column name), the target length, and the padding string as arguments.\n",
	"\n",
	"𝐫𝐩𝐚𝐝() -\n",
	"=====\n",
	"👉 To add padding to the right side of a column, use the 𝐫𝐩𝐚𝐝() function.\n",
	"\n",
	"👉 This function takes the same arguments as 𝐥𝐩𝐚𝐝(): the column (or column name), the target length, and the padding string.\n",
	"\n",
	"==============================\n",
	"\n",
	"𝐋𝐞𝐟𝐭 𝐩𝐚𝐝 𝐨𝐟 𝐭𝐡𝐞 𝐜𝐨𝐥𝐮𝐦𝐧 𝐢𝐧 𝐩𝐲𝐬𝐩𝐚𝐫𝐤 – 𝐥𝐩𝐚𝐝()\n",
	"\n",
	"𝐑𝐢𝐠𝐡𝐭 𝐩𝐚𝐝 𝐨𝐟 𝐭𝐡𝐞 𝐜𝐨𝐥𝐮𝐦𝐧 𝐢𝐧 𝐩𝐲𝐬𝐩𝐚𝐫𝐤 – 𝐫𝐩𝐚𝐝()\n",
	"\n",
	"==============================\n",
	"\n",
	"\n",
	"Note:-\n",
	"====\n",
	"⚠️ If the column value is longer than the specified length, the return value is truncated to that many characters (or bytes); for example, `Delhi` with length 3 becomes `Del`.\n",
	"\n",
	"⚠️ If the padding value is not specified, the column value is padded on the left or right (depending on the function you are using) with space characters if it is a character string, and with zeros if it is a byte sequence. This describes the Spark SQL form `lpad(str, len)`; the example below always passes an explicit padding string.\n",
	"\n",
	"\n",
	"\n",
	"Follow for more:- \n",
	"https://www.linkedin.com/in/kishanyadav/\n",
	"\n",
	"############...... H@@py Learning!!! .......##########"
	],
	"metadata": {
	"id": "55i2WzI53ceu"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"# install pyspark in google colab\n",
	"!pip install pyspark"
	],
	"metadata": {
	"id": "eq0lZ5biTCFe"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# importing necessary libs\n",
	"from pyspark.sql import SparkSession\n",
	"from pyspark.sql.functions import col, lpad, rpad\n",
	"\n",
	"# creating session\n",
	"spark = SparkSession.builder.appName(\"practice\").getOrCreate()\n",
	"\n",
	"# creating dataframe\n",
	"data = [(\"Delhi\",30000),(\"Mumbai\",50000),(\"Gujrat\",80000)]\n",
	"columns= [\"state_name\",\"state_population\"]\n",
	"df_states = spark.createDataFrame(data = data, schema = columns)\n",
	"df_states.show()\n",
	"\n",
	"# left padding\n",
	"df_states = df_states.withColumn('states_name_leftpad', lpad(col(\"state_name\"), 10, '#'))\n",
	"df_states.show(truncate =False)\n",
	"\n",
	"# right padding\n",
	"df_states = df_states.withColumn('states_name_rightpad', rpad(col(\"state_name\"), 10, '#'))\n",
	"df_states.show(truncate =False)\n",
	"\n",
	"# when the column string is longer than the padded string length \n",
	"df_states = df_states.withColumn('states_name_condition', lpad(col(\"state_name\"), 3, '#'))\n",
	"df_states.show(truncate =False)\n"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "6pSMJtnR2aJF",
	"outputId": "3f875bde-d216-4883-9e87-f3137ecf5da7"
	},
	"execution_count": 3,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"+----------+----------------+\n",
	"|state_name|state_population|\n",
	"+----------+----------------+\n",
	"|     Delhi|           30000|\n",
	"|    Mumbai|           50000|\n",
	"|    Gujrat|           80000|\n",
	"+----------+----------------+\n",
	"\n",
	"+----------+----------------+-------------------+\n",
	"|state_name|state_population|states_name_leftpad|\n",
	"+----------+----------------+-------------------+\n",
	"|Delhi     |30000           |#####Delhi         |\n",
	"|Mumbai    |50000           |####Mumbai         |\n",
	"|Gujrat    |80000           |####Gujrat         |\n",
	"+----------+----------------+-------------------+\n",
	"\n",
	"+----------+----------------+-------------------+--------------------+\n",
	"|state_name|state_population|states_name_leftpad|states_name_rightpad|\n",
	"+----------+----------------+-------------------+--------------------+\n",
	"|Delhi     |30000           |#####Delhi         |Delhi#####          |\n",
	"|Mumbai    |50000           |####Mumbai         |Mumbai####          |\n",
	"|Gujrat    |80000           |####Gujrat         |Gujrat####          |\n",
	"+----------+----------------+-------------------+--------------------+\n",
	"\n",
	"+----------+----------------+-------------------+--------------------+---------------------+\n",
	"|state_name|state_population|states_name_leftpad|states_name_rightpad|states_name_condition|\n",
	"+----------+----------------+-------------------+--------------------+---------------------+\n",
	"|Delhi     |30000           |#####Delhi         |Delhi#####          |Del                  |\n",
	"|Mumbai    |50000           |####Mumbai         |Mumbai####          |Mum                  |\n",
	"|Gujrat    |80000           |####Gujrat         |Gujrat####          |Guj                  |\n",
	"+----------+----------------+-------------------+--------------------+---------------------+\n",
	"\n"
	]
	}
	]
	}
	]
	}