Created
August 19, 2022 06:55
-
-
Save kishanpython/1b81511219905bfcdc2fa7b3410b498d to your computer and use it in GitHub Desktop.
Functions to add padding to a column in #PySpark ✍️
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "pyspark_paddding.ipynb", | |
"provenance": [] | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"# Functions to add padding to a column in #PySpark ✍️\n", | |
"\n", | |
"𝐥𝐩𝐚𝐝() -\n", | |
"====\n", | |
"👉In order to add padding to the left side of the column we use 𝐥𝐩𝐚𝐝() function.\n", | |
"\n", | |
"👉This function takes column name ,length and padding string as arguments.\n", | |
"\n", | |
"𝐫𝐩𝐚𝐝() -\n", | |
"=====\n", | |
"👉To add padding to the right side of the column we use 𝐫𝐩𝐚𝐝() function.\n", | |
"\n", | |
"👉This function also takes column name ,length and padding string as arguments. Same as lpad.\n", | |
"\n", | |
"==============================\n", | |
"\n", | |
"𝐋𝐞𝐟𝐭 𝐩𝐚𝐝 𝐨𝐟 𝐭𝐡𝐞 𝐜𝐨𝐥𝐮𝐦𝐧 𝐢𝐧 𝐩𝐲𝐬𝐩𝐚𝐫𝐤 – 𝐥𝐩𝐚𝐝()\n", | |
"\n", | |
"𝐑𝐢𝐠𝐡𝐭 𝐩𝐚𝐝 𝐨𝐟 𝐭𝐡𝐞 𝐜𝐨𝐥𝐮𝐦𝐧 𝐢𝐧 𝐩𝐲𝐬𝐩𝐚𝐫𝐤 – 𝐫𝐩𝐚𝐝()\n", | |
"\n", | |
"==============================\n", | |
"\n", | |
"\n", | |
"Note:-\n", | |
"====\n", | |
"⚠️ If column value is longer than specified length, the return value will be shortened to length characters or bytes.\n", | |
"\n", | |
"⚠️ If padding value is not specified, then column value will be padded to the left or right depending on the function you are using, with space characters if it is a character string, and with zeros if it is a byte sequence.\n", | |
"\n", | |
"\n", | |
"\n", | |
"Follow for more:- \n", | |
"https://www.linkedin.com/in/kishanyadav/\n", | |
"\n", | |
"############...... H@@py Learning!!! .......##########" | |
], | |
"metadata": { | |
"id": "55i2WzI53ceu" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# install pyspark in google colab\n", | |
"!pip install pyspark" | |
], | |
"metadata": { | |
"id": "eq0lZ5biTCFe" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# importing neccessary libs\n", | |
"from pyspark.sql import SparkSession\n", | |
"from pyspark.sql.functions import col, lpad, rpad\n", | |
"\n", | |
"# creating session\n", | |
"spark = SparkSession.builder.appName(\"practice\").getOrCreate()\n", | |
"\n", | |
"# creating dataframe\n", | |
"data = [(\"Delhi\",30000),(\"Mumbai\",50000),(\"Gujrat\",80000)]\n", | |
"columns= [\"state_name\",\"state_population\"]\n", | |
"df_states = spark.createDataFrame(data = data, schema = columns)\n", | |
"df_states.show()\n", | |
"\n", | |
"# left padding\n", | |
"df_states = df_states.withColumn('states_name_leftpad', lpad(col(\"state_name\"), 10, '#'))\n", | |
"df_states.show(truncate =False)\n", | |
"\n", | |
"# right padding\n", | |
"df_states = df_states.withColumn('states_name_rightpad', rpad(col(\"state_name\"), 10, '#'))\n", | |
"df_states.show(truncate =False)\n", | |
"\n", | |
"# when the column string is longer than the padded string length \n", | |
"df_states = df_states.withColumn('states_name_condition', lpad(col(\"state_name\"), 3, '#'))\n", | |
"df_states.show(truncate =False)\n" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "6pSMJtnR2aJF", | |
"outputId": "3f875bde-d216-4883-9e87-f3137ecf5da7" | |
}, | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"+----------+----------------+\n", | |
"|state_name|state_population|\n", | |
"+----------+----------------+\n", | |
"| Delhi| 30000|\n", | |
"| Mumbai| 50000|\n", | |
"| Gujrat| 80000|\n", | |
"+----------+----------------+\n", | |
"\n", | |
"+----------+----------------+-------------------+\n", | |
"|state_name|state_population|states_name_leftpad|\n", | |
"+----------+----------------+-------------------+\n", | |
"|Delhi |30000 |#####Delhi |\n", | |
"|Mumbai |50000 |####Mumbai |\n", | |
"|Gujrat |80000 |####Gujrat |\n", | |
"+----------+----------------+-------------------+\n", | |
"\n", | |
"+----------+----------------+-------------------+--------------------+\n", | |
"|state_name|state_population|states_name_leftpad|states_name_rightpad|\n", | |
"+----------+----------------+-------------------+--------------------+\n", | |
"|Delhi |30000 |#####Delhi |Delhi##### |\n", | |
"|Mumbai |50000 |####Mumbai |Mumbai#### |\n", | |
"|Gujrat |80000 |####Gujrat |Gujrat#### |\n", | |
"+----------+----------------+-------------------+--------------------+\n", | |
"\n", | |
"+----------+----------------+-------------------+--------------------+---------------------+\n", | |
"|state_name|state_population|states_name_leftpad|states_name_rightpad|states_name_condition|\n", | |
"+----------+----------------+-------------------+--------------------+---------------------+\n", | |
"|Delhi |30000 |#####Delhi |Delhi##### |Del |\n", | |
"|Mumbai |50000 |####Mumbai |Mumbai#### |Mum |\n", | |
"|Gujrat |80000 |####Gujrat |Gujrat#### |Guj |\n", | |
"+----------+----------------+-------------------+--------------------+---------------------+\n", | |
"\n" | |
] | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment