Created
September 8, 2023 16:01
-
-
Save dineshdharme/5ef9c9250bbcf56a4febe2e384870bd4 to your computer and use it in GitHub Desktop.
A simple example of data wrangling.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Here's one way to do it. Basically I asked the Robs node to be made into ROW. Then onwards its normal data wrangling. | |
import sys | |
from pyspark import SparkContext, SQLContext | |
from pyspark.sql import functions as F | |
from pyspark.sql import SparkSession | |
spark = SparkSession.builder \ | |
.appName("MyApp") \ | |
.config("spark.jars", "file:/mnt/684C285C4C2826F2/opt/github/pyspark-example/spark-jars/spark-xml_2.12-0.16.0.jar") \ | |
.getOrCreate() | |
sc = spark.sparkContext | |
sqlContext = SQLContext(sc) | |
input_path_xml = "../data/application_job.xml" | |
df = sqlContext.read \ | |
.format("com.databricks.spark.xml") \ | |
.option("rowTag", "Robs") \ | |
.load(input_path_xml) | |
print("Show the dataframe") | |
df.show(n=100, truncate=False) | |
print("Schema here") | |
df.printSchema() | |
df = df.withColumnRenamed("_AsOfDate", "AsOfDate") | |
df = df.withColumn("col1", F.col("Rob._col1")) | |
df = df.withColumn("col2", F.col("Rob._col2")) | |
df = df.withColumn("Remaining", F.col("Rob._Remaining")) | |
df = df.withColumn("Allocation", F.col("Rob.Allocation")).drop("Rob") | |
print("Show the dataframe") | |
df.show(n=100, truncate=False) | |
print("Schema here") | |
df.printSchema() | |
df = df.withColumn("Allocation_Element", F.explode(F.col("Allocation"))).drop("Allocation") | |
print("Show the dataframe") | |
df.show(n=100, truncate=False) | |
df = df.withColumn("Allocation_Name", F.col("Allocation_Element._Name"))\ | |
.withColumn("Allocation_Quantity", F.col("Allocation_Element._Quantity"))\ | |
.drop("Allocation_Element") | |
print("Show the dataframe") | |
df.show(n=100, truncate=False) | |
Output : | |
Show the dataframe | |
+-------------------------------------------------------------------------------------------------------------+-------------------------+ | |
|Rob |_AsOfDate | | |
+-------------------------------------------------------------------------------------------------------------+-------------------------+ | |
|{[{Test123, 8.5, null}, {Test456, 1.6, null}, {Test789, 0.4, null}, {Test135, 0.1, null}], 313.094, 5, TEST1}|2023-09-02T12:00:00-06:00| | |
|{[{Test123, 8.5, null}, {Test456, 1.6, null}, {Test789, 0.4, null}, {Test135, 0.1, null}], 414.094, 6, TEST2}|2023-09-02T12:00:77-07:00| | |
|{[{Test123, 8.5, null}, {Test456, 1.6, null}, {Test789, 0.4, null}, {Test135, 0.1, null}], 515.094, 7, TEST3}|2023-09-02T12:00:88-08:00| | |
+-------------------------------------------------------------------------------------------------------------+-------------------------+ | |
Schema here | |
root | |
|-- Rob: struct (nullable = true) | |
| |-- Allocation: array (nullable = true) | |
| | |-- element: struct (containsNull = true) | |
| | | |-- _Name: string (nullable = true) | |
| | | |-- _Quantity: double (nullable = true) | |
| | | |-- _VALUE: string (nullable = true) | |
| |-- _Remaining: double (nullable = true) | |
| |-- _col1: long (nullable = true) | |
| |-- _col2: string (nullable = true) | |
|-- _AsOfDate: string (nullable = true) | |
Show the dataframe | |
+-------------------------+----+-----+---------+----------------------------------------------------------------------------------------+ | |
|AsOfDate |col1|col2 |Remaining|Allocation | | |
+-------------------------+----+-----+---------+----------------------------------------------------------------------------------------+ | |
|2023-09-02T12:00:00-06:00|5 |TEST1|313.094 |[{Test123, 8.5, null}, {Test456, 1.6, null}, {Test789, 0.4, null}, {Test135, 0.1, null}]| | |
|2023-09-02T12:00:77-07:00|6 |TEST2|414.094 |[{Test123, 8.5, null}, {Test456, 1.6, null}, {Test789, 0.4, null}, {Test135, 0.1, null}]| | |
|2023-09-02T12:00:88-08:00|7 |TEST3|515.094 |[{Test123, 8.5, null}, {Test456, 1.6, null}, {Test789, 0.4, null}, {Test135, 0.1, null}]| | |
+-------------------------+----+-----+---------+----------------------------------------------------------------------------------------+ | |
Schema here | |
root | |
|-- AsOfDate: string (nullable = true) | |
|-- col1: long (nullable = true) | |
|-- col2: string (nullable = true) | |
|-- Remaining: double (nullable = true) | |
|-- Allocation: array (nullable = true) | |
| |-- element: struct (containsNull = true) | |
| | |-- _Name: string (nullable = true) | |
| | |-- _Quantity: double (nullable = true) | |
| | |-- _VALUE: string (nullable = true) | |
Show the dataframe | |
+-------------------------+----+-----+---------+--------------------+ | |
|AsOfDate |col1|col2 |Remaining|Allocation_Element | | |
+-------------------------+----+-----+---------+--------------------+ | |
|2023-09-02T12:00:00-06:00|5 |TEST1|313.094 |{Test123, 8.5, null}| | |
|2023-09-02T12:00:00-06:00|5 |TEST1|313.094 |{Test456, 1.6, null}| | |
|2023-09-02T12:00:00-06:00|5 |TEST1|313.094 |{Test789, 0.4, null}| | |
|2023-09-02T12:00:00-06:00|5 |TEST1|313.094 |{Test135, 0.1, null}| | |
|2023-09-02T12:00:77-07:00|6 |TEST2|414.094 |{Test123, 8.5, null}| | |
|2023-09-02T12:00:77-07:00|6 |TEST2|414.094 |{Test456, 1.6, null}| | |
|2023-09-02T12:00:77-07:00|6 |TEST2|414.094 |{Test789, 0.4, null}| | |
|2023-09-02T12:00:77-07:00|6 |TEST2|414.094 |{Test135, 0.1, null}| | |
|2023-09-02T12:00:88-08:00|7 |TEST3|515.094 |{Test123, 8.5, null}| | |
|2023-09-02T12:00:88-08:00|7 |TEST3|515.094 |{Test456, 1.6, null}| | |
|2023-09-02T12:00:88-08:00|7 |TEST3|515.094 |{Test789, 0.4, null}| | |
|2023-09-02T12:00:88-08:00|7 |TEST3|515.094 |{Test135, 0.1, null}| | |
+-------------------------+----+-----+---------+--------------------+ | |
Show the dataframe | |
+-------------------------+----+-----+---------+---------------+-------------------+ | |
|AsOfDate |col1|col2 |Remaining|Allocation_Name|Allocation_Quantity| | |
+-------------------------+----+-----+---------+---------------+-------------------+ | |
|2023-09-02T12:00:00-06:00|5 |TEST1|313.094 |Test123 |8.5 | | |
|2023-09-02T12:00:00-06:00|5 |TEST1|313.094 |Test456 |1.6 | | |
|2023-09-02T12:00:00-06:00|5 |TEST1|313.094 |Test789 |0.4 | | |
|2023-09-02T12:00:00-06:00|5 |TEST1|313.094 |Test135 |0.1 | | |
|2023-09-02T12:00:77-07:00|6 |TEST2|414.094 |Test123 |8.5 | | |
|2023-09-02T12:00:77-07:00|6 |TEST2|414.094 |Test456 |1.6 | | |
|2023-09-02T12:00:77-07:00|6 |TEST2|414.094 |Test789 |0.4 | | |
|2023-09-02T12:00:77-07:00|6 |TEST2|414.094 |Test135 |0.1 | | |
|2023-09-02T12:00:88-08:00|7 |TEST3|515.094 |Test123 |8.5 | | |
|2023-09-02T12:00:88-08:00|7 |TEST3|515.094 |Test456 |1.6 | | |
|2023-09-02T12:00:88-08:00|7 |TEST3|515.094 |Test789 |0.4 | | |
|2023-09-02T12:00:88-08:00|7 |TEST3|515.094 |Test135 |0.1 | | |
+-------------------------+----+-----+---------+---------------+-------------------+ | |
Input file for the above output: | |
<FormValues> | |
<FieldValues> | |
<FieldValue Id="TEST"> | |
<Robs AsOfDate="2023-09-02T12:00:00-06:00"> | |
<Rob col1="5" col2="TEST1" Remaining="313.094"> | |
<Allocation Name="Test123" Quantity="8.5"/> | |
<Allocation Name="Test456" Quantity="1.6"/> | |
<Allocation Name="Test789" Quantity="0.4"/> | |
<Allocation Name="Test135" Quantity="0.1"/> | |
</Rob> | |
</Robs> | |
</FieldValue> | |
</FieldValues> | |
</FormValues> | |
<FormValues> | |
<FieldValues> | |
<FieldValue Id="TEST"> | |
<Robs AsOfDate="2023-09-02T12:00:77-07:00"> | |
<Rob col1="6" col2="TEST2" Remaining="414.094"> | |
<Allocation Name="Test123" Quantity="8.5"/> | |
<Allocation Name="Test456" Quantity="1.6"/> | |
<Allocation Name="Test789" Quantity="0.4"/> | |
<Allocation Name="Test135" Quantity="0.1"/> | |
</Rob> | |
</Robs> | |
</FieldValue> | |
</FieldValues> | |
</FormValues> | |
<FormValues> | |
<FieldValues> | |
<FieldValue Id="TEST"> | |
<Robs AsOfDate="2023-09-02T12:00:88-08:00"> | |
<Rob col1="7" col2="TEST3" Remaining="515.094"> | |
<Allocation Name="Test123" Quantity="8.5"/> | |
<Allocation Name="Test456" Quantity="1.6"/> | |
<Allocation Name="Test789" Quantity="0.4"/> | |
<Allocation Name="Test135" Quantity="0.1"/> | |
</Rob> | |
</Robs> | |
</FieldValue> | |
</FieldValues> | |
</FormValues> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment