# RachidAZ/DF_with_schema.py

## DF_with_schema.py
# Describe the DataFrame schema as JSON text: a "struct" with two nullable
# fields, `id` (integer) and `name` (string).
jsonStringFromFile="""
{
  "type": "struct",
  "fields": [
    {
      "name": "id",
      "type": "integer",
      "nullable": true,
      "metadata": {}
    },
    {
      "name": "name",
      "type": "string",
      "nullable": true,
      "metadata": {}
    }
    ]}
    """

import json
from pyspark.sql.types import *

# Parse the JSON text into a plain Python mapping. It is bound to
# `schema_dict` (not `dict`) so the builtin `dict` type stays usable for
# any code that runs afterwards in the same session.
schema_dict = json.loads(jsonStringFromFile)

# Build the Spark schema object from the parsed mapping.
new_schema = StructType.fromJson(schema_dict)

# Read the CSV with the explicit schema instead of letting Spark infer one.
# NOTE(review): `spark` and `filePath` are not defined in this snippet; they
# must already exist in the session (presumably a SparkSession and the path
# of the CSV input) - confirm against the calling notebook.
factDF_curated = spark.read.format("csv").load(filePath, schema=new_schema, header=True)

# ------------------ Below: how to save the schema of a DataFrame as JSON.
# NOTE(review): the original heading says "in the DBFS", but the relative
# path given to open() below is resolved against the current working
# directory of the Python process - confirm whether a DBFS path was intended.

# One row is enough: only the schema of the result is used. The f-prefix
# makes Python substitute `table_name` into the query; without it the
# literal text "{table_name}" would be sent to the SQL parser.
# NOTE(review): `spark` and `table_name` are not defined in this snippet and
# must already exist in the session - confirm against the calling notebook.
df=spark.sql(f"select * from {table_name} limit 1")

import json
from pyspark.sql.types import *

# Write the schema: serialise the DataFrame's schema to a JSON file.
with open("schema_file.json", "w") as f:
    json.dump(df.schema.jsonValue(), f)

# Read the schema: load the JSON file back and print it as a one-line string.
with open("schema_file.json") as f:
    print(json.dumps(json.load(f)))
	# NOTE(review): from here down the file repeats the snippet above; this looks like a paste artifact - confirm whether it can be dropped.
# Describe the DataFrame schema as JSON text: a "struct" with two nullable
# fields, `id` (integer) and `name` (string).
jsonStringFromFile="""
	{
	"type": "struct",
	"fields": [
	{
	"name": "id",
	"type": "integer",
	"nullable": true,
	"metadata": {}
	},
	{
	"name": "name",
	"type": "string",
	"nullable": true,
	"metadata": {}
	}
	]}
	"""

import json
from pyspark.sql.types import *

# Parse the JSON text into a plain Python mapping. It is bound to
# `schema_dict` (not `dict`) so the builtin `dict` type stays usable for
# any code that runs afterwards in the same session.
schema_dict = json.loads(jsonStringFromFile)

# Build the Spark schema object from the parsed mapping.
new_schema = StructType.fromJson(schema_dict)

# Read the CSV with the explicit schema instead of letting Spark infer one.
# NOTE(review): `spark` and `filePath` are not defined in this snippet; they
# must already exist in the session - confirm against the calling notebook.
factDF_curated = spark.read.format("csv").load(filePath, schema=new_schema, header=True)

# ------------------ Below: how to save the schema of a DataFrame as JSON.
# NOTE(review): the original heading says "in the DBFS", but the relative
# path given to open() below is resolved against the current working
# directory of the Python process - confirm whether a DBFS path was intended.

# One row is enough: only the schema of the result is used. The f-prefix
# makes Python substitute `table_name` into the query; without it the
# literal text "{table_name}" would be sent to the SQL parser.
# NOTE(review): `table_name` is not defined in this snippet - confirm.
df=spark.sql(f"select * from {table_name} limit 1")

import json
from pyspark.sql.types import *

# Write the schema: serialise the DataFrame's schema to a JSON file.
with open("schema_file.json", "w") as f:
    json.dump(df.schema.jsonValue(), f)

# Read the schema: load the JSON file back and print it as a one-line string.
with open("schema_file.json") as f:
    print(json.dumps(json.load(f)))