Skip to content

Instantly share code, notes, and snippets.

@RachidAZ
Created January 16, 2022 22:52
Show Gist options
  • Save RachidAZ/ce2dfe10ce051ffee901e59a4041b1b8 to your computer and use it in GitHub Desktop.
Save RachidAZ/ce2dfe10ce051ffee901e59a4041b1b8 to your computer and use it in GitHub Desktop.
Create a DataFrame with a defined schema in PySpark
# Define the DataFrame schema as a JSON document: a "struct" with one entry per
# column (name, type, nullable, metadata). Here: an integer `id` and a string
# `name`, both nullable.
jsonStringFromFile = """
{
"type": "struct",
"fields": [
{
"name": "id",
"type": "integer",
"nullable": true,
"metadata": {}
},
{
"name": "name",
"type": "string",
"nullable": true,
"metadata": {}
}
]}
"""
import json
from pyspark.sql.types import *

# Parse the JSON text into a plain Python mapping. It is bound to `schema_dict`
# rather than `dict` so the builtin `dict` type is not shadowed for any code
# that runs after this snippet.
schema_dict = json.loads(jsonStringFromFile)

# Build the StructType that describes the columns.
new_schema = StructType.fromJson(schema_dict)

# Read the CSV with the explicit schema, treating the first row as a header.
# NOTE: `spark` (the active SparkSession) and `filePath` are not defined in
# this snippet; they must already exist in the calling environment.
factDF_curated = spark.read.format("csv").load(filePath, schema=new_schema, header=True)
# ------------------ Below: how to save the schema of a DataFrame to a JSON file.
# NOTE(review): "schema_file.json" is a relative path handed to the builtin
# open(), so the file is created in the current working directory of the
# Python process. If the schema must end up in DBFS, the path presumably needs
# to point at a DBFS mount (e.g. under /dbfs/) - confirm for your environment.
import json
from pyspark.sql.types import *

# `spark` (the active SparkSession) and `table_name` must already be defined.
# The query is an f-string so that {table_name} is actually substituted; as a
# plain string the literal text "{table_name}" would be sent to Spark SQL.
# A single row is enough because only the schema is used.
df = spark.sql(f"select * from {table_name} limit 1")

# Write the schema
with open("schema_file.json", "w") as f:
    json.dump(df.schema.jsonValue(), f)

# Read the schema back and print it as a compact JSON string
with open("schema_file.json") as f:
    print(json.dumps(json.load(f)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment