Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@lakshay-arora
Last active November 8, 2019 06:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lakshay-arora/ca3633e9894082683435d51e17bca132 to your computer and use it in GitHub Desktop.
Save lakshay-arora/ca3633e9894082683435d51e17bca132 to your computer and use it in GitHub Desktop.
import pyspark.sql.types as tp
# Explicit schema for the commentary CSV, built field by field in the order
# declared below. Every column is marked nullable.
# NOTE(review): Isboundary and Iswicket are declared BinaryType while the
# sibling flag Isball is BooleanType -- confirm this is intended for the data.
my_schema = (
    tp.StructType()
    .add('Batsman', tp.IntegerType(), nullable=True)
    .add('Batsman_Name', tp.StringType(), nullable=True)
    .add('Bowler', tp.IntegerType(), nullable=True)
    .add('Bowler_Name', tp.StringType(), nullable=True)
    .add('Commentary', tp.StringType(), nullable=True)
    .add('Detail', tp.StringType(), nullable=True)
    .add('Dismissed', tp.IntegerType(), nullable=True)
    .add('Id', tp.IntegerType(), nullable=True)
    .add('Isball', tp.BooleanType(), nullable=True)
    .add('Isboundary', tp.BinaryType(), nullable=True)
    .add('Iswicket', tp.BinaryType(), nullable=True)
    .add('Over', tp.DoubleType(), nullable=True)
    .add('Runs', tp.IntegerType(), nullable=True)
    .add('Timestamp', tp.TimestampType(), nullable=True)
)

# Load the CSV with the explicit schema instead of relying on inference;
# header=True is passed so the reader handles the file's header row.
my_data = spark.read.csv(
    'ind-ban-comment.csv',
    schema=my_schema,
    header=True,
)

# Show the column names, types and nullability of the loaded DataFrame.
my_data.printSchema()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment