# CsBigDataHub/ColumnConversion.py

## ColumnConversion.py
# Demo: turn string columns that hold only 'Y'/'N' flags into boolean columns.
#
# Expects a live SparkContext bound to `sc` (for example the pyspark shell or
# a notebook session); nothing in this script creates it.
df = sc.parallelize([
    (1, 'Y', 'F', "Giri", 'Y'),
    (2, 'N', 'V', "Databricks", 'N'),
    (3, 'Y', 'B', "SparkEdge", 'Y'),
    (4, 'N', 'X', "Spark", 'N'),
]).toDF(["id", "flag1", "flag2", "name", "flag3"])
print('Show Dataframe')
df.show()
print('Actual Schema of the df')
df.printSchema()

# The only values a column may contain to be treated as a yes/no flag.
yes_no_values = {'Y', 'N'}

# A flag column has at most len(yes_no_values) distinct values plus NULL.
# Fetching one more distinct value than that is enough to prove a column is
# NOT a flag, while keeping the amount of data pulled to the driver bounded
# even for high-cardinality columns such as free-text names.
distinct_probe_size = len(yes_no_values) + 2

for col_name, col_type in df.dtypes:
    if col_type != 'string':
        continue

    # One Spark job per string column. Every distinct non-null value must be
    # a flag value; checking only the first row returned would depend on
    # arbitrary ordering and could cast a column with stray values to NULLs.
    probe_rows = df.select(col_name).distinct().limit(distinct_probe_size).collect()
    seen_values = {row[0] for row in probe_rows if row[0] is not None}

    # An all-NULL column gives an empty set and is left untouched.
    if seen_values and seen_values <= yes_no_values:
        # withColumn with an existing name replaces that column in place, so
        # no separate drop of the old column is needed.
        df = df.withColumn(col_name, df[col_name].cast("boolean"))

print('df with True/False Value after Data Type changes')
df.show()
print('Modified Schema of the df')
df.printSchema()