# kovid-r/pyspark_cheatsheet_create_new_columnsdf.py

## pyspark_cheatsheet_create_new_columnsdf.py
# Examples of adding columns to a DataFrame with withColumn().
# NOTE(review): assumes `df` is an existing Spark DataFrame, `F` is
# pyspark.sql.functions and `StringType` comes from pyspark.sql.types;
# none of these are defined in this file, so confirm against the caller.

# Create a column with the default value = 'xyz'
df = df.withColumn('new_column', F.lit('xyz'))

# Create a column with default value as null
# (the cast gives the all-null literal an explicit string type)
df = df.withColumn('new_column', F.lit(None).cast(StringType()))

# Create a column using an existing column
df = df.withColumn('new_column', 1.4 * F.col('existing_column'))

# Another example using the MovieLens database
# The when() conditions are evaluated in order; otherwise() supplies the
# value for rows that match none of them.
df = df.withColumn(
    'test_col3',
    F.when(F.col('avg_ratings') < 7, 'OK')
    .when(F.col('avg_ratings') < 8, 'Good')
    .otherwise('Great'),
)
# show() prints the rows and returns None, so it is called on its own line
# rather than having its result assigned back to df.
df.show()

# Create a column using a UDF

def categorize(val):
    """Return the bucket label for a numeric value.

    Args:
        val: Number to classify, or ``None`` for a missing value.

    Returns:
        ``'bucket_1'`` when ``val`` is below 150, ``'bucket_2'`` otherwise,
        and ``None`` when ``val`` is ``None``.
    """
    if val is None:
        # None cannot be compared with an int in Python 3 (TypeError), so a
        # missing input is passed through as a missing result.
        return None
    return 'bucket_1' if val < 150 else 'bucket_2'

# Register the plain Python function as a Spark UDF that returns a string.
my_udf = F.udf(categorize, StringType())

# Apply the UDF to the column. Calling categorize('existing_column') directly
# would compare the literal string with 150 instead of the column's values.
df = df.withColumn('new_column', my_udf(F.col('existing_column')))

# The same examples repeated.

# Create a column with the default value = 'xyz'
df = df.withColumn('new_column', F.lit('xyz'))

# Create a column with default value as null
df = df.withColumn('new_column', F.lit(None).cast(StringType()))

# Create a column using an existing column
df = df.withColumn('new_column', 1.4 * F.col('existing_column'))

# Another example using the MovieLens database
df = df.withColumn(
    'test_col3',
    F.when(F.col('avg_ratings') < 7, 'OK')
    .when(F.col('avg_ratings') < 8, 'Good')
    .otherwise('Great'),
)
# show() prints the rows and returns None, so it is called on its own line
# rather than having its result assigned back to df.
df.show()

# Create a column using a UDF


def categorize(val):
    """Return the bucket label for a numeric value.

    Args:
        val: Number to classify, or ``None`` for a missing value.

    Returns:
        ``'bucket_1'`` when ``val`` is below 150, ``'bucket_2'`` otherwise,
        and ``None`` when ``val`` is ``None``.
    """
    if val is None:
        # None cannot be compared with an int in Python 3 (TypeError).
        return None
    if val < 150:
        return 'bucket_1'
    return 'bucket_2'


my_udf = F.udf(categorize, StringType())

df = df.withColumn('new_column', my_udf(F.col('existing_column')))