Generate a deterministic unique identifier for each row: the ID is derived from the row's column values, so the same row always produces the same ID. The ID column is positioned as the first column of the DataFrame.
# Build a deterministic row ID: a SHA-256 digest of every column value
# joined with the "||" separator, prepended as the first column.
# NOTE(review): concat_ws silently skips NULL values, so two rows that differ
# only in where a NULL falls could hash identically — confirm this is
# acceptable for this dataset.
from pyspark.sql.functions import sha2, concat_ws

columns = df.columns
row_fingerprint = concat_ws("||", *columns)
df = df.withColumn(id_col, sha2(row_fingerprint, 256)).select([id_col] + columns)
df.printSchema()
Verify that the generated ID column is deterministic: regenerating it from the same row values must produce identical IDs.
# Sanity-check that the hash-based ID is deterministic: rebuild the ID column
# from scratch on a copy of the data and confirm both DataFrames are identical.
df.select(id_col).orderBy(id_col).limit(5).show(truncate=False)

# Drop the ID and recompute it with the exact same formula.
df2 = df.drop(id_col)
df2 = df2.withColumn(id_col, sha2(concat_ws("||", *df2.columns), 256))
df2 = df2.select([id_col] + columns)
df2.select(id_col).orderBy(id_col).limit(5).show(truncate=False)

# subtract() is one-directional, so check both directions: rows present in
# either DataFrame but not the other count as differences.
diff_rows = df.subtract(df2).count() + df2.subtract(df).count()
print("Number of rows that are different: {}".format(diff_rows))
assert diff_rows == 0, "Regenerated IDs do not match the original IDs"
# Note: the original df2.unpersist() was removed — df2 was never persisted,
# so the call was a misleading no-op.
Alternative approach: add a monotonically increasing identifier column. The ID column is positioned as the first column of the DataFrame.
# Prepend a monotonically increasing ID column.
# monotonically_increasing_id() guarantees increasing, unique 64-bit IDs,
# but NOT consecutive values (the partition ID occupies the upper bits).
from pyspark.sql.functions import monotonically_increasing_id

# Exclude any pre-existing ID column from the column list; otherwise, when
# this cell runs after the hash-ID cell above, select([id_col] + columns)
# would produce a duplicated id_col column.
columns = [c for c in df.columns if c != id_col]
df = df.withColumn(id_col, monotonically_increasing_id())
df = df.select([id_col] + columns)
df.show()
df.printSchema()