mkaranasou/pyspark_index_with_row_num_sortable_data.py

## pyspark_index_with_row_num_sortable_data.py
>>> from pyspark.sql import Window
# the window is necessary here because row_number is a windowing function
# that means you can have row_number run over some amount of your data
# we'll be currently running it over the sorted by column1 data, row per row - our window will be of size 2 (rows),
# the whole dataframe that is.
>>> window = Window.orderBy(F.col('column1'))
>>> df_final = df_final.withColumn('row_number', F.row_number().over(window)
>>> df_final.select('index', 'row_number', 'column1', 'column2').show()

+-----+----------+-------+-------+
|index|row_number|column1|column2|
+-----+----------+-------+-------+
|    0|         1|      1|      2|
|    1|         2|     15|     21|
+-----+----------+-------+-------+
	>>> from pyspark.sql import Window
	# the window is necessary here because row_number is a windowing function
	# that means you can have row_number run over some amount of your data
	# we'll be currently running it over the sorted by column1 data, row per row - our window will be of size 2 (rows),
	# the whole dataframe that is.
	>>> window = Window.orderBy(F.col('column1'))
	>>> df_final = df_final.withColumn('row_number', F.row_number().over(window)
	>>> df_final.select('index', 'row_number', 'column1', 'column2').show()

	+-----+----------+-------+-------+
	|index|row_number|column1|column2|
	+-----+----------+-------+-------+
	|    0|         1|      1|      2|
	|    1|         2|     15|     21|
	+-----+----------+-------+-------+