Kovid Rathee (kovid-r) — public gists
kovid-r / pyspark_cheatsheet_between.py
Last active October 11, 2022 04:49
Filter Between PySpark Cheatsheet
from pyspark.sql import functions as F

# Filter movies with 7.5 < avg_ratings < 8.2 (both bounds exclusive)
df.filter((F.col('avg_ratings') > 7.5) & (F.col('avg_ratings') < 8.2)).show()
# A more concise alternative; note that between() is inclusive of both bounds
df.filter(df.avg_ratings.between(7.5, 8.2)).show()
kovid-r / pyspark_cheatsheet_read_using_schema.py
Last active October 11, 2022 04:49
RDD to DataFrame using schema PySpark Cheatsheet
from pyspark.sql.types import StringType, StructField, StructType, IntegerType
from pyspark.sql import functions as F

# textFile lives on the SparkContext; split each CSV line into fields
# and cast age to int so it matches the schema below
rdd = spark.sparkContext.textFile(csv_file_path) \
    .map(lambda line: line.split(',')) \
    .map(lambda fields: (fields[0], fields[1], int(fields[2])))

schema = StructType([
    StructField("first_name", StringType(), True),
    StructField("last_name", StringType(), True),
    StructField("age", IntegerType(), True)
])
df = spark.createDataFrame(rdd, schema)

# When a new column is supposed to have nulls
df = df.withColumn('new_col_1', F.lit(None).cast(StringType()))
# When a new column is supposed to have 0 as the default value
df = df.withColumn('new_col_2', F.lit(0))
# When a new column is derived from two (or more) existing columns
df = df.withColumn('new_col_3', df.some_column / df.some_other_column)
kovid-r / pyspark_cheatsheet_isnull.py
Created June 13, 2020 11:39
isNull and isNotNull in Pyspark Cheatsheet
# Find all the films for which budget information is not available
df.where(df.budget.isNull()).show()
# Similarly, find all the films for which budget information is available
df.where(df.budget.isNotNull()).show()
kovid-r / pyspark_cheatsheet_aggregates.py
Created June 13, 2020 11:44
Aggregates in PySpark Cheatsheet
# Year-wise summary of a selected portion of the dataset
(df.groupBy('year')
   .agg(F.min('budget').alias('min_budget'),
        F.max('budget').alias('max_budget'),
        F.sum('revenue').alias('total_revenue'),
        F.avg('revenue').alias('avg_revenue'),
        F.mean('revenue').alias('mean_revenue'))  # mean() is an alias of avg()
   .sort(F.col('year').desc())
   .show())
kovid-r / pyspark_cheatsheet_windows_and_sorting.py
Created June 13, 2020 12:23
Sorting & Windows in PySpark Cheatsheet
from pyspark.sql import Window

# Rank all films by revenue in the default ascending order
df.select("title", "year", F.rank().over(Window.orderBy("revenue")).alias("revenue_rank")).show()
# Rank films by revenue in descending order within each year;
# desc() belongs on the column, not on the window specification
df.select("title", "year", F.rank().over(Window.partitionBy("year").orderBy(F.col("revenue").desc())).alias("revenue_rank")).show()
kovid-r / pyspark_cheatsheet_sort_orderby.py
Created June 13, 2020 12:27
Sorting & OrderBy in PySpark Cheatsheet
# sort() and orderBy() are aliases, and F.asc/F.desc are interchangeable
# with Column.asc()/Column.desc(), so each group of four below is equivalent
df.filter(df.year != '1998').sort(F.asc('year'))
df.filter(df.year != '1998').sort(F.desc('year'))
df.filter(df.year != '1998').sort(F.col('year').desc())
df.filter(df.year != '1998').sort(F.col('year').asc())
df.filter(df.year != '1998').orderBy(F.asc('year'))
df.filter(df.year != '1998').orderBy(F.desc('year'))
df.filter(df.year != '1998').orderBy(F.col('year').desc())
df.filter(df.year != '1998').orderBy(F.col('year').asc())
kovid-r / pyspark_cheatsheet_joins.py
Last active October 11, 2022 04:48
Joining data in PySpark Cheatsheet
# Full outer join on the title column
df1.join(df2, 'title', 'full')
# Left join on the title column, using the how keyword
df1.join(df2, 'title', how='left')
# With no join key, Spark produces a cross (Cartesian) join
df1.join(df2)
# Another way to join
kovid-r / get-medium-stats.js
Created January 9, 2022 13:36 — forked from igeligel/get-medium-stats.js
medium-get-totals
const totalTypes = {
VIEWS: 2,
READS: 3,
FANS: 5
};
const getTotal = tableColumn =>
[
...document.querySelectorAll(
`td:nth-child(${tableColumn}) > span.sortableTable-number`