# Tidyverse to pyspark translations

## Adding count of a column as a new column

```r
df %>% add_count(some_col)
```

```python
df.withColumn("n", count("*").over(Window.partitionBy("some_col")))
```

## Add count and proportion as columns

```r
df %>% count(x) %>% mutate(prop = n / sum(n))
```

```python
df.groupBy("x").count().withColumn("prop", col("count")/sum("count").over(Window.partitionBy())).display()
```

## Identify columns with high null rate

```r
max_null_prop <- 0.5

df %>%
  summarize_all(~sum(is.na(.x)) / n()) %>%
  gather(field, null_prop) %>%
  filter(null_prop > max_null_prop)
```

```python
max_null_prop = 0.5

null_prop_cond = [(sum(when(col(c).isNull(), 1).otherwise(0)) / count("*")).alias(c) for c in df.columns]

null_props = df\
    .agg(*null_prop_cond)\
    .collect()

high_null_cols = [col_name for col_name, col_value in null_props[0].asDict().items() if col_value > max_null_prop]

high_null_cols
```