isteves/tidyverse2pyspark.md

## tidyverse2pyspark.md

      
    Raw
  

              tidyverse2pyspark.md
            
          
    Tidyverse to pyspark translations

Add the count of a column as a new column

# Append a column `n` holding how many rows share each value of `some_col`;
# no rows are dropped.
add_count(df, some_col)
# Count rows inside a window partitioned by "some_col" and attach the result
# as column "n", keeping every original row.
same_value_window = Window.partitionBy("some_col")
df.withColumn("n", count("*").over(same_value_window))
Add count and proportion as columns

# Tally rows per value of `x`, then express each tally as a share of the
# overall total.
df %>%
  count(x) %>%
  mutate(prop = n / sum(n))
# Tally rows per value of "x" (groupBy().count() names the tally "count"),
# then divide each tally by the total taken over one unpartitioned window.
# `sum` is called with a column name and chained with .over(), so it has to
# be the Spark SQL aggregate rather than Python's builtin.
per_value_counts = df.groupBy("x").count()
whole_frame_window = Window.partitionBy()
grand_total = sum("count").over(whole_frame_window)
per_value_counts.withColumn("prop", col("count") / grand_total).display()
Identify columns with a high null rate

# Columns whose share of missing values exceeds this threshold are reported.
max_null_prop <- 0.5

# mean(is.na(.x)) is each column's NA share (same value as
# sum(is.na(.x)) / n()). across() and pivot_longer() stand in for the
# superseded summarize_all() and gather(); the result has one row per
# offending column, with its name in `field` and its NA share in `null_prop`.
df %>%
  summarize(across(everything(), ~ mean(is.na(.x)))) %>%
  pivot_longer(everything(), names_to = "field", values_to = "null_prop") %>%
  filter(null_prop > max_null_prop)
# Columns whose share of null values exceeds this threshold are reported.
max_null_prop = 0.5

# One aggregate expression per column: rows where the column is null divided
# by the total row count, aliased back to the column's own name.
null_prop_cond = [
    (sum(when(col(c).isNull(), 1).otherwise(0)) / count("*")).alias(c)
    for c in df.columns
]

# A single agg() pass evaluates every expression; collect() brings the
# result to the driver, and only its first Row is read below.
null_props = df.agg(*null_prop_cond).collect()

# Keep the column names above the threshold. A proportion may come back as
# None (presumably when the DataFrame has no rows, so the division has no
# defined result); comparing None with a float raises TypeError, so such
# columns are skipped instead of crashing.
high_null_cols = [
    col_name
    for col_name, col_value in null_props[0].asDict().items()
    if col_value is not None and col_value > max_null_prop
]
high_null_cols