Skip to content

Instantly share code, notes, and snippets.

@chrismay
Created April 12, 2023 06:55
Show Gist options
  • Save chrismay/113b75b5a820fa770975ff9820e2ba72 to your computer and use it in GitHub Desktop.
Save chrismay/113b75b5a820fa770975ff9820e2ba72 to your computer and use it in GitHub Desktop.
# anagrams kata (pyspark)
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from pyspark.sql.functions import col, udf, collect_list, size, length
# Obtain the SparkSession used by every DataFrame operation in this script
# (an existing session is reused, otherwise a new one is created).
spark = (
    SparkSession.builder
    .getOrCreate()
)
def alpha_sort(word):
    """Return the characters of *word* in sorted order, joined into one string.

    Two words that are anagrams of each other produce the same result, so the
    return value can be used as a grouping key.

    Args:
        word: The string to sort, or ``None``.

    Returns:
        The sorted characters as a string, or ``None`` when *word* is ``None``.
    """
    # Spark hands SQL NULL to a Python UDF as None; sorted(None) would raise
    # TypeError and fail the whole job, so propagate the null instead.
    if word is None:
        return None
    return "".join(sorted(word))
# Wrap alpha_sort as a Spark UDF returning a string column, so it can be
# applied to DataFrame columns.
sortedUdf = udf(f=alpha_sort, returnType=StringType())
# Load the word list and attach each word's letter-sorted form.
df = (
    spark.read.csv("wordlist.txt")
    # "_c0" is the name the CSV reader gives the first column when the file
    # is read without a header; rename it to something meaningful.
    .withColumnRenamed("_c0", "word")
    # Words that are anagrams of each other share the same "sorted_word".
    .withColumn("sorted_word", sortedUdf(col("word")))
)
# One row per letter-sorted key, holding every word that maps to that key.
anagram_df = (
    df.groupBy("sorted_word")
    .agg(collect_list("word").alias("anagrams"))
    # Keep only keys shared by more than one word.
    .filter(size(col("anagrams")) > 1)
)
# Number of groups in which more than one word shares the same sorted letters.
print(anagram_df.count())
# The group containing the most words.
print(anagram_df.orderBy(size(col("anagrams")).desc()).first())
# The ten groups whose words are longest. DataFrame.first() accepts no
# arguments (it always returns a single Row), so head(10) is used to fetch
# several rows as a list.
print(anagram_df.orderBy(length(col("sorted_word")).desc()).head(10))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment