Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
from concurrent.futures import ThreadPoolExecutor, as_completed
from pyspark.sql import SparkSession
import sys
def main(argv):
    """Append the contents of a Hive table to Parquet files at a given path.

    Args:
        argv: Argument list in ``sys.argv`` layout. ``argv[1]`` is the name
            of the Hive table to read and ``argv[2]`` is the output path for
            the Parquet files. Any further arguments are ignored.

    Raises:
        SystemExit: If the table name or the output path is missing.
    """
    if len(argv) < 3:
        prog = argv[0] if argv else "convert2parquet"
        sys.exit("usage: {} <hive_table_name> <out_path>".format(prog))

    hive_table_name = argv[1]
    out_path = argv[2]

    ss = (
        SparkSession.builder
        .appName("Convert2Parquet")
        .enableHiveSupport()
        .getOrCreate()
    )
    try:
        # Select version 2 of the Hadoop FileOutputCommitter algorithm.
        # NOTE(review): the original author added this to suppress the
        # `_$folder$` marker objects created on S3 -- confirm that this
        # setting actually has that effect on the target cluster.
        ss._jsc.hadoopConfiguration().set(
            "mapreduce.fileoutputcommitter.algorithm.version", "2"
        )

        df = ss.table(hive_table_name)
        # "append" adds new files next to whatever already exists at out_path.
        df.write.mode("append").parquet(out_path, compression="snappy")
    finally:
        # Always release the session, even when the read or write fails.
        ss.stop()


if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment