Skip to content

Instantly share code, notes, and snippets.

@tly1980
Last active April 24, 2018 01:32
Show Gist options
  • Save tly1980/c191cd027b7374bd137686b6a65d3169 to your computer and use it in GitHub Desktop.
Save tly1980/c191cd027b7374bd137686b6a65d3169 to your computer and use it in GitHub Desktop.
A pyspark script which just dumps a table's headers to stdout
from __future__ import print_function

import argparse

import py4j.protocol
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
AP = argparse.ArgumentParser()
AP.add_argument('table')
AP.add_argument('--delimiter', default=',')
def main(args):
SparkContext._ensure_initialized()
try:
# Try to access HiveConf, it will raise exception if Hive is not added
SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
spark = SparkSession.builder\
.enableHiveSupport()\
.getOrCreate()
except py4j.protocol.Py4JError:
spark = SparkSession.builder.getOrCreate()
except TypeError:
spark = SparkSession.builder.getOrCreate()
df = spark.sql('describe {table}'.format(table=args.table))
headers = [c['col_name'] for c in df.collect()]
print(args.delimiter.join(headers))
if __name__ == '__main__':
main(AP.parse_args())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment