Created
March 8, 2018 00:31
Saves a spark dataframe into a single csv/delimited file efficiently. Assumes the file storage to be hdfs
import subprocess

def write_to_local_fs(df):
    """
    Write a dataframe to the local filesystem efficiently, without using
    coalesce or repartition. The idea is to persist the data in its
    distributed form in HDFS (or whatever file storage is in use) and then
    merge the part files into a single file on the local filesystem.
    The header must be written to the local file separately.

    :param df: the dataframe to write
    """
    hdfs_dir = "/path/to/some/valid_writeable/hdfs/directory"
    local_file = "csv_output.csv"
    df.write.mode("overwrite").format("com.databricks.spark.csv")\
        .options(header="false", delimiter=",")\
        .save(hdfs_dir)
    # and whatever other options you need
    # The merged file does not have a header
    subprocess.call(["hdfs", "dfs", "-getmerge", hdfs_dir, local_file])
    # Your logic to write the header into the csv file at local_file
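The header-writing step is left as an exercise above, because `hdfs dfs -getmerge` simply concatenates the part files and none of them carries column names when the data is written with `header="false"`. A minimal sketch of that step might look like the following; the helper name `prepend_header` and the temporary `.tmp` path are hypothetical, and it assumes the dataframe exposes its column names via `df.columns` (as Spark dataframes do):

```python
import shutil

def prepend_header(df, local_file):
    # Build the CSV header line from the dataframe's column names.
    header = ",".join(df.columns)
    # Write the header, then stream the merged (header-less) file after it.
    # A temp file avoids loading the whole merged output into memory.
    tmp_file = local_file + ".tmp"  # hypothetical temporary path
    with open(local_file) as src, open(tmp_file, "w") as dst:
        dst.write(header + "\n")
        shutil.copyfileobj(src, dst)
    # Replace the original file with the header-prefixed version.
    shutil.move(tmp_file, local_file)
```

Call it right after the `getmerge`, e.g. `prepend_header(df, local_file)`. Streaming with `shutil.copyfileobj` keeps memory use constant even for large merged files.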