@dfdeshom
Created September 15, 2017 17:32
from pyspark import SparkContext
from pyspark.conf import SparkConf

import boto3


def main(sc):
    """Write a lot of small files to S3."""

    def write_s3(items):
        # Created once per partition and re-used by `write_data`,
        # so we don't pay the boto3 setup cost for every document.
        s3 = boto3.resource('s3')

        def write_data(doc):
            name = "%s" % (doc['name'],)
            s3.Object("mybucket", name).put(Body=doc['text'])
            return True

        for doc in items:
            yield write_data(doc)

    data = load_text_data()

    # Force full evaluation with collect(): write_s3 is a generator,
    # so without an action nothing would actually be written to S3.
    data.mapPartitions(write_s3).collect()


if __name__ == '__main__':
    conf = SparkConf()
    with SparkContext(conf=conf) as sc:
        main(sc)
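
The gist calls load_text_data() without defining it. Below is a minimal sketch of one possible implementation, not part of the original: it assumes the input documents are plain-text files under an S3 prefix (the bucket and prefix names are placeholders) and that each record must be a dict with the 'name' and 'text' keys that write_s3 consumes. It takes sc explicitly, so inside main it would be called as load_text_data(sc).

import os


def load_text_data(sc, path="s3a://my-input-bucket/docs/"):
    """Hypothetical loader, not part of the original gist."""
    # wholeTextFiles yields one (full_path, file_contents) pair per
    # input file, which suits a collection of many small documents.
    files = sc.wholeTextFiles(path)
    return files.map(lambda kv: {'name': os.path.basename(kv[0]),
                                 'text': kv[1]})

One design note: collect() ships every returned True back to the driver, so for a large RDD count() would force the same full evaluation while returning only a single number.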