@dfdeshom
Created September 15, 2017 17:32
from pyspark import SparkContext
from pyspark.conf import SparkConf

import boto3


def main(sc):
    """Write a lot of small files to S3."""

    def write_s3(items):
        # Created once per partition and re-used by `write_data`,
        # so we don't pay the boto3 setup cost for every document.
        s3 = boto3.resource('s3')

        def write_data(doc):
            name = "%s" % (doc['name'],)
            s3.Object("mybucket", name).put(Body=doc['text'])
            return True

        for doc in items:
            yield write_data(doc)

    data = load_text_data()

    # Force full evaluation with collect(): write_s3 is a generator,
    # so without an action nothing would actually be written to S3.
    data.mapPartitions(write_s3).collect()


if __name__ == '__main__':
    conf = SparkConf()
    with SparkContext(conf=conf) as sc:
        main(sc)
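
The gist calls load_text_data() without defining it. Below is a minimal sketch of one possible implementation, not part of the original: it assumes the input documents are plain-text files under an S3 prefix (the bucket and prefix names are placeholders) and that each record must be a dict with the 'name' and 'text' keys that write_s3 consumes. It takes sc explicitly, so inside main it would be called as load_text_data(sc).

import os


def load_text_data(sc, path="s3a://my-input-bucket/docs/"):
    """Hypothetical loader, not part of the original gist."""
    # wholeTextFiles yields one (full_path, file_contents) pair per
    # input file, which suits a collection of many small documents.
    files = sc.wholeTextFiles(path)
    return files.map(lambda kv: {'name': os.path.basename(kv[0]),
                                 'text': kv[1]})

One design note: collect() ships every returned True back to the driver, so for a large RDD count() would force the same full evaluation while returning only a single number.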