Last active
June 11, 2021 17:13
-
-
Save AdroitAnandAI/c4e0fa13d8fefa9936994e20b733370f to your computer and use it in GitHub Desktop.
To read from and write to S3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example PySpark job: read a text file from S3, build a small DataFrame of
# sample sentences, and append it to an S3 location as CSV.
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("docker-numpy").getOrCreate()
sc = spark.sparkContext

try:
    # Read in a file from S3 with the s3a file protocol
    text = spark.read.text("s3a://spark-anand/main.py")
    print(text.take(5))

    # One sentence per row; the nested lists give a (5, 1) array so pandas
    # produces a single "text" column.
    sentences = np.array([
        ['This is a test program to read data.'],
        ['This data will be written to S3'],
        ['To write and read from S3'],
        ['This can be used to write'],
        ['coalesce to collect'],
    ])
    sentences_pandas = pd.DataFrame(sentences, columns=["text"])
    doc = spark.createDataFrame(sentences_pandas, ["text"])

    # Write to S3. coalesce(1) reduces the DataFrame to a single partition
    # before the write. The write uses the same s3a:// scheme as the read
    # above so both go through the same connector.
    # NOTE(review): Spark's save() normally creates a directory at the target
    # path, so "fileout.txt" is expected to be a folder of part files rather
    # than a single text file - confirm this is the intended layout.
    (
        doc.coalesce(1)
        .write.mode("append")
        .format("com.databricks.spark.csv")
        .option("header", "true")
        .save("s3a://spark-anand/output-files/fileout.txt")
    )
finally:
    # Call stop() otherwise the cluster will keep running; doing it in
    # `finally` guarantees it runs even when the read or write fails.
    spark.stop()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment