@AdroitAnandAI
Last active June 11, 2021 17:13
To read from and write to S3 with PySpark
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("docker-numpy").getOrCreate()
sc = spark.sparkContext
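If the cluster does not already supply AWS credentials (for example via an IAM instance role or environment variables), the s3a connector can be configured on the Hadoop configuration before reading. A minimal sketch with placeholder values; the `fs.s3a.*` property names come from the hadoop-aws module:

```python
# Optional: set s3a credentials explicitly (placeholders, not real keys).
# Skip this block if credentials are provided by the environment or an IAM role.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", "YOUR_AWS_ACCESS_KEY_ID")
hadoop_conf.set("fs.s3a.secret.key", "YOUR_AWS_SECRET_ACCESS_KEY")
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
```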
# Read in a file from S3 with the s3a file protocol
text = spark.read.text("s3a://spark-anand/main.py")
print(text.take(5))
sentences = np.array([
['This is a test program to read data.'],
['This data will be written to S3'],
['To write and read from S3'],
['This can be used to write'],
['coalesce to collect']
])
sentences_pandas = pd.DataFrame(sentences, columns=["text"])
doc = spark.createDataFrame(sentences_pandas, ["text"])
# Write a CSV file to S3; the save path is a directory into which Spark writes part files.
# CSV support is built into Spark 2+, so format("csv") replaces com.databricks.spark.csv.
doc.coalesce(1).write.mode("append").format("csv")\
    .option("header", "true").save("s3a://spark-anand/output-files/fileout.txt")
# Call stop(), otherwise the Spark application keeps running
spark.stop()