A PySpark program that interacts with Azure Data Lake Storage Gen2 using the HDFS FileSystem API. Delete and existence-check operations are demonstrated here; the same approach can be adapted to perform any of the other file system operations. For more details, refer to https://amalgjose.com
from pyspark.sql import SparkSession

# Author: Amal G Jose
# Reference: https://amalgjose.com

# Prepare the Spark session
spark = SparkSession.builder.appName('filesystemoperations').getOrCreate()

# Get the Spark context
sc = spark.sparkContext

# Set the ADLS Gen2 file system URI as the default file system
sc._jsc.hadoopConfiguration().set('fs.defaultFS', 'abfs://CONTAINER@ACCOUNTNAME.dfs.core.windows.net/')
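# Note: the cluster must also be able to authenticate to the storage account.
# A minimal sketch using a shared account key (an assumption -- use whichever
# auth mechanism applies to your environment, e.g. a service principal):
# sc._jsc.hadoopConfiguration().set(
#     'fs.azure.account.key.ACCOUNTNAME.dfs.core.windows.net', '<ACCOUNT_KEY>')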
# Get a Hadoop FileSystem handle through the JVM gateway
fs = (sc._jvm.org
      .apache.hadoop
      .fs.FileSystem
      .get(sc._jsc.hadoopConfiguration())
      )
# Enter the ADLS path
path = "Your/adls/path"

# Delete the file or directory in ADLS; the second argument (True) enables recursive deletion
deletion_status = fs.delete(sc._jvm.org.apache.hadoop.fs.Path(path), True)
print("Deletion status -->", deletion_status)
# Check whether the file or directory was deleted; exists() returns True if the path exists and False otherwise
status = fs.exists(sc._jvm.org.apache.hadoop.fs.Path(path))
print("Status -->", status)