Created
September 23, 2020 06:34
-
-
Save amalgjose/a29641900fca846a39a838edcc08d3ff to your computer and use it in GitHub Desktop.
PySpark program that interacts with Azure Data Lake Storage Gen 2 using the HDFS API. Delete and existence-check operations are demonstrated in this program. You can modify it to perform all the other file system operations. For more details, refer to https://amalgjose.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Demonstrate HDFS-API file-system operations (delete / exists) against
Azure Data Lake Storage Gen 2 from PySpark.

Replace CONTAINER, ACCOUNTNAME and the ADLS path placeholders before running.
Reference: https://amalgjose.com (Author: Amal G Jose)
"""
from pyspark.sql import SparkSession

# Prepare the Spark session and grab its context.
spark = SparkSession.builder.appName('filesystemoperations').getOrCreate()
sc = spark.sparkContext

# Point the default file system at the ADLS Gen 2 container (abfs scheme).
sc._jsc.hadoopConfiguration().set('fs.defaultFS', 'abfs://CONTAINER@ACCOUNTNAME.dfs.core.windows.net/')

# Hoist the JVM Path class once instead of re-traversing the py4j gateway
# (sc._jvm.org.apache.hadoop.fs.Path) for every operation.
Path = sc._jvm.org.apache.hadoop.fs.Path

# Hadoop FileSystem handle bound to the configuration above.
fs = (sc._jvm.org
      .apache.hadoop
      .fs.FileSystem
      .get(sc._jsc.hadoopConfiguration())
      )

# Enter the ADLS path to operate on.
path = "Your/adls/path"

# Delete the file or directory. The second argument enables recursive
# deletion, so directories are removed with their contents.
deletion_status = fs.delete(Path(path), True)
print("Deletion status -->", deletion_status)

# Check whether the file or directory still exists: True if it exists,
# False otherwise (expected False right after a successful delete).
status = fs.exists(Path(path))
print("Status -->", status)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment