Skip to content

Instantly share code, notes, and snippets.

@idiomer
idiomer / pyspark_hdfs_utils.py
Last active June 18, 2024 08:21
Utilities for working with HDFS from PySpark: list (ls), rename (mv), and delete (rm).
'''
By default, `path` is treated as a directory.
'''
def hdfs_list(path, subtract_one=True):
fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
list_status = fs.listStatus(spark._jvm.org.apache.hadoop.fs.Path(path))
# file.getPath().getName(), file.getBlockSize(), file.getLen()
files_size = [file.getLen() for file in list_status]
totol_size_in_MB = sum(files_size) / 1024.0 / 1024.0