Skip to content

Instantly share code, notes, and snippets.

@edgl
Last active May 13, 2025 15:29
Show Gist options
  • Save edgl/30558cdb228aac60ad27e970014f9bd8 to your computer and use it in GitHub Desktop.
Save edgl/30558cdb228aac60ad27e970014f9bd8 to your computer and use it in GitHub Desktop.
#!/bin/bash
# Collects the HDFS location, total size in bytes and file count of every Hive
# table and writes one CSV row per table to $OUTPUT_FILE.
#
# Environment:
#   BEELINE  Beeline command line used to run queries, including any
#            connection options (for example
#            "beeline -u jdbc:hive2://host:10000/"). Defaults to "beeline".

# Abort on the first unhandled command failure. pipefail is deliberately not
# enabled: pipelines in this script use grep stages that legitimately exit
# non-zero when nothing matches.
set -e

# The value is word-split where it is used, so it may carry extra arguments.
# An empty value would leave only the option list on the command line and
# every query would fail, hence the non-empty default.
BEELINE="${BEELINE:-beeline}"
OUTPUT_FILE="hive_table_hdfs_stats.csv"

# Start a fresh report: truncate the file and write the CSV header.
printf '%s\n' "Database,Table,HDFS_Location,Size_Bytes,File_Count" > "$OUTPUT_FILE"
#######################################
# Print the total size, in bytes, of everything under an HDFS path.
# Arguments: $1 - HDFS path or URI to measure
# Outputs:   the byte count on stdout; "0" when `hdfs dfs -du -s` fails or
#            prints nothing numeric
# Returns:   0 always, so an unreadable path never aborts a caller under set -e
#######################################
get_hdfs_size() {
  local path="$1"
  local size

  # The pipeline's exit status is awk's, and awk succeeds even when hdfs
  # fails, so a trailing `|| echo 0` would never fire. Capture the text and
  # validate it instead. stderr is discarded on purpose: an unreadable path is
  # reported as size 0 rather than as an error.
  size=$(hdfs dfs -du -s "$path" 2>/dev/null | awk 'NR == 1 { print $1 }') || true

  if [[ ! "$size" =~ ^[0-9]+$ ]]; then
    size=0
  fi
  printf '%s\n' "$size"
}
#######################################
# Print the number of regular files found recursively under an HDFS path.
# Arguments: $1 - HDFS path or URI to scan
# Outputs:   the file count on stdout; "0" when the listing fails or is empty
# Returns:   0 always, so an unreadable path never aborts a caller under set -e
#######################################
get_hdfs_file_count() {
  local path="$1"
  local count

  # Each listing entry starts with its permission string: "d..." for a
  # directory, "-..." for a file. Counting only the "-" lines skips
  # directories and any stray line that is not a listing entry.
  # grep -c still prints 0 when nothing matches but exits 1, hence `|| true`.
  # stderr is discarded on purpose: an unreadable path is reported as 0 files.
  count=$(hdfs dfs -ls -R "$path" 2>/dev/null | grep -c '^-') || true

  printf '%s\n' "${count:-0}"
}
#######################################
# Run HiveQL through Beeline and print the result as CSV.
# Globals:   BEELINE (read) - Beeline command, optionally followed by
#            whitespace-separated arguments such as connection options
# Arguments: $1 - statement(s) to execute
# Outputs:   the csv2-formatted result on stdout; a diagnostic on stderr when
#            BEELINE is empty
# Returns:   the exit status of the Beeline command; 1 when BEELINE is empty
#######################################
run_beeline_query() {
  local query="$1"
  local -a beeline_cmd=()

  # Split the configured command on whitespace without glob expansion.
  read -r -a beeline_cmd <<< "${BEELINE:-}"

  # With an empty command the option list itself would be executed and the
  # resulting "command not found" hidden by the stderr redirect below, so
  # report the misconfiguration explicitly.
  if (( ${#beeline_cmd[@]} == 0 )); then
    printf '%s\n' "run_beeline_query: BEELINE is empty; set it to the beeline command line" >&2
    return 1
  fi

  # Beeline's stderr is dropped so that only the CSV result reaches the caller.
  "${beeline_cmd[@]}" --silent=true --outputformat=csv2 -e "$query" 2>/dev/null
}
# Walk every database and table, recording each table's HDFS location, size
# and file count as one CSV row in $OUTPUT_FILE.

#######################################
# Extract the storage location from csv2 `DESCRIBE FORMATTED` output.
# Finds the field holding the "Location:" label and prints the first field
# after it that is neither empty nor NULL, so the result does not depend on
# which column the value lands in. A location that itself contains a comma is
# not handled.
# Inputs:  DESCRIBE FORMATTED rows on stdin
# Outputs: the location on stdout, or nothing when no location is present
#######################################
extract_table_location() {
  awk -F',' '
    # Strip surrounding blanks and CSV quotes from one field.
    function trim(s) { gsub(/^[ \t\r"]+|[ \t\r"]+$/, "", s); return s }
    {
      for (i = 1; i <= NF; i++) {
        if (trim($i) == "Location:") {
          for (j = i + 1; j <= NF; j++) {
            value = trim($j)
            if (value != "" && value != "NULL") {
              print value
              exit
            }
          }
          exit
        }
      }
    }
  '
}

# Every run_beeline_query call starts a separate Beeline process, so a `USE`
# issued in one call does not affect the next. Each statement below therefore
# names its database explicitly.

# The status of the query is checked directly; piping it straight into tail
# would replace it with tail's status and hide a failed connection.
if ! database_rows=$(run_beeline_query "SHOW DATABASES;"); then
  echo "Error: could not list databases; check the BEELINE command" >&2
  exit 1
fi

# The lists are read on descriptors 3 and 4 so that commands run inside the
# loops cannot consume them through stdin. tail drops the CSV header row.
while IFS= read -r db <&3; do
  db=${db//$'\r'/}
  [[ -n "$db" ]] || continue
  echo "Processing database: $db"

  # Get all tables of this database.
  if ! table_rows=$(run_beeline_query "SHOW TABLES IN \`$db\`;"); then
    echo " Warning: could not list tables in $db" >&2
    continue
  fi

  # Iterate through each table.
  while IFS= read -r table <&4; do
    table=${table//$'\r'/}
    [[ -n "$table" ]] || continue
    echo " Processing table: $table"

    # A failed DESCRIBE is treated like a table without a location.
    describe_rows=$(run_beeline_query "DESCRIBE FORMATTED \`$db\`.\`$table\`;") \
      || describe_rows=""
    location=$(extract_table_location <<< "$describe_rows")

    if [[ -z "$location" ]]; then
      echo " Skipping: No HDFS location found for $db.$table"
      echo "$db,$table,,0,0" >> "$OUTPUT_FILE"
      continue
    fi

    size=$(get_hdfs_size "$location")
    file_count=$(get_hdfs_file_count "$location")
    # Default to 0 so a helper that printed nothing cannot leave a CSV field empty.
    echo "$db,$table,$location,${size:-0},${file_count:-0}" >> "$OUTPUT_FILE"
  done 4< <(tail -n +2 <<< "$table_rows")
done 3< <(tail -n +2 <<< "$database_rows")

echo "Analysis complete. Results saved to $OUTPUT_FILE"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment