Last active May 13, 2025 15:29
-
-
Save edgl/30558cdb228aac60ad27e970014f9bd8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Script preamble: shell options, configuration, and CSV header.
#
# Environment (both optional):
#   BEELINE      Beeline command line used by run_beeline_query, e.g.
#                "beeline -u jdbc:hive2://host:10000/default". Empty by
#                default, so it must be supplied (here or via the environment).
#   OUTPUT_FILE  Path of the CSV report (default: hive_table_hdfs_stats.csv).

# Only -e is enabled; -u and pipefail are left off so the behaviour of the
# pipelines further down in this script is not changed.
set -e

BEELINE="${BEELINE:-}"
OUTPUT_FILE="${OUTPUT_FILE:-hive_table_hdfs_stats.csv}"

# Start a fresh report: '>' truncates any previous file and writes the header.
echo "Database,Table,HDFS_Location,Size_Bytes,File_Count" > "$OUTPUT_FILE"
#######################################
# Prints the size of an HDFS path as reported by `hdfs dfs -du -s`.
# Arguments: $1 - HDFS path or URI
# Outputs:   the first column of the first `du -s` line, or 0 when the
#            command fails or prints nothing numeric
# Returns:   0 always
#######################################
get_hdfs_size() {
  local path="$1"
  local bytes

  # stderr is discarded on purpose: an unreadable/missing path is reported as 0.
  # The pipeline's status is awk's, so failure must be detected from the
  # captured value rather than with a trailing '|| fallback'.
  bytes=$(hdfs dfs -du -s "$path" 2>/dev/null | awk 'NR == 1 { print $1 }') || true

  if [[ "$bytes" =~ ^[0-9]+$ ]]; then
    echo "$bytes"
  else
    echo "0"
  fi
}
#######################################
# Function to get HDFS file count.
# Counts the lines of a recursive `hdfs dfs -ls -R` listing that do not
# start with 'd' (i.e. that are not directory entries).
# Arguments: $1 - HDFS path or URI
# Outputs:   the count as a plain integer; 0 when the listing fails or is empty
# Returns:   0 always (awk's exit status)
#######################################
get_hdfs_file_count() {
  local path="$1"

  # stderr is discarded on purpose: a missing path simply counts as 0 files.
  # 'n + 0' forces a numeric 0 when no line was counted.
  hdfs dfs -ls -R "$path" 2>/dev/null \
    | awk '!/^d/ { n++ } END { print n + 0 }'
}
#######################################
# Runs one HiveQL statement through Beeline and prints the result as csv2.
# Each call starts a separate Beeline invocation.
# Globals:   BEELINE (read) - Beeline command line, split on whitespace
# Arguments: $1 - query text passed to `-e`
# Outputs:   Beeline's stdout (csv2, including its header row)
# Returns:   1 if BEELINE is empty, otherwise Beeline's exit status
#######################################
run_beeline_query() {
  local query="$1"
  local -a beeline_cmd

  if [[ -z "${BEELINE//[[:space:]]/}" ]]; then
    echo "Error: BEELINE is empty; set it to the beeline command line" \
      "(e.g. 'beeline -u <jdbc-url>')" >&2
    return 1
  fi

  # Split into words without pathname expansion, so characters such as '?'
  # or '*' in a JDBC URL are passed through literally.
  IFS=$' \t\n' read -r -a beeline_cmd <<< "$BEELINE"

  # Beeline's stderr is discarded so that only the csv2 result reaches callers.
  "${beeline_cmd[@]}" --silent=true --outputformat=csv2 -e "$query" 2>/dev/null
}
#######################################
# Extracts the table location from `DESCRIBE FORMATTED` csv2 output.
# Finds the row whose first field is "Location:" and prints the first
# following field that is neither empty nor the literal NULL.
# Inputs:    csv2 text on stdin
# Outputs:   the location on stdout, or nothing when no usable value exists
#######################################
extract_table_location() {
  awk -F',' '
    function trim(s) { gsub(/^[ \t\r"]+|[ \t\r"]+$/, "", s); return s }
    trim($1) ~ /^Location:?$/ {
      for (i = 2; i <= NF; i++) {
        value = trim($i)
        if (value != "" && value != "NULL") { print value; break }
      }
      exit
    }
  '
}

# List all databases; 'tail -n +2' drops the csv2 header row.
mapfile -t databases < <(run_beeline_query "SHOW DATABASES;" | tail -n +2)
if (( ${#databases[@]} == 0 )); then
  echo "Warning: no databases returned by beeline; check the BEELINE command" >&2
fi

for db in "${databases[@]}"; do
  [[ -n "$db" ]] || continue
  echo "Processing database: $db"

  # Every run_beeline_query call is its own Beeline invocation, so a separate
  # "USE db" would not carry over; name the database in each statement instead.
  mapfile -t tables < <(run_beeline_query "SHOW TABLES IN \`$db\`;" | tail -n +2)

  # Iterate through each table
  for table in "${tables[@]}"; do
    [[ -n "$table" ]] || continue
    echo " Processing table: $table"

    location=$(run_beeline_query "DESCRIBE FORMATTED \`$db\`.\`$table\`;" \
      | extract_table_location)

    if [[ -z "$location" ]]; then
      echo " Skipping: No HDFS location found for $db.$table"
      echo "$db,$table,,0,0" >> "$OUTPUT_FILE"
      continue
    fi

    size=$(get_hdfs_size "$location")
    file_count=$(get_hdfs_file_count "$location")
    # Default to 0 so a failed lookup never leaves an empty CSV field.
    echo "$db,$table,$location,${size:-0},${file_count:-0}" >> "$OUTPUT_FILE"
  done
done

echo "Analysis complete. Results saved to $OUTPUT_FILE"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment