Last active
November 20, 2018 14:31
-
-
Save eric-maynard/b7d3222d1aa9f760ddd4de05fdda5d04 to your computer and use it in GitHub Desktop.
A script to analyze a large volume of Impala partitions by loading their metadata into an Impala table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
##################################################
#                   Constants                    #
##################################################
# Field separator used in the raw output file and the Impala table DDL.
readonly delimiter='~'
# Separator used for overflow columns beyond $column_limit (see trim_columns).
readonly soft_delimiter=','
# Number of leading fields to keep hard-delimited per record.
readonly column_limit=9
##################################################
#                   Variables                    #
##################################################
# cluster specific:
# NOTE: this was originally `alias imp=''`, but aliases are NOT expanded in
# non-interactive shell scripts, so every later `imp` call would fail with
# "command not found". A function is the reliable equivalent.
imp () {
  # TODO: fill in the cluster-specific Impala shell invocation, e.g.:
  #   impala-shell -i "$IMPALA_HOST" "$@"
  impala-shell "$@"
}
partitions_table='example.partition_analysis'
partitions_table_location='/example/partition_analysis'
partitions_table_view='example.partition_analysis_conformed'
# instance specific:
dbs=('db_one' 'db_two')
output_file='./eric.csv'
# safety values: caps on how many tables/partitions are scanned per run
table_limit=2500
partition_limit=50000
##################################################
#                   Functions                    #
##################################################
# Emit a message to stdout, prefixed with the current Unix timestamp.
log () {
  local ts
  ts=$(date +%s)
  printf '[%s] %s\n' "$ts" "$1"
}
# Append one record line to the output file ($output_file).
# Quoted redirect target and printf make this safe for paths with spaces
# and for lines beginning with '-'.
write () {
  printf '%s\n' "$1" >> "${output_file}"
}
#split a string by its delimiter, reverse it, and recompose it with that delimiter | |
reverse_by_delimiter () { | |
echo "$1" | perl -lne 'print join "'${delimiter}'", reverse split/\'${delimiter}'/;' | |
} | |
# Keep the first (column_limit - 1) delimiters intact, then replace every
# remaining delimiter with the soft delimiter. Relies on the GNU sed "Ng"
# substitution flag: substitute from the Nth match onward.
trim_columns () {
  local sed_expr="s/${delimiter}/${soft_delimiter}/${column_limit}g"
  echo "$1" | sed "${sed_expr}"
}
##################################################
#                  Main script                   #
##################################################
# set up:
log "Setting up..."
rm -f "${output_file}" 2> /dev/null
hdfs dfs -mkdir -p "${partitions_table_location}" 2> /dev/null

# loop through all impala tables in each db:
log "Collecting information on up to ${table_limit} tables..."
for db in "${dbs[@]}"; do
  # NOTE: the while loops run in pipeline subshells; that is fine here because
  # results are written to ${output_file}, not to shell variables.
  imp -Bq "use ${db}; show tables;" 2> /dev/null \
    | head -n "${table_limit}" \
    | while read -r table; do
        log "Fetching stats for ${db}.${table}..."
        # keep the impala output delimiter in sync with the $delimiter constant
        # (the original hard-coded '~' here)
        partitions="$(imp --output_delimiter="${delimiter}" -Bq "show partitions ${db}.${table};" 2> /dev/null)"
        # drop the trailing "Total" summary row, cap the partition count, and
        # emit one reversed/trimmed record per partition:
        echo "${partitions}" \
          | grep -v "Total${delimiter}" \
          | head -n "${partition_limit}" \
          | while read -r partition; do
              write "${db}.${table}${delimiter}$(trim_columns "$(reverse_by_delimiter "${partition}")")"
            done
      done
done
# (re)create the impala table:
# The statement is double-quoted so ${...} expansions stay inside a single
# shell word even if the values ever contain spaces — the original spliced
# unquoted expansions between single-quoted SQL fragments.
imp -Bq "
DROP TABLE IF EXISTS ${partitions_table};
CREATE EXTERNAL TABLE IF NOT EXISTS ${partitions_table} (
  P_TABLE       STRING,
  P_LOCATION    STRING,
  P_INCREMENTAL BOOLEAN,
  P_FORMAT      STRING,
  P_REPLICATION STRING,
  P_CACHE       STRING,
  P_SIZE        STRING,
  P_FILES       INT,
  P_ROWS        INT,
  P_PARTITION   STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '${delimiter}'
STORED AS TEXTFILE
LOCATION '${partitions_table_location}';
"
# insert into the impala table:
log 'Loading data into Impala...'
hdfs dfs -put -f "${output_file}" "${partitions_table_location}"
# refresh the impala table so it sees the newly loaded file:
log 'Refreshing Impala...'
imp -Bq "REFRESH ${partitions_table}"
log "Load complete."
# create a view where the size data is meaningful (bytes instead of "12.3GB"):
# FIXES vs. the original:
#   1. The DROP VIEW statement was missing its terminating semicolon, fusing
#      it with the CREATE VIEW into one invalid statement.
#   2. The whole SQL string was single-quoted in the shell, so the quotes
#      around 'KB'/'MB'/'GB'/'TB' were consumed by the shell and Impala
#      received bare identifiers; double-quoting the statement preserves them.
#   3. The FROM clause hard-coded db_navbi.partition_analysis instead of
#      using ${partitions_table}.
# NOTE(review): CAST(... AS INT) overflows for TB-scale sizes; consider BIGINT.
imp -Bq "
DROP VIEW IF EXISTS ${partitions_table_view};
CREATE VIEW IF NOT EXISTS ${partitions_table_view} AS SELECT
  CASE p_size_unit
    WHEN 'KB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 AS INT)
    WHEN 'MB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 * 1024 AS INT)
    WHEN 'GB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 * 1024 * 1024 AS INT)
    WHEN 'TB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 * 1024 * 1024 * 1024 AS INT)
    ELSE CAST(p_size AS INT)
  END p_size_bytes,
  p_table,
  p_location,
  p_size,
  p_files,
  p_rows,
  p_partition
FROM (
  SELECT
    substring(p_size, 1, length(p_size) - 2) p_size_raw,
    substring(p_size, length(p_size) - 1, 2) p_size_unit,
    p_table,
    p_location,
    p_size,
    p_files,
    p_rows,
    p_partition
  FROM
    ${partitions_table}
  WHERE p_size IS NOT NULL AND length(p_size) > 2
) partition_analysis_split;
"
# example query:
# show the tables with the lowest average file size among those with a
# reasonable number of files. Uses ${partitions_table_view} instead of the
# hard-coded db_navbi.partition_analysis_conformed of the original.
imp -q "
SELECT * FROM (
  SELECT p_table,
         avg(avg_file_size) avg_file_size_table,
         sum(p_files)       sum_files,
         count(*)           count_partitions
  FROM (
    SELECT p_table, p_files, (p_size_bytes / p_files) AS avg_file_size
    FROM ${partitions_table_view}
  ) avgs
  GROUP BY p_table
) tbl_avgs
WHERE sum_files > 25
ORDER BY avg_file_size_table
LIMIT 25;
"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment