Skip to content

Instantly share code, notes, and snippets.

@eric-maynard
Last active November 20, 2018 14:31
Show Gist options
  • Save eric-maynard/b7d3222d1aa9f760ddd4de05fdda5d04 to your computer and use it in GitHub Desktop.
Save eric-maynard/b7d3222d1aa9f760ddd4de05fdda5d04 to your computer and use it in GitHub Desktop.
A script to analyze a large volume of Impala partitions with an Impala table
#!/bin/bash
##################################################
# Constants #
##################################################
delimiter='~'
soft_delimiter=","
column_limit=9
##################################################
# Variables #
##################################################
#cluster specific:
alias imp=''
partitions_table='example.partition_analysis'
partitions_table_location='/example/partition_analysis'
partitions_table_view='example.partition_analysis_conformed'
#instance specific:
dbs=('db_one' 'db_two')
output_file='./eric.csv'
#safety values:
table_limit=2500
partition_limit=50000
##################################################
# Functions #
##################################################
#write to stdout with a timestamp
log () {
echo "[$(date +%s)] $1"
}
#write to an output file
write () {
echo "$1" >> ${output_file}
}
#split a string by its delimiter, reverse it, and recompose it with that delimiter
reverse_by_delimiter () {
echo "$1" | perl -lne 'print join "'${delimiter}'", reverse split/\'${delimiter}'/;'
}
#replace all instances of a delmiter with a soft delmiter after a certain amount of delimiters
trim_columns () {
echo "$1" | sed "s/${delimiter}/${soft_delimiter}/${column_limit}g"
}
##################################################
# Main script #
##################################################
#set up:
log "Setting up..."
rm -f ${output_file} 2> /dev/null
hdfs dfs -mkdir -p ${partitions_table_location} 2> /dev/null
#loop through all impala tables in $db:
log "Collecting information on up to ${table_limit} tables..."
for db in "${dbs[@]}"; do
imp -Bq "use ${db}; show tables;" 2> /dev/null | head -n ${table_limit} | while read table; do
log "Fetching stats for ${db}.${table}..."
partitions="$(imp --output_delimiter='~' -Bq 'show partitions '${db}'.'${table}';' 2> /dev/null)"
echo "${partitions}" | grep -v "Total${delimiter}" | head -n ${partition_limit} | while read partition; do
write "${db}.${table}${delimiter}$(trim_columns "$(reverse_by_delimiter "${partition}")")"
done
done
done
#create the impala table if it doesn't yet exist:
imp -Bq '
DROP TABLE IF EXISTS '${partitions_table}';
CREATE EXTERNAL TABLE IF NOT EXISTS '${partitions_table}' (
p_TABLE STRING,
P_LOCATION STRING,
P_INCREMENTAL BOOLEAN,
P_FORMAT STRING,
P_REPLICATION STRING,
P_CACHE STRING,
P_SIZE STRING,
P_FILES INT,
P_ROWS INT,
P_PARTITION STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '"'${delimiter}'"'
STORED AS TEXTFILE
LOCATION '"'${partitions_table_location}'"'
'
#insert into the impala table:
log 'Loading data into Impala...'
hdfs dfs -put -f ${output_file} ${partitions_table_location}
#refresh the impala table:
log 'Refreshing Impala...'
imp -Bq "REFRESH ${partitions_table}"
log "Load complete."
#create a view where the size data is meaningful:
imp -Bq '
DROP VIEW IF EXISTS '${partitions_table_view}'
CREATE VIEW IF NOT EXISTS '${partitions_table_view}' AS SELECT
CASE p_size_unit
WHEN 'KB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 AS INT)
WHEN 'MB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 * 1024 AS INT)
WHEN 'GB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 * 1024 * 1024 AS INT)
WHEN 'TB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 * 1024 * 1024 * 1024 AS INT)
ELSE CAST(p_size AS INT) END p_size_bytes,
p_table,
p_location,
p_size,
p_files,
p_rows,
p_partition
FROM (
SELECT
substring(p_size, 1, length(p_size) - 2) p_size_raw,
substring(p_size, length(p_size) - 1, 2) p_size_unit,
p_table,
p_location,
p_size,
p_files,
p_rows,
p_partition
FROM
db_navbi.partition_analysis
WHERE p_size IS NOT NULL AND length(p_size) > 2
) partition_analysis_split
;
'
#example query:
# show the tables with the lowest file size with a reasonable number of files:
imp -q '
select * from (select p_table, avg(avg_file_size) avg_file_size_table, sum(p_files) sum_files, count(*) count_partitions from (select p_table, p_files, (p_size_bytes / p_files) as avg_file_size from db_navbi.partition_analysis_conformed) avgs group by p_table) tbl_avgs where sum_files > 25 order by avg_file_size_table limit 25;
'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment