eric-maynard/create-partition-table.sh

## create-partition-table.sh
#!/bin/bash

##################################################
#                   Constants                    #
##################################################

delimiter='~'
soft_delimiter=","
column_limit=9

##################################################
#                   Variables                    #
##################################################

#cluster specific:
alias imp=''
partitions_table='example.partition_analysis'
partitions_table_location='/example/partition_analysis'
partitions_table_view='example.partition_analysis_conformed'

#instance specific:
dbs=('db_one' 'db_two')
output_file='./eric.csv'

#safety values:
table_limit=2500
partition_limit=50000

##################################################
#                   Functions                    #
##################################################

#write to stdout with a timestamp
log () {
  echo "[$(date +%s)] $1"
}

#write to an output file
write () {
  echo "$1" >> ${output_file}
}

#split a string by its delimiter, reverse it, and recompose it with that delimiter
reverse_by_delimiter () {
  echo "$1" | perl -lne 'print join "'${delimiter}'", reverse split/\'${delimiter}'/;'
}

#replace all instances of a delmiter with a soft delmiter after a certain amount of delimiters
trim_columns () {
  echo "$1" | sed "s/${delimiter}/${soft_delimiter}/${column_limit}g"
}

##################################################
#                 Main script                    #
##################################################

#set up:
log "Setting up..."
rm -f ${output_file} 2> /dev/null
hdfs dfs -mkdir -p ${partitions_table_location} 2> /dev/null

#loop through all impala tables in $db:
log "Collecting information on up to ${table_limit} tables..."
for db in "${dbs[@]}"; do
  imp -Bq "use ${db}; show tables;" 2> /dev/null | head -n ${table_limit} | while read table; do
    log "Fetching stats for ${db}.${table}..."
    partitions="$(imp --output_delimiter='~' -Bq 'show partitions '${db}'.'${table}';' 2> /dev/null)"
    echo "${partitions}" | grep -v "Total${delimiter}" | head -n ${partition_limit} | while read partition; do
  	  write "${db}.${table}${delimiter}$(trim_columns "$(reverse_by_delimiter "${partition}")")"
    done
  done
done

#create the impala table if it doesn't yet exist:
imp -Bq '
  DROP TABLE IF EXISTS '${partitions_table}';
  CREATE EXTERNAL TABLE IF NOT EXISTS '${partitions_table}' (
    p_TABLE STRING,
	P_LOCATION STRING,
    P_INCREMENTAL BOOLEAN,
    P_FORMAT STRING,
    P_REPLICATION STRING,
    P_CACHE STRING,
    P_SIZE STRING,
    P_FILES INT,
    P_ROWS INT,
    P_PARTITION STRING
  )
  ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '"'${delimiter}'"'
  STORED AS TEXTFILE
  LOCATION '"'${partitions_table_location}'"'
  '

#insert into the impala table:
log 'Loading data into Impala...'
hdfs dfs -put -f ${output_file} ${partitions_table_location}

#refresh the impala table:
log 'Refreshing Impala...'
imp -Bq "REFRESH ${partitions_table}"

log "Load complete."

#create a view where the size data is meaningful:
imp -Bq '
 DROP VIEW IF EXISTS '${partitions_table_view}'
 CREATE VIEW IF NOT EXISTS '${partitions_table_view}' AS SELECT
  CASE p_size_unit
   WHEN 'KB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 AS INT)
   WHEN 'MB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 * 1024 AS INT)
   WHEN 'GB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 * 1024 * 1024 AS INT)
   WHEN 'TB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 * 1024 * 1024 * 1024 AS INT)
   ELSE CAST(p_size AS INT) END p_size_bytes,
   p_table,
   p_location,
   p_size,
   p_files,
   p_rows,
   p_partition
  FROM (
   SELECT
     substring(p_size, 1, length(p_size) - 2) p_size_raw,
     substring(p_size, length(p_size) - 1, 2) p_size_unit,
 	p_table,
     p_location,
     p_size,
     p_files,
     p_rows,
     p_partition
   FROM
 	db_navbi.partition_analysis
   WHERE p_size IS NOT NULL AND length(p_size) > 2
 ) partition_analysis_split
;
'

#example query:
# show the tables with the lowest file size with a reasonable number of files:
imp -q '
select * from (select p_table, avg(avg_file_size) avg_file_size_table, sum(p_files) sum_files, count(*) count_partitions from (select p_table, p_files, (p_size_bytes / p_files) as avg_file_size from db_navbi.partition_analysis_conformed) avgs group by p_table) tbl_avgs where sum_files > 25 order by avg_file_size_table limit 25;
'
	#!/bin/bash

	##################################################
	# Constants #
	##################################################

	delimiter='~'
	soft_delimiter=","
	column_limit=9

	##################################################
	# Variables #
	##################################################

	#cluster specific:
	alias imp=''
	partitions_table='example.partition_analysis'
	partitions_table_location='/example/partition_analysis'
	partitions_table_view='example.partition_analysis_conformed'

	#instance specific:
	dbs=('db_one' 'db_two')
	output_file='./eric.csv'

	#safety values:
	table_limit=2500
	partition_limit=50000

	##################################################
	# Functions #
	##################################################

	#write to stdout with a timestamp
	log () {
	echo "[$(date +%s)] $1"
	}

	#write to an output file
	write () {
	echo "$1" >> ${output_file}
	}

	#split a string by its delimiter, reverse it, and recompose it with that delimiter
	reverse_by_delimiter () {
	echo "$1" \| perl -lne 'print join "'${delimiter}'", reverse split/\'${delimiter}'/;'
	}

	#replace all instances of a delmiter with a soft delmiter after a certain amount of delimiters
	trim_columns () {
	echo "$1" \| sed "s/${delimiter}/${soft_delimiter}/${column_limit}g"
	}

	##################################################
	# Main script #
	##################################################

	#set up:
	log "Setting up..."
	rm -f ${output_file} 2> /dev/null
	hdfs dfs -mkdir -p ${partitions_table_location} 2> /dev/null

	#loop through all impala tables in $db:
	log "Collecting information on up to ${table_limit} tables..."
	for db in "${dbs[@]}"; do
	imp -Bq "use ${db}; show tables;" 2> /dev/null \| head -n ${table_limit} \| while read table; do
	log "Fetching stats for ${db}.${table}..."
	partitions="$(imp --output_delimiter='~' -Bq 'show partitions '${db}'.'${table}';' 2> /dev/null)"
	echo "${partitions}" \| grep -v "Total${delimiter}" \| head -n ${partition_limit} \| while read partition; do
	write "${db}.${table}${delimiter}$(trim_columns "$(reverse_by_delimiter "${partition}")")"
	done
	done
	done

	#create the impala table if it doesn't yet exist:
	imp -Bq '
	DROP TABLE IF EXISTS '${partitions_table}';
	CREATE EXTERNAL TABLE IF NOT EXISTS '${partitions_table}' (
	p_TABLE STRING,
	P_LOCATION STRING,
	P_INCREMENTAL BOOLEAN,
	P_FORMAT STRING,
	P_REPLICATION STRING,
	P_CACHE STRING,
	P_SIZE STRING,
	P_FILES INT,
	P_ROWS INT,
	P_PARTITION STRING
	)
	ROW FORMAT DELIMITED
	FIELDS TERMINATED BY '"'${delimiter}'"'
	STORED AS TEXTFILE
	LOCATION '"'${partitions_table_location}'"'
	'

	#insert into the impala table:
	log 'Loading data into Impala...'
	hdfs dfs -put -f ${output_file} ${partitions_table_location}

	#refresh the impala table:
	log 'Refreshing Impala...'
	imp -Bq "REFRESH ${partitions_table}"

	log "Load complete."

	#create a view where the size data is meaningful:
	imp -Bq '
	DROP VIEW IF EXISTS '${partitions_table_view}'
	CREATE VIEW IF NOT EXISTS '${partitions_table_view}' AS SELECT
	CASE p_size_unit
	WHEN 'KB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 AS INT)
	WHEN 'MB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 * 1024 AS INT)
	WHEN 'GB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 * 1024 * 1024 AS INT)
	WHEN 'TB' THEN CAST(CAST(p_size_raw AS DOUBLE) * 1024 * 1024 * 1024 * 1024 AS INT)
	ELSE CAST(p_size AS INT) END p_size_bytes,
	p_table,
	p_location,
	p_size,
	p_files,
	p_rows,
	p_partition
	FROM (
	SELECT
	substring(p_size, 1, length(p_size) - 2) p_size_raw,
	substring(p_size, length(p_size) - 1, 2) p_size_unit,
	p_table,
	p_location,
	p_size,
	p_files,
	p_rows,
	p_partition
	FROM
	db_navbi.partition_analysis
	WHERE p_size IS NOT NULL AND length(p_size) > 2
	) partition_analysis_split
	;
	'

	#example query:
	# show the tables with the lowest file size with a reasonable number of files:
	imp -q '
	select * from (select p_table, avg(avg_file_size) avg_file_size_table, sum(p_files) sum_files, count(*) count_partitions from (select p_table, p_files, (p_size_bytes / p_files) as avg_file_size from db_navbi.partition_analysis_conformed) avgs group by p_table) tbl_avgs where sum_files > 25 order by avg_file_size_table limit 25;
	'