Skip to content

Instantly share code, notes, and snippets.

@iconara
Last active June 14, 2018 13:23
Show Gist options
  • Save iconara/6f2a95d589b54202156a73f5bf0558ae to your computer and use it in GitHub Desktop.
Save iconara/6f2a95d589b54202156a73f5bf0558ae to your computer and use it in GitHub Desktop.
Quick and dirty script to find spurious files in the prefix of a Glue table
require 'aws-sdk-glue'
require 'aws-sdk-s3'
# Splits an S3 URI of the form "s3://bucket/key" into its bucket and key.
#
# The bucket is everything between "s3://" and the first following slash;
# the key is the (non-empty) remainder of the URI.
#
# @param s3_uri [String] the URI to split
# @return [Array<String>] `[bucket, key]`, or an empty array when the URI
#   does not have the "s3://bucket/key" shape (e.g. no key part at all)
def split_s3_uri(s3_uri)
  matched = s3_uri.match(%r{\As3://(.+?)/(.+)\z})
  return [] unless matched

  matched.captures
end
# Lists every object under a partitioned Glue table's S3 location that does
# not live under the location of one of the table's registered partitions.
#
# Command line arguments: the Glue database name, then the table name.
# Spurious object keys are printed to stdout, one per line. Partitions whose
# location is not inside the table's location are reported on stderr.
database, table_name = ARGV.take(2)
glue = Aws::Glue::Client.new
s3 = Aws::S3::Client.new

table_response = glue.get_table(database_name: database, name: table_name)
# Without partition keys there are no partition locations to compare against.
exit if table_response.table.partition_keys.empty?

table_location = table_response.table.storage_descriptor.location
table_bucket, table_prefix = split_s3_uri(table_location.to_s)
# split_s3_uri returns [] for anything that is not "s3://bucket/key"; fail
# with a readable message instead of a NoMethodError on nil further down.
abort("Table location is not an S3 URI with a key prefix: #{table_location.inspect}") if table_prefix.nil?
table_prefix = "#{table_prefix}/" unless table_prefix.end_with?('/')

# Key prefixes (each ending in "/") under which objects are expected.
valid_prefixes = []
partition_pages = glue.get_partitions(database_name: database, table_name: table_name)
# each_page yields the first response and then every following page.
partition_pages.each_page do |page|
  page.partitions.each do |partition|
    location = partition.storage_descriptor.location
    bucket, prefix = split_s3_uri(location.to_s)
    # Normalise the trailing slash BEFORE comparing, so that a partition
    # located exactly at the table location is not reported as outside it.
    prefix = "#{prefix}/" if prefix && !prefix.end_with?('/')
    # A matching (non-nil) bucket implies the prefix was parsed as well.
    if bucket == table_bucket && prefix.start_with?(table_prefix)
      valid_prefixes << prefix
    else
      $stderr.puts("Partition location outside table prefix! (#{location} not in #{table_location})")
    end
  end
end

s3.list_objects_v2(bucket: table_bucket, prefix: table_prefix).each_page do |page|
  page.contents.each do |obj|
    # An object is spurious when no partition prefix covers its key.
    next if valid_prefixes.any? { |valid_prefix| obj.key.start_with?(valid_prefix) }

    $stdout.puts(obj.key)
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment