@cbaenziger
Last active August 29, 2015 14:12
require 'set'
require 'pathname'
include Java
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.HColumnDescriptor
import org.apache.hadoop.hbase.HConstants
import org.apache.hadoop.hbase.HTableDescriptor
import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.client.HTable
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.io.Text
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.Path
import java.util.NoSuchElementException
import java.io.FileNotFoundException
# Return a Hash of region UUIDs to hostnames with column family stubs
#
# tableName - table to return regions for
#
# Example
# getRegionUUIDs "TestTable"
# # => {"3fe594363a2c13a3550f752db147194b"=>{"host" => "r1n1.example.com", "cfs" => {"f1" => {}, "f2" => {}},
# "da19a80cc403daa9a8f82ac9a1253e9d"=>{"host" => "r1n2.example.com", "cfs" => {"f1" => {}, "f2" => {}}}}
#
def getRegionUUIDs(tableName)
  c = HBaseConfiguration.new()
  tableNameObj = TableName.valueOf(tableName)
  t = HTable.new(c, tableNameObj)
  regions = t.getRegionsInRange(t.getStartKeys[0],
                                t.getEndKeys[t.getEndKeys.size-1])
  # get all column families -- XXX do all regions have to host all CF's?
  cfs = HTable.new(c, tableNameObj).getTableDescriptor.getFamilies().map{ |cf| cf.getNameAsString() }
  r_to_host = regions.map{|r| [r.getRegionInfo().getEncodedName(), Hash["host" => r.getHostname(), "cfs" => Hash[cfs.map{|cf| [cf, Hash.new()] }]]] }
  Hash[r_to_host]
end
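# Augment a region Hash (as returned by getRegionUUIDs) with the HDFS block
# locations backing each column family's HFiles
#
# regions   - Hash of region UUIDs from getRegionUUIDs
# tableName - table the regions belong to
#
# Returns the region Hash with each column family mapping HFile names to an
# Array of Sets of hostnames holding each of that HFile's blocks
#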
def findHDFSBlocks(regions, tableName)
  # augment regions with HDFS block locations
  augmented = regions.clone
  c = HBaseConfiguration.new()
  fs = FileSystem.newInstance(c)
  hbase_rootdir = c.select{|r| r.getKey() == "hbase.rootdir"}.first.getValue
  tableNameObj = TableName.valueOf(tableName)
  nameSpace = tableNameObj.getNamespaceAsString
  baseTableName = tableNameObj.getQualifierAsString
  # use the default namespace if none is given
  nameSpace = "default" if nameSpace == tableName
  regions.each do |r, values|
    values["cfs"].keys().each do |cf|
      rPath = Path.new(Pathname.new(hbase_rootdir).join("data", nameSpace, baseTableName, r, cf).to_s)
      begin
        files = fs.listFiles(rPath, true)
      rescue java.io.FileNotFoundException
        next
      end
      begin
        begin
          fStatus = files.next()
          hosts = fStatus.getBlockLocations().map { |block| Set.new(block.getHosts().to_a) }
          augmented[r]["cfs"][cf][File.basename(fStatus.getPath().toString())] = hosts
        rescue NativeException, java.util.NoSuchElementException
          fStatus = false
        end
      end until fStatus == false
    end
  end
  augmented
end
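# Find HDFS blocks which are not local to their region's RegionServer
#
# regions - augmented region Hash from findHDFSBlocks
#
# Returns an Array of "region/column_family/hfile/block_index" strings, one
# per non-local block
#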
def computeLocalityByBlock(regions)
  non_local_blocks = []
  regions.each do |r, values|
    values["cfs"].each do |cf, hFiles|
      hFiles.each do |id, blocks|
        blocks.each_index do |idx|
          non_local_blocks.push(Pathname.new(r).join(cf, id, idx.to_s).to_s) unless blocks[idx].include?(values["host"])
        end
      end
    end
  end
  non_local_blocks
end
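# Count the HDFS blocks across all regions, column families and HFiles
#
# regions - augmented region Hash from findHDFSBlocks
#
# Returns the total number of blocks
#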
def totalBlocks(regions)
  regions.map do |r, values|
    values["cfs"].map do |cf, hFiles|
      hFiles.map do |id, blocks|
        blocks.count
      end
    end
  end.flatten().reduce(0, :+)
end
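# For every table in the cluster, print the table name and the fraction of its
# HDFS blocks which are not local to the hosting RegionServer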
tables = list
tables.each do |tableName|
  puts tableName
  begin
    regions = getRegionUUIDs(tableName)
    hdfs_blocks_by_region = findHDFSBlocks(regions, tableName)
    non_local_blocks = computeLocalityByBlock(hdfs_blocks_by_region)
    total_blocks = totalBlocks(hdfs_blocks_by_region)
    puts non_local_blocks.length().to_f/total_blocks if total_blocks > 0 # e.g. if table not empty or disabled
  rescue org.apache.hadoop.hbase.TableNotFoundException
    true
  end
end
@cbaenziger
Author

Initial working version which calculates region locality by block count (not by size of the data).
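
A minimal way to run this against a live cluster is to save the script locally (for example as region_locality.rb; the filename is just an illustration) and pass it to the HBase shell:

hbase shell region_locality.rb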
