Last active
September 28, 2022 07:50
-
-
Save dvimont/3b9b1e48808ebfc53f13 to your computer and use it in GitHub Desktop.
HBase shell emulator (an aid to understanding the HBase data model)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.TreeMap;
import java.util.concurrent.TimeUnit;
/** | |
* This code was created to help its author to internalize and more deeply | |
* understand the fundamental constructs of the Data Model of the HBase | |
* database system. It is provided here in the event that it may prove useful | |
* to others engaging in similar explorations. The descriptive comments that | |
* follow may prove helpful, and those with basic Java programming skills may | |
* also find it useful to experiment with the create, list, put, scan, and get | |
* methods of the main class below, all of which are meant to emulate the | |
* corresponding commands available with the HBase shell, as documented here: | |
* https://hbase.apache.org/book.html#shell_exercises. | |
* Also, find further commentary/discussion on HBase (and the "Chaos Wrangler" | |
* project) on the author's blog: http://dvimont.blogspot.com/ | |
* ---------------------------------------------------------------------------- | |
* The following is a Java-oriented analogy to the structures that comprise | |
* the HBase data model. It is presented under the assumption that the reader | |
* has a basic mastery of both Java and RDBMS concepts. | |
* | |
* The task of coming to an understanding of HBase data structures is | |
* unfortunately made much more difficult than it otherwise might be, due to | |
* the fact that HBase uses the terms TABLE, COLUMN, and ROW, but the structures | |
* that are referred to by these names bear little resemblance to their | |
* RDBMS namesakes. | |
* | |
* In fact, an HBase ROW is completely misnamed to the extent that the common | |
* term "row" is usually associated with a one-dimensional container of | |
* singular instances of data, whether it be a row in a spreadsheet or a row | |
* in an RDBMS table. In stark contrast, an HBase ROW could be conceived of | |
* as a container of an array of arrays of arrays!! (No wonder people run into | |
* difficulty understanding HBase data structures!) | |
* | |
* The following is a conceptual representation of the hierarchy of structures | |
* in an HBase table. (Note that all relationships shown below are one-to-many | |
* -- e.g., one TABLE contains multiple COLUMN_FAMILY_DEFINITION instances, one | |
* CELL contains multiple CELL_ENTRY instances, etc.): | |
* | |
* [[CONCEPTUAL DIAGRAM OF HBase TABLE & its components]] | |
* | |
* TABLE | |
* || | |
* ||=========>> COLUMN_FAMILY_DEFINITION (immutable) | |
* || | [[metadata realm]] | |
*--||---------------------|---------------------------------------------------- | |
* || | [[data realm]] | |
* ||==>> ROW | | |
* || | | |
* ||==>> COLUMN_FAMILY instance (corresponds to one of Table's CF-definitions) | |
* || | |
* ||==>> CELL (identified by a COLUMN_QUALIFIER (byte-array)) | |
* || | |
* ||==>> CELL_ENTRY = [VERSION (timestamp) & VALUE (byte-array)] | |
* | |
* | |
* A TABLE and its component COLUMN_FAMILY_DEFINITIONs are immutably defined when | |
* the TABLE is created. (While a TABLE could theoretically have an unlimited | |
* number of COLUMN_FAMILY_DEFINITIONs, the official Apache HBase reference says | |
* that the physical realities of the HBase architecture enforce a practical | |
* maximum of no more than two or three per TABLE!) | |
* All lower-level constructs (from ROW on down) are created and maintained | |
* by an application via the HBase "put" method. Thus, crucially, all of | |
* these lower-level constructs (including so-called COLUMN_QUALIFIERs) are | |
* treated as application-managed DATA, and not as database-managed METADATA!! | |
* | |
* There are a number of immediately apparent ramifications of this "shifting" | |
* of column-name-maintenance (not to mention column-datatype-maintenance) from | |
* the metadata-realm to the data-realm: not only does this potentially upend | |
* certain basic assumptions about the division between application development | |
* and metadata management, but it could also call for a redefining/realignment | |
* of classical roles within IT organizations, which may otherwise be accustomed | |
* to an explicit separation of duties between DBAs (who are the traditional | |
* custodians of metadata) and developers (who are traditionally responsible | |
* for building applications that manipulate only data - not metadata!). | |
* | |
* @author Daniel Vimont | |
*/ | |
public class HBaseShellSimpleEmulator {

  /** Number of cell-entry versions given to every column family built by a Table. */
  private static final int DEFAULT_MAX_VERSIONS = 3;

  String namespaceName;
  /** All tables known to this emulator, keyed (and therefore listed) by table name. */
  TreeMap<String,Table> tables = new TreeMap<>();

  // Note: HBase tables, rows, etc. have MANY attributes that are not modelled here.
  // These classes comprise a "rough sketch" of HBase structures for learning purposes.

  /**
   * A named table: a fixed set of column-family definitions plus a sorted map
   * of rows keyed by row key.
   */
  public class Table {
    final String TABLE_NAME;
    TreeMap<String,ColumnFamilyDefinition> columnFamilyDefinitions = new TreeMap<>();
    TreeMap<ByteArray,Row> rows = new TreeMap<>();

    /**
     * @param tableName name of the table
     * @param columnFamilyPrefixes one prefix per column family to be defined
     */
    public Table (String tableName, String... columnFamilyPrefixes) {
      this.TABLE_NAME = tableName;
      for (String columnFamilyPrefix : columnFamilyPrefixes) {
        columnFamilyDefinitions.put(columnFamilyPrefix,
            new ColumnFamilyDefinition(columnFamilyPrefix, DEFAULT_MAX_VERSIONS));
      }
    }

    /**
     * Puts one or more cell values into the row identified by rowKey, creating
     * the row if it does not exist yet. A failed put prints a "PUT failed"
     * message and leaves the table unchanged.
     *
     * @param rowKey key of the row to create or update
     * @param columnVarArgPairs alternating column IDs ("prefix:qualifier") and values
     */
    public void put (byte[] rowKey, byte[]... columnVarArgPairs) {
      tryPut(rowKey, columnVarArgPairs);
    }

    /**
     * Same as {@link #put}, but reports the outcome to the caller.
     *
     * @return true if the values were stored, false if the put was rejected
     */
    boolean tryPut (byte[] rowKey, byte[]... columnVarArgPairs) {
      if (rowKey == null) {
        System.out.println("PUT failed -- Row key must not be null");
        return false;
      }
      // An existing row must be updated in place; replacing it would discard
      // its other cells and all previously stored versions.
      Row existingRow = rows.get(new ByteArray(rowKey));
      if (existingRow != null) {
        return existingRow.put(columnVarArgPairs);
      }
      Row newRow = new Row(rowKey, columnVarArgPairs);
      if (newRow.ROW_KEY == null) {
        return false; // Row constructor rejected the input and already reported why.
      }
      rows.put(newRow.ROW_KEY, newRow);
      return true;
    }

    /** Table-level definition of a column family: its prefix and its version limit. */
    public class ColumnFamilyDefinition {
      final String COLUMN_FAMILY_PREFIX;
      int maxVersions; // maximum cell-entry versions to be permitted (modifiable)

      public ColumnFamilyDefinition (String columnFamilyPrefix, int maxVersions) {
        this.COLUMN_FAMILY_PREFIX = columnFamilyPrefix;
        this.maxVersions = maxVersions;
      }
    }

    /** A row: a row key plus the column-family instances that hold its cells. */
    public class Row {
      /** Key of this row; null if the constructor rejected its input. */
      final ByteArray ROW_KEY;
      TreeMap<String,ColumnFamily> columnFamilies = new TreeMap<>();

      /**
       * Builds a row from pairs of varArg byte-arrays. The first of each pair is
       * the colon-delimited column ID (column-family-prefix:column-qualifier)
       * that uniquely identifies a cell in the row; the second is the value to
       * be put in the cell. If the input is invalid, a "PUT failed" message is
       * printed, ROW_KEY is set to null and no cells are stored.
       */
      public Row (byte[] rowKey, byte[]... columnVarArgPairs) {
        String invalidMsg = (rowKey == null)
            ? "Row key must not be null"
            : validate(columnVarArgPairs);
        if (invalidMsg != null) {
          this.ROW_KEY = null;
          System.out.println("PUT failed -- " + invalidMsg);
        } else {
          this.ROW_KEY = new ByteArray(rowKey);
          apply(columnVarArgPairs);
        }
      }

      /**
       * Adds cell values to this (already existing) row. All pairs are validated
       * before any is stored, so a rejected put leaves the row untouched.
       *
       * @param columnVarArgPairs alternating column IDs and values
       * @return true if the values were stored, false if the input was rejected
       */
      public boolean put (byte[]... columnVarArgPairs) {
        String invalidMsg = validate(columnVarArgPairs);
        if (invalidMsg != null) {
          System.out.println("PUT failed -- " + invalidMsg);
          return false;
        }
        apply(columnVarArgPairs);
        return true;
      }

      /** Returns a description of the first problem found, or null if the pairs are valid. */
      private String validate (byte[][] columnVarArgPairs) {
        if (columnVarArgPairs.length % 2 != 0) {
          return "Column ID submitted without a value";
        }
        for (int i = 0; i < columnVarArgPairs.length; i += 2) {
          byte[] columnId = columnVarArgPairs[i];
          if (columnId == null || columnVarArgPairs[i + 1] == null) {
            return "Null Column ID or value submitted";
          }
          int delimiterPosition = delimiterIndex(columnId);
          if (delimiterPosition < 0) {
            return "Invalid Column ID submitted: "
                + new String(columnId, StandardCharsets.UTF_8);
          }
          String columnFamilyPrefix
              = new String(columnId, 0, delimiterPosition, StandardCharsets.UTF_8);
          if (!columnFamilyDefinitions.containsKey(columnFamilyPrefix)) {
            return "Invalid Column PREFIX submitted: " + columnFamilyPrefix;
          }
        }
        return null;
      }

      /** Stores every (column ID, value) pair; the pairs must already be validated. */
      private void apply (byte[][] columnVarArgPairs) {
        for (int i = 0; i < columnVarArgPairs.length; i += 2) {
          byte[] columnId = columnVarArgPairs[i];
          int delimiterPosition = delimiterIndex(columnId);
          String columnFamilyPrefix
              = new String(columnId, 0, delimiterPosition, StandardCharsets.UTF_8);
          // Everything after the FIRST colon is the qualifier. It is kept as raw
          // bytes so that it may contain further colons or non-text data.
          byte[] columnQualifier
              = Arrays.copyOfRange(columnId, delimiterPosition + 1, columnId.length);
          ColumnFamily columnFamily = columnFamilies.get(columnFamilyPrefix);
          if (columnFamily == null) {
            columnFamily
                = new ColumnFamily(columnFamilyDefinitions.get(columnFamilyPrefix));
            columnFamilies.put(columnFamilyPrefix, columnFamily);
          }
          columnFamily.put(columnQualifier, columnVarArgPairs[i + 1]);
        }
      }

      /** Returns the position of the first ':' byte in columnId, or -1 if there is none. */
      private int delimiterIndex (byte[] columnId) {
        for (int i = 0; i < columnId.length; i++) {
          if (columnId[i] == (byte) ':') {
            return i;
          }
        }
        return -1;
      }

      /** Row-level instance of a column family: the cells stored under one prefix. */
      public class ColumnFamily {
        final ColumnFamilyDefinition COLUMN_FAMILY_DEFINITION;
        TreeMap<ByteArray,Cell> cells = new TreeMap<>();

        public ColumnFamily (ColumnFamilyDefinition columnFamilyDefinition) {
          this.COLUMN_FAMILY_DEFINITION = columnFamilyDefinition;
        }

        /** Adds a new version of value to the cell named by columnQualifier, creating the cell if needed. */
        public void put (byte[] columnQualifier, byte[] value) {
          ByteArray cellKey = new ByteArray(columnQualifier);
          Cell cell = cells.get(cellKey);
          if (cell == null) {
            cell = new Cell(columnQualifier);
            cells.put(cell.COLUMN_QUALIFIER, cell);
          }
          cell.put(value);
        }

        /** A cell: a column qualifier plus its versioned entries. */
        public class Cell {
          final ByteArray COLUMN_QUALIFIER;
          // Versioned cellEntries, keyed by (Long.MAX_VALUE - version) so that
          // iteration yields the most recent entry first.
          TreeMap<Long,CellEntry> cellEntries = new TreeMap<>();

          public Cell (byte[] columnQualifier) {
            COLUMN_QUALIFIER = new ByteArray(columnQualifier);
          }

          /** Stores value as the newest version of this cell. */
          public void put (byte[] value) {
            long version = System.currentTimeMillis();
            // Two puts within the same millisecond (or a clock that steps back)
            // would otherwise collide on the map key and overwrite a version.
            if (!cellEntries.isEmpty()) {
              long newestVersion = cellEntries.firstEntry().getValue().VERSION;
              if (version <= newestVersion) {
                version = newestVersion + 1;
              }
            }
            // Cell entries are ordered with most recent entries first.
            cellEntries.put(Long.MAX_VALUE - version, new CellEntry(version, value));
          }

          /** One version of a cell's content: a timestamp and the raw value. */
          public class CellEntry {
            final long VERSION;
            final ByteArray VALUE; // raw data of any format

            public CellEntry (long version, byte[] value) {
              this.VERSION = version;
              this.VALUE = new ByteArray(value);
            }

            /** Creates an entry whose version is the current timestamp. */
            public CellEntry (byte[] value) {
              // version defaults to current timestamp
              this(System.currentTimeMillis(), value);
              // assure subsequent CellEntries have different timestamp
              try {
                TimeUnit.MILLISECONDS.sleep(1L);
              } catch (InterruptedException e) {
                // Keep the interrupt visible to the caller instead of swallowing it.
                Thread.currentThread().interrupt();
              }
            }
          }
        }
      }
    }
  }

  /**
   * Wrapper that lets a byte-array be used as a sorted-map key. Ordering is
   * byte-by-byte with each byte treated as unsigned; a shorter array sorts
   * before a longer array that starts with the same bytes.
   */
  public class ByteArray implements Comparable<ByteArray> {
    final private byte[] BYTE_ARRAY;

    public ByteArray (byte[] byteArray) {
      // Copy, so later changes to the caller's array cannot alter a map key.
      this.BYTE_ARRAY = byteArray.clone();
    }

    @Override
    public String toString() {
      return new String(this.BYTE_ARRAY, StandardCharsets.UTF_8);
    }

    @Override
    public int compareTo(ByteArray other) {
      int commonLength = Math.min(this.BYTE_ARRAY.length, other.BYTE_ARRAY.length);
      for (int i = 0; i < commonLength; i++) {
        int a = (this.BYTE_ARRAY[i] & 0xff);
        int b = (other.BYTE_ARRAY[i] & 0xff);
        if (a != b) {
          return Integer.compare(a, b);
        }
      }
      return Integer.compare(this.BYTE_ARRAY.length, other.BYTE_ARRAY.length);
    }

    @Override
    public boolean equals(Object o) {
      if (this == o) {
        return true;
      }
      if (!(o instanceof ByteArray)) {
        return false;
      }
      return Arrays.equals(this.BYTE_ARRAY, ((ByteArray) o).BYTE_ARRAY);
    }

    @Override
    public int hashCode() {
      return Arrays.hashCode(this.BYTE_ARRAY);
    }
  }

  //============================================================================
  // Emulation of some HBase shell commands:
  // These methods roughly emulate the HBase shell commands of the same names.
  //============================================================================

  /** Creates (or replaces) a table with the given column families and prints them. */
  public void create (String tableName, String... columnFamilyPrefixes) {
    Table table = new Table(tableName, columnFamilyPrefixes);
    tables.put(tableName, table);
    System.out.println("Table " + tableName
        + " has been created with the following column families...");
    printColumnFamilies(table);
    System.out.println();
  }

  /** Prints every table name together with its column families. */
  public void list () {
    System.out.println("TABLE LIST\n==========");
    for (Table table : tables.values()) {
      System.out.println(table.TABLE_NAME);
      printColumnFamilies(table);
    }
    System.out.println();
  }

  /** Prints the column-family prefixes defined for the table. */
  public void printColumnFamilies (Table table) {
    System.out.println(" Column families:");
    for (Table.ColumnFamilyDefinition columnFamilyDefinition
        : table.columnFamilyDefinitions.values()) {
      System.out.println(" " + columnFamilyDefinition.COLUMN_FAMILY_PREFIX);
    }
  }

  /**
   * Puts cell values into a row of the named table and prints the outcome.
   *
   * @param tableName name of an existing table
   * @param rowKey key of the row to create or update
   * @param columnVarArgPairs alternating column IDs ("prefix:qualifier") and values
   */
  public void put (String tableName, byte[] rowKey, byte[]... columnVarArgPairs) {
    Table table = tables.get(tableName);
    if (table == null) {
      System.out.println("Invalid table name: " + tableName);
      return;
    }
    // Only report completion when the put was actually applied.
    if (table.tryPut(rowKey, columnVarArgPairs)) {
      System.out.println("PUT to table [" + tableName + "] row ["
          + new String(rowKey, StandardCharsets.UTF_8) + "] completed.\n");
    }
  }

  /** Prints every row of the named table in row-key order. */
  public void scan (String tableName) {
    Table table = tables.get(tableName);
    if (table == null) {
      System.out.println("Table not found: " + tableName);
      return;
    }
    System.out.println("Scan of table: " + tableName);
    for (Table.Row row : table.rows.values()) {
      printRow(row);
    }
    System.out.println();
  }

  /** Prints the single row identified by rowKey, or a not-found message. */
  public void get (String tableName, byte[] rowKey) {
    Table table = tables.get(tableName);
    if (table == null) {
      System.out.println("Table not found: " + tableName);
      return;
    }
    Table.Row row = table.rows.get(new ByteArray(rowKey));
    if (row == null) {
      System.out.println("Row: " + new String(rowKey, StandardCharsets.UTF_8)
          + " not found on table: " + tableName);
      return;
    }
    System.out.println("Get row from table: " + tableName);
    printRow(row);
    System.out.println();
  }

  /**
   * Prints a row's non-empty column families, their cells, and for each cell
   * at most maxVersions entries, newest first.
   */
  public void printRow (Table.Row row) {
    System.out.println(" Row: " + row.ROW_KEY);
    for (Table.Row.ColumnFamily columnFamily : row.columnFamilies.values()) {
      if (columnFamily.cells.isEmpty()) {
        continue;
      }
      String columnFamilyPrefix
          = columnFamily.COLUMN_FAMILY_DEFINITION.COLUMN_FAMILY_PREFIX;
      System.out.println(" Column Family: " + columnFamilyPrefix);
      for (Table.Row.ColumnFamily.Cell cell : columnFamily.cells.values()) {
        System.out.println(" Cell: " + columnFamilyPrefix
            + ":" + cell.COLUMN_QUALIFIER);
        System.out.println(" Cell Entries (most recent version on top)");
        System.out.println(" =========================================");
        int versionCount = 0;
        for (Table.Row.ColumnFamily.Cell.CellEntry cellEntry : cell.cellEntries.values()) {
          if (++versionCount > columnFamily.COLUMN_FAMILY_DEFINITION.maxVersions) {
            break; // Version entries exceeding maxVersions are "hidden".
          }
          System.out.println(" Version: [" + cellEntry.VERSION
              + "] Value: [" + cellEntry.VALUE + "]");
        }
      }
    }
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment