dmagda/join_algorithm_types.md

## join_algorithm_types.md

      
    Raw
  

              join_algorithm_types.md
            
          
    Nested Loop Join

// Loading records that satisfy the WHERE predicate.
// Can be an index or full table scan.
var subsetTable1 = getData(table1, wherePredicate);
var subsetTable2 = getData(table2, wherePredicate);

var resultSet = new LinkedList();

// Doing an inner join.
// Comparing each row from subsetTable1 and subsetTable2
for each (row1 in subsetTable1) {
    for each (row2 in subsetTable2) {
        if (canJoin(row1, row2, joinPredicate)) {
            result.add(new Result(row1,row2);
	}
    }
}
The algorithmic complexity of the join phase/loop is O(n*m), where:

n is the size of the subsetTable1
m is the size of the subsetTable2

Hash Join

// Loading records that satisfy the WHERE predicate.
// Can be an index or full table scan.
var subsetTable1 = getData(table1, wherePredicate);
var subsetTable2 = getData(table2, wherePredicate);

var hashTable1 = new HashTable();

// Creating a hastable from records of subsetTable1
// using the join predicate as a key
for each (row1 in subsetTable1) {
    hashTable1.put(getKey(row1, joinPredicate), row1)  
}

var resultSet = new LinkedList();

// Doing the inner join by comparing each record from
// subsetTable2 against the hashtable
for each (row2 in subsetTable2) {
    var rowsSet1 = hashTable1.get(getKey(row2, joinPredicate));
  
    // There might be several rows from table1 that 
    // satisfy the join predicate
    if (rowsSet1 != null) {
        for each (row1 in rowsSet1) {
            resultSet.add(new Result(row1,row2);
        }
    }
}
The algorithmic complexity of the join phase/loop is O(m+k), where:

m is the size of the subsetTable2
k is the size of the values from rowsSet1 that satisfy the same join predicate. The worst case scenario is n which is the whole subsetTable1

Sort-Merge Join

// Loading records that satisfy the WHERE predicate.
// Can be an index or full table scan.
var subsetTable1 = getData(table1, wherePredicate);
var subsetTable2 = getData(table2, wherePredicate);

// Sorting data using the orderKey (usually, the key
// of the ORDER BY clause)
var sortedTable1 = sort(subsetTable1, orderKey);
var sortedTable2 = sort(subsetTable2, orderKey);

var resultSet = new LinkedList();

// Doing the inner join by merging two data sets
while (sortedTable1.hasNext() || sortedTable2.hasNext()) {
    var row1 = sortedTable1.currentRow();
    var row2 = sortedTable2.currentRow();
  	
    if (row1.orderKey == row2.orderKey) {
        resultSet.add(new Result(row1, row2));
      	sortedTable1.goToNextRow();
      	sortedTable2.goToNextRow();
      
    } else if (row1.orderKey < row2.orderKey) {
      	sortedTable1.goToNextRow();
      
    } else { // row1.orderKey > row2.orderKey
      	sortedTable2.goToNextRow();
    }
}
The algorithmic complexity of the join phase/loop is O(n+m), where:

n is the size of the subsetTable1
m is the size of the subsetTable2