Skip to content

Instantly share code, notes, and snippets.

@kekcsi
Created February 15, 2020 14:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kekcsi/e60b9b6030149b125c02d292bc632c18 to your computer and use it in GitHub Desktop.
Save kekcsi/e60b9b6030149b125c02d292bc632c18 to your computer and use it in GitHub Desktop.
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
/**
* -- Big Q Sort --
*
* A variation of Quicksort algorithm for in-place sorting of data that does not completely fit into RAM.
*
* The basic idea of this generalization is that pointers used for the partitioning are file block indexes.
* A step of the original Quicksort algorithm takes two records to compare and conditionally swaps them.
* This algorithm on the other hand, will take two blocks of records to fill a buffer in memory.
* Instead of swapping, it runs a sorting algorithm on the buffer and then blocks are written back to the file
* before the corresponding pointer is moved.
*
* Pointers used for the partitioning are file block indexes but pivot is still a single record like in Quicksort.
*
* Otherwise the program flow is similar to that of Quicksort.
*
* -- This Implementation --
*
* Pivot selection saves block reads by selecting a record from the memory, which may not be an optimal pivot.
* If there are repeated keys, the actual record that plays the pivot role may change while partitioning in this
* implementation, it is only its key that needs to stay constant.
*
* In the demonstration, records are bytes of the file "sort_me.txt", key is the signed value, comparator is <
*
* The buffer is assumed to be smaller than the file.
*
* The output file will be a multiple of the block size, therefore program adds some null bytes and orders them. In a
* production implementation this would be bad; but there is no need to complicate this POC to prevent the issue.
*/
public class BigQ {
private static final int BLOCK_SIZE = 11;
private static byte[] buffer = new byte[2*BLOCK_SIZE];
private static final File SORT_ME = new File("sort_me.txt");
static class BlockInterval {
long low;
long high;
public BlockInterval(long low, long high) {
assert low < high;
this.low = low;
this.high = high;
}
}
public static void main(String[] args) throws IOException {
BlockFile blockFile = new BlockFile(SORT_ME, "rws", BLOCK_SIZE, buffer);
// divide et impera starts with the whole file as interval to sort
Deque<BlockInterval> openList = new ArrayDeque<>();
openList.push(new BlockInterval(0L, blockFile.nBlocks - 1));
do {
BlockInterval outerBounds = openList.poll();
// initial fill of buffer (both halves)
assert outerBounds.low < outerBounds.high; // assuming at least 2 blocks span
blockFile.readBlock(outerBounds.low, BufferHalf.LEFT);
blockFile.readBlock(outerBounds.high, BufferHalf.RIGHT);
Arrays.sort(buffer);
if (outerBounds.low + 1 < outerBounds.high) {
// interval that does not completely fit into one buffer - partitioning needed
long doneBelow = outerBounds.low;
long doneAbove = outerBounds.high;
// select pivot (does not affect correctness but efficiency)
int pivotIndex = BLOCK_SIZE - (int)System.currentTimeMillis()%2;
byte pivotKey = buffer[pivotIndex];
partitioning:
while (true) { // buffer is sorted at this point
if (pivotIndex >= BLOCK_SIZE) {
// left half of buffer contains sub-pivot keys only, pivot is in the right half
blockFile.writeBlock(doneBelow, BufferHalf.LEFT);
doneBelow++;
if (doneBelow >= doneAbove) {
// done pointers collide, conclude partitioning by writing out the rest of records
blockFile.writeBlock(doneAbove, BufferHalf.RIGHT);
break partitioning;
}
blockFile.readBlock(doneBelow, BufferHalf.LEFT);
Arrays.sort(buffer);
// by restoring sortedness of the buffer, pivot may have moved some positions left
while (buffer[pivotIndex] > pivotKey) {
pivotIndex--;
}
} else {
// right half of buffer contains super-pivot keys only, pivot is in the left half
blockFile.writeBlock(doneAbove, BufferHalf.RIGHT);
doneAbove--;
if (doneAbove <= doneBelow) {
// done pointers collide, conclude partitioning by writing out the rest of records
blockFile.writeBlock(doneBelow, BufferHalf.LEFT);
break partitioning;
}
blockFile.readBlock(doneAbove, BufferHalf.RIGHT);
Arrays.sort(buffer);
// by restoring sortedness of the buffer, pivot may have moved some positions right
while (buffer[pivotIndex] < pivotKey) {
pivotIndex++;
}
}
}
// found the final block for the pivot (may still move in the block since the block contains 0 or more
// lower keys than the pivot, 1 or more pivot keys, and 0 or more higher keys)
assert doneBelow == doneAbove;
// when all records of a block are settled, exclude that block from further processing
if (doneBelow == outerBounds.high && buffer[BLOCK_SIZE] == pivotKey) {
doneBelow--;
}
if (doneAbove == outerBounds.low && buffer[BLOCK_SIZE - 1] == pivotKey) {
doneAbove++;
}
// keep partly processed and undone blocks in circulation
if (doneBelow > outerBounds.low) {
openList.push(new BlockInterval(outerBounds.low, doneBelow));
}
if (doneAbove < outerBounds.high) {
openList.push(new BlockInterval(doneAbove, outerBounds.high));
}
} else {
// degenerate interval of 2 blocks - sort already done in memory, just persist the result
blockFile.writeBlock(outerBounds.low, BufferHalf.LEFT);
blockFile.writeBlock(outerBounds.high, BufferHalf.RIGHT);
}
} while (!openList.isEmpty());
blockFile.close();
}
}
enum BufferHalf {
LEFT(0),
RIGHT(1);
int i;
BufferHalf(int i) {
this.i = i;
}
}
class BlockFile extends RandomAccessFile {
long nBlocks;
private final int blockSize;
private final byte[] buffer;
// analysis stuff
static int readCount = 0, writeCount = 0;
public BlockFile(File file, String mode, int blockSize, byte[] buffer) throws FileNotFoundException {
super(file, mode);
this.blockSize = blockSize;
this.buffer = buffer;
nBlocks = (file.length() + blockSize - 1)/blockSize;
}
public void readBlock(long blockIndex, BufferHalf bufferHalf) throws IOException {
seek(blockSize*blockIndex);
read(buffer, blockSize*bufferHalf.i, blockSize);
readCount++;
}
public void writeBlock(long blockIndex, BufferHalf bufferHalf) throws IOException {
seek(blockSize*blockIndex);
write(buffer, blockSize*bufferHalf.i, blockSize);
writeCount++;
}
@Override
public void close() throws IOException {
super.close();
System.out.println("N = " + nBlocks);
System.out.println("N*log(N) = " + nBlocks*Math.log(nBlocks)/0.2);
System.out.println("N*N/2 = " + nBlocks*nBlocks/2);
System.out.println(readCount + " reads");
System.out.println(writeCount + " writes");
}
}
@kekcsi
Copy link
Author

kekcsi commented Feb 15, 2020

N: number of blocks
n: number of records per block
A: one continuous array of 2*n records in memory

OPENLIST is a list of pairs, initially contains one member: {0, N - 1}
sub:
    take {L0, R0} from OPENLIST
    L = L0
    R = R0

    read blocks L, R into A
    sort A in memory
    p = n                                         ;pivot index in A
    k = key of A[p]
    loop:                                         ;on entry, A is always sorted
        if p >= n:                                ;left half of A contains sub-pivot keys only
            store A[0...n - 1] in block L         ;release the half of A where pivot is not found
            L = L + 1                             ;pointer leaves the processed area
            if L >= R:                            ;pointers collide?
                store A[n...2*n - 1] in block R   ;persist the rest of records
                break loop                        ;partitioning done
            read block L into A[0...n - 1]        ;fill released half again from updated pointer
            sort A in memory                      ;restore ordering - pushes pivot left
            while A[p] > k: p = p - 1             ;find a pivot with the same key as before
        else:                                     ;right half of A contains super-pivot keys only
            store A[n...2*n - 1] in block R       ;do the opposite of the above
            R = R - 1
            if R <= L:
                store A[0...n - 1] in block L
                break loop
            read block R into A[n...2*n - 1]
            sort A in memory
            while A[p] < k: p = p + 1
        repeat loop until break
    if L = R0 and A[n] = k: L = L - 1             ;records before pivot are settled, skip that block
    if L > L0: append {L0, L} to OPENLIST
    if R = L0 and A[n - 1] = k: R = R + 1         ;records after pivot are settled, skip that block
    if R < R0: append {R, R0} to OPENLIST
    repeat sub until OPENLIST is empty

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment