Skip to content

Instantly share code, notes, and snippets.

Forked from leifwickland/HFileInputFormat.scala
Created August 14, 2012 19:05
Show Gist options
  • Save coltfred/3351776 to your computer and use it in GitHub Desktop.
Save coltfred/3351776 to your computer and use it in GitHub Desktop.
Allows an HFile to be used as the input to MapReduce.
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.io.hfile.{ HFile, HFileScanner }
import org.apache.hadoop.hbase.KeyValue
import org.apache.hadoop.mapreduce.{ JobContext, InputSplit, TaskAttemptContext, RecordReader }
import org.apache.hadoop.mapreduce.lib.input.{ FileInputFormat, FileSplit }
/**
 * A MapReduce InputFormat for HBase's HFile.
 *
 * Each record's key is the row of the current [[KeyValue]] wrapped in an
 * [[ImmutableBytesWritable]]; the value is the [[KeyValue]] itself.
 */
class HFileInputFormat extends FileInputFormat[ImmutableBytesWritable, KeyValue] {

  /** Always `false`: an HFile is never split. */
  override def isSplitable(context: JobContext, file: Path): Boolean = false

  /**
   * Creates a fresh, uninitialized record reader. The reader opens the file
   * only when its `initialize` method is invoked.
   *
   * @param split   the split to read (unused here; consumed by `initialize`)
   * @param context the task attempt context (unused here; consumed by `initialize`)
   * @return a new reader over `(row, KeyValue)` pairs
   */
  def createRecordReader(split: InputSplit, context: TaskAttemptContext): RecordReader[ImmutableBytesWritable, KeyValue] = {
    new HFileRecordReader
  }

  /** Reads the `KeyValue` entries of a single HFile in scanner order. */
  private class HFileRecordReader extends RecordReader[ImmutableBytesWritable, KeyValue] {
    // Both handles stay null until initialize() has run.
    private var reader: HFile.Reader = _
    private var scanner: HFileScanner = _
    // Number of nextKeyValue calls made so far; only used for progress reporting.
    private var entryNumber: Int = 0

    /** Closes the underlying HFile reader if one was opened. */
    def close(): Unit = {
      // NOTE(review): the guarded statement was missing from the extracted source;
      // closing the reader is the reconstruction — confirm against the original gist.
      Option(reader).foreach(_.close())
    }

    /** @return the row of the KeyValue the scanner is currently positioned on */
    def getCurrentKey: ImmutableBytesWritable = new ImmutableBytesWritable(scanner.getKeyValue.getRow)

    /** @return the KeyValue the scanner is currently positioned on */
    def getCurrentValue: KeyValue = scanner.getKeyValue

    /**
     * @return the fraction of entries consumed, capped at 1.0. The cap is needed
     *         because `entryNumber` is also incremented by the final
     *         `nextKeyValue` call that finds no further entry.
     */
    def getProgress: Float = math.min(1.0f, entryNumber / 1.max(reader.getEntries).toFloat)

    /**
     * Opens the HFile named by the split and creates a scanner over it.
     *
     * @param split   must be a `FileSplit`; its path is the HFile to read
     * @param context supplies the Hadoop configuration used to obtain the file system
     */
    def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
      val path = split.asInstanceOf[FileSplit].getPath
      val fs = org.apache.hadoop.fs.FileSystem.get(context.getConfiguration)
      reader = new HFile.Reader(fs, path, null, false)
      scanner = reader.getScanner(false, false)
      reader.loadFileInfo // This is required or else seekTo throws a NPE
    }

    /**
     * Advances to the next entry.
     *
     * @return `true` if the scanner is now positioned on an entry, `false` otherwise
     */
    def nextKeyValue: Boolean = {
      entryNumber += 1
      // NOTE(review): both branch bodies were missing from the extracted source;
      // seekTo/next are reconstructed from the comments below — confirm.
      if (!scanner.isSeeked)
        // Had to move this here because "nextKeyValue" is called before the first getCurrentKey
        // which was causing us to miss the first row of the HFile.
        scanner.seekTo
      else {
        scanner.next
      }
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment