@omalley
Last active August 31, 2018 13:44
package org.apache.orc.examples;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

public class CoreWriter {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    // Define the file schema: a struct with an int column x and a string column y.
    TypeDescription schema =
        TypeDescription.fromString("struct<x:int,y:string>");
    // Create a writer for my-file.orc in the current directory.
    Writer writer = OrcFile.createWriter(new Path("my-file.orc"),
        OrcFile.writerOptions(conf)
            .setSchema(schema));
    // Rows are buffered into a VectorizedRowBatch before being written out.
    VectorizedRowBatch batch = schema.createRowBatch();
    LongColumnVector x = (LongColumnVector) batch.cols[0];
    BytesColumnVector y = (BytesColumnVector) batch.cols[1];
    for(int r=0; r < 10000; ++r) {
      int row = batch.size++;
      x.vector[row] = r;
      byte[] buffer = ("Last-" + (r * 3)).getBytes(StandardCharsets.UTF_8);
      y.setRef(row, buffer, 0, buffer.length);
      // If the batch is full, write it out and start over.
      if (batch.size == batch.getMaxSize()) {
        writer.addRowBatch(batch);
        batch.reset();
      }
    }
    // Flush any remaining rows and close the file.
    if (batch.size != 0) {
      writer.addRowBatch(batch);
    }
    writer.close();
  }
}
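
For completeness, reading the file back uses the same core API. The following is a minimal sketch, not part of the original gist: it assumes the file and schema written by CoreWriter above, and uses org.apache.orc.Reader and RecordReader to iterate batch by batch.

    package org.apache.orc.examples;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
    import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
    import org.apache.orc.OrcFile;
    import org.apache.orc.Reader;
    import org.apache.orc.RecordReader;

    import java.io.IOException;

    public class CoreReader {
      public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Open the file written by CoreWriter above.
        Reader reader = OrcFile.createReader(new Path("my-file.orc"),
            OrcFile.readerOptions(conf));
        // Iterate over the file one batch at a time.
        RecordReader rows = reader.rows();
        VectorizedRowBatch batch = reader.getSchema().createRowBatch();
        LongColumnVector x = (LongColumnVector) batch.cols[0];
        BytesColumnVector y = (BytesColumnVector) batch.cols[1];
        while (rows.nextBatch(batch)) {
          for (int row = 0; row < batch.size; ++row) {
            // Print each row as "x, y".
            System.out.println(x.vector[row] + ", " + y.toString(row));
          }
        }
        rows.close();
      }
    }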
@sergeisolarev
On Windows, the code doesn't work without {HADOOP}\bin\winutils.exe
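
For readers hitting this: a common Windows workaround (not part of the gist, and the path below is an assumption) is to download a winutils.exe matching your Hadoop version, place it under a bin directory, and point hadoop.home.dir at the parent directory before any Hadoop class is loaded. A minimal sketch:

    // Hypothetical wrapper showing the Windows workaround; the path is an assumption.
    // Assumes winutils.exe has been placed in C:\hadoop\bin.
    public class CoreWriterWindows {
      static {
        // Must run before Hadoop's shell utilities look up hadoop.home.dir.
        System.setProperty("hadoop.home.dir", "C:\\hadoop");
      }

      public static void main(String[] args) throws java.io.IOException {
        // Delegate to the gist's writer once the property is set.
        CoreWriter.main(args);
      }
    }

Setting the HADOOP_HOME environment variable to the same directory before launching the JVM works as well.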
