@dain
Last active December 10, 2017 02:41
Simple program to scan an ORC file
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.orc;
import com.facebook.presto.orc.metadata.OrcMetadataReader;
import com.facebook.presto.spi.block.Block;
import com.facebook.presto.spi.type.Type;
import com.facebook.presto.testing.TestingConnectorSession;
import com.google.common.collect.ImmutableMap;
import io.airlift.units.DataSize;
import org.joda.time.DateTimeZone;
import java.io.File;
import java.io.IOException;
import java.util.Map;
import java.util.Map.Entry;
import static com.facebook.presto.spi.type.BigintType.BIGINT;
import static com.facebook.presto.spi.type.DoubleType.DOUBLE;
import static com.facebook.presto.spi.type.VarcharType.VARCHAR;
import static io.airlift.units.DataSize.Unit.MEGABYTE;
@SuppressWarnings("UseOfSystemOutOrSystemErr")
public final class ScanOrcFile
{
    public static void main(String... args)
            throws Exception
    {
        File file = new File(args[0]);
        FileOrcDataSource orcDataSource = new FileOrcDataSource(file, new DataSize(1, MEGABYTE), new DataSize(8, MEGABYTE), new DataSize(8, MEGABYTE));
        OrcReader orcReader = new OrcReader(orcDataSource, new OrcMetadataReader(), new DataSize(1, MEGABYTE), new DataSize(8, MEGABYTE));

        //
        // Set your column types here
        //
        Map<Integer, Type> columnTypes = ImmutableMap.<Integer, Type>builder()
                .put(0, BIGINT)
                .put(1, DOUBLE)
                .put(2, VARCHAR)
                .build();

        OrcRecordReader recordReader = orcReader.createRecordReader(columnTypes, OrcPredicate.TRUE, DateTimeZone.getDefault());

        long rows = 0;
        for (int batchSize = recordReader.nextBatch(); batchSize > 0; batchSize = recordReader.nextBatch()) {
            readBatch(columnTypes, recordReader);
            rows += batchSize;
        }

        System.out.println();
        System.out.println("rows: " + rows);
    }
    private static void readBatch(Map<Integer, Type> columnTypes, OrcRecordReader recordReader)
            throws IOException
    {
        // print the first value of each column in the current batch;
        // nextBatch() is called only in the main loop, so no batches are skipped
        for (Entry<Integer, Type> entry : columnTypes.entrySet()) {
            Block block = recordReader.readBlock(entry.getValue(), entry.getKey());
            System.out.print(entry.getValue().getObjectValue(TestingConnectorSession.SESSION, block, 0));
            System.out.print(",");
        }
        System.out.println();
    }
}
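
ScanOrcFile prints only the first value of each batch. To dump every row instead, a helper along these lines should work; it is a sketch that reuses only the calls already used above (printBatch is a hypothetical name, not part of the gist):

    // Sketch: print every value of every column in the current batch.
    private static void printBatch(Map<Integer, Type> columnTypes, OrcRecordReader recordReader)
            throws IOException
    {
        // read all requested columns for the current batch; the ImmutableMap
        // iteration order is stable, so types[] and blocks[] line up
        Type[] types = new Type[columnTypes.size()];
        Block[] blocks = new Block[columnTypes.size()];
        int column = 0;
        for (Entry<Integer, Type> entry : columnTypes.entrySet()) {
            types[column] = entry.getValue();
            blocks[column] = recordReader.readBlock(entry.getValue(), entry.getKey());
            column++;
        }

        // all blocks in a batch have the same position count
        for (int position = 0; position < blocks[0].getPositionCount(); position++) {
            for (int i = 0; i < blocks.length; i++) {
                System.out.print(types[i].getObjectValue(TestingConnectorSession.SESSION, blocks[i], position));
                System.out.print(",");
            }
            System.out.println();
        }
    }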
@galabing commented Jun 9, 2016

Hi Dain,

Thanks for the example code. It's been very helpful for me to start playing with ORC.

Could you explain further how to get lazy materialization to work? I adapted the above code into the simple example below. After reading the blocks, I found their class to be FixedWidthBlock rather than LazyFixedWidthBlock, which, if I understand correctly, means the blocks are not lazily loaded. Do you know how I can turn on lazy materialization?

Thanks.

import static com.facebook.presto.spi.type.BigintType.BIGINT;
import static com.facebook.presto.spi.type.DoubleType.DOUBLE;

import java.io.File;
import java.io.IOException;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Writer;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory.ObjectInspectorOptions;
import org.joda.time.DateTimeZone;

import com.facebook.presto.orc.FileOrcDataSource;
import com.facebook.presto.orc.OrcDataSource;
import com.facebook.presto.orc.OrcReader;
import com.facebook.presto.orc.OrcRecordReader;
import com.facebook.presto.orc.TupleDomainOrcPredicate;
import com.facebook.presto.orc.memory.AggregatedMemoryContext;
import com.facebook.presto.orc.metadata.OrcMetadataReader;
import com.facebook.presto.spi.block.Block;
import com.facebook.presto.spi.type.Type;
import com.google.common.collect.ImmutableMap;

import io.airlift.units.DataSize;
import io.airlift.units.DataSize.Unit;

public class TestOrc {

  private static final String ORC_FILE = "/tmp/test.orc";

  private static class TestStruct {
    public final long key;
    public final double value;

    public TestStruct(long key, double value) {
      this.key = key;
      this.value = value;
    }
  }

  private static void write() throws IOException {
    ObjectInspector inspector = ObjectInspectorFactory
        .getReflectionObjectInspector(
            TestStruct.class,
            ObjectInspectorOptions.JAVA);
    OrcFile.WriterOptions options = OrcFile.writerOptions(new Configuration())
        .inspector(inspector);
    Writer writer = OrcFile.createWriter(new Path(ORC_FILE), options);

    for (int i = 0; i < 10000; ++i) {
      writer.addRow(new TestStruct(i, i * 2));
    }
    writer.close();
  }

  private static void read() throws IOException {
    OrcDataSource source = new FileOrcDataSource(
        new File(ORC_FILE),
        new DataSize(1, Unit.MEGABYTE),
        new DataSize(8, Unit.MEGABYTE),
        new DataSize(8, Unit.MEGABYTE));
    OrcReader reader = new OrcReader(
        source,
        new OrcMetadataReader(),
        new DataSize(1, Unit.MEGABYTE),
        new DataSize(8, Unit.MEGABYTE));

    Map<Integer, Type> columns = ImmutableMap.<Integer, Type>builder()
        .put(0, BIGINT)
        .put(1, DOUBLE)
        .build();
    OrcRecordReader recordReader = reader.createRecordReader(
        columns, TupleDomainOrcPredicate.TRUE, DateTimeZone.UTC,
        new AggregatedMemoryContext());

    long rows = 0;
    for (int batchSize = recordReader.nextBatch(); batchSize > 0;
        batchSize = recordReader.nextBatch()) {
      rows += batchSize;
      for (Entry<Integer, Type> entry : columns.entrySet()) {
        Block block = recordReader.readBlock(entry.getValue(), entry.getKey());
        System.out.println(block.getClass());
      }
    }
    System.out.println(rows);
  }

  public static void main(String[] args) throws IOException {
    write();
    read();
  }
}
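
Regarding the lazy materialization question above: with this standalone reader, readBlock appears to materialize the column eagerly; in Presto itself the laziness comes from the Hive connector's page source, which wraps each column in a lazy block whose loader only invokes readBlock the first time the block is accessed. Below is a rough sketch of that wrapping pattern, assuming the LazyBlock/LazyBlockLoader SPI of this Presto version; the exact class names and signatures may differ, so treat it as an assumption rather than a confirmed answer.

// Sketch only: defer column materialization by wrapping the read in a LazyBlock.
// LazyBlock, LazyBlockLoader, and setBlock are assumed SPI names; verify them
// against the presto-spi version you are building against.
import java.io.IOException;
import java.io.UncheckedIOException;

import com.facebook.presto.orc.OrcRecordReader;
import com.facebook.presto.spi.block.LazyBlock;
import com.facebook.presto.spi.block.LazyBlockLoader;
import com.facebook.presto.spi.type.Type;

final class LazyColumnLoader
        implements LazyBlockLoader<LazyBlock>
{
    private final OrcRecordReader recordReader;
    private final int columnIndex;
    private final Type type;
    private boolean loaded;

    LazyColumnLoader(OrcRecordReader recordReader, int columnIndex, Type type)
    {
        this.recordReader = recordReader;
        this.columnIndex = columnIndex;
        this.type = type;
    }

    @Override
    public void load(LazyBlock lazyBlock)
    {
        if (loaded) {
            return;
        }
        try {
            // the actual column read happens only on first access
            lazyBlock.setBlock(recordReader.readBlock(type, columnIndex));
        }
        catch (IOException e) {
            throw new UncheckedIOException(e);
        }
        loaded = true;
    }
}

// Usage inside the batch loop, instead of calling readBlock eagerly:
// Block block = new LazyBlock(batchSize, new LazyColumnLoader(recordReader, columnIndex, type));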
