Skip to content

Instantly share code, notes, and snippets.

@darionyaphet
Last active May 15, 2017 03:07
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save darionyaphet/73204695efdfa1885416eb464d76cabf to your computer and use it in GitHub Desktop.
Parquet example: Java classes for reading and writing a Parquet file with the example Group API, plus the Maven pom.xml declaring the required dependencies.
package org.darion.yaphet.parquet;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import java.io.IOException;
/**
 * Reads records from a Parquet file using the example {@link Group} API and
 * prints the {@code int32_field} value of each record to stdout.
 */
public class ReadFileExample {
    /**
     * Opens {@code /tmp/PARQUET-1M-BS256M_PS4M} and prints {@code int32_field}
     * for every record, up to an upper bound of 4M records.
     *
     * @param args unused
     * @throws IOException if the Parquet file cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        Configuration configuration = new Configuration();
        Path parquetFile = new Path("/tmp/PARQUET-1M-BS256M_PS4M");
        // try-with-resources guarantees the reader is closed even if read() throws;
        // the original leaked the reader on any exception before close().
        try (ParquetReader<Group> reader =
                ParquetReader.builder(new GroupReadSupport(), parquetFile)
                        .withConf(configuration)
                        .build()) {
            for (int i = 0; i < 1024 * 1024 * 4; i++) {
                Group group = reader.read();
                if (group == null) {
                    // read() returns null at end of file; without this check the
                    // original threw NPE when the file had fewer than 4M records.
                    break;
                }
                System.out.println(group.getInteger("int32_field", 0));
            }
        }
    }
}
package org.darion.yaphet.parquet;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.MessageType;
import java.io.IOException;
import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_2_0;
import static org.apache.parquet.schema.MessageTypeParser.parseMessageType;
/**
 * Writes synthetic records to a Parquet file using the example {@link Group} API,
 * exercising one field of each primitive Parquet type.
 */
public class WriteFileExample {
    /**
     * Writes 128M rows to {@code /tmp/parquet.out} with a 256 MB block (row group)
     * size and a 4 MB page size, uncompressed, in Parquet v2 format.
     *
     * @param args unused
     * @throws IOException if the Parquet file cannot be created or written
     */
    public static void main(String[] args) throws IOException {
        MessageType schema = parseMessageType(
                "message test { "
                        + "required binary binary_field; "
                        + "required int32 int32_field; "
                        + "required int64 int64_field; "
                        + "required boolean boolean_field; "
                        + "required float float_field; "
                        + "required double double_field; "
                        + "required fixed_len_byte_array(64) flba_field; "
                        + "required int96 int96_field; "
                        + "} ");
        Configuration configuration = new Configuration();
        GroupWriteSupport.setSchema(schema, configuration);
        SimpleGroupFactory factory = new SimpleGroupFactory(schema);
        final int blockSize = 256 * 1024 * 1024; // 256 MB row-group size
        final int pageSize = 4 * 1024 * 1024;    // 4 MB page size (fixes "pageSie" typo)
        final int rowCount = 1024 * 1024 * 128;  // 128M rows
        // try-with-resources guarantees close()/flush even if write() throws;
        // the original leaked the writer on any exception before close().
        try (ParquetWriter<Group> writer = new ParquetWriter<Group>(
                new Path("/tmp/parquet.out"),
                new GroupWriteSupport(), CompressionCodecName.UNCOMPRESSED, blockSize,
                pageSize, blockSize, true, false, PARQUET_2_0, configuration)) {
            for (int i = 0; i < rowCount; i++) {
                writer.write(factory.newGroup()
                        .append("binary_field", i + "")
                        .append("int32_field", i)
                        // uppercase L: lowercase 'l' is easily misread as the digit 1
                        .append("int64_field", 64L)
                        .append("boolean_field", true)
                        .append("float_field", 1.0f)
                        .append("double_field", 2.0d)
                        // NOTE(review): flba_field is declared fixed_len_byte_array(64)
                        // but a variable-length decimal string is appended here —
                        // confirm this does not fail at write time for short values.
                        .append("flba_field", i + "")
                        .append("int96_field", Binary.fromConstantByteArray(new byte[12])));
            }
        }
    }
}
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.darion.yaphet</groupId>
    <artifactId>parquet</artifactId>
    <version>1.0-SNAPSHOT</version>

    <!-- Centralize dependency versions: the Parquet version was repeated four
         times, which risks a partial, inconsistent upgrade. -->
    <properties>
        <parquet.version>1.8.1</parquet.version>
        <hadoop.version>2.6.4</hadoop.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-common</artifactId>
            <version>${parquet.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-encoding</artifactId>
            <version>${parquet.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-column</artifactId>
            <version>${parquet.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-hadoop</artifactId>
            <version>${parquet.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>
</project>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment