@hatemhelal
Created June 14, 2019 12:49
Patched version of arrow/cpp/examples/parquet/parquet-arrow/reader-writer.cc
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <iostream>
#include <numeric>
#include <vector>
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/arrow/writer.h>
#include <parquet/exception.h>
#include <parquet/file_reader.h>
// #0 Build dummy data to pass around
// To have some input data, we first create an Arrow Table that holds
// some data.
std::shared_ptr<arrow::Table> generate_table() {
  arrow::Int64Builder i64builder;
  PARQUET_THROW_NOT_OK(i64builder.AppendValues({1, 2, 3, 4, 5}));
  std::shared_ptr<arrow::Array> i64array;
  PARQUET_THROW_NOT_OK(i64builder.Finish(&i64array));
  arrow::StringBuilder strbuilder;
  PARQUET_THROW_NOT_OK(strbuilder.Append("some"));
  PARQUET_THROW_NOT_OK(strbuilder.Append("string"));
  PARQUET_THROW_NOT_OK(strbuilder.Append("content"));
  PARQUET_THROW_NOT_OK(strbuilder.Append("in"));
  PARQUET_THROW_NOT_OK(strbuilder.Append("rows"));
  std::shared_ptr<arrow::Array> strarray;
  PARQUET_THROW_NOT_OK(strbuilder.Finish(&strarray));
  std::shared_ptr<arrow::Schema> schema = arrow::schema(
      {arrow::field("int", arrow::int64()), arrow::field("str", arrow::utf8())});
  return arrow::Table::Make(schema, {i64array, strarray});
}
// #1 Write out the data as a Parquet file
void write_parquet_file(const arrow::Table& table) {
  std::shared_ptr<arrow::io::FileOutputStream> outfile;
  PARQUET_THROW_NOT_OK(
      arrow::io::FileOutputStream::Open("parquet-arrow-example.parquet", &outfile));
  // The last argument to the function call is the size of the RowGroup in
  // the parquet file. Normally you would choose this to be rather large but
  // for the example, we use a small value to have multiple RowGroups.
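  // With five rows and a RowGroup size of 3, the file below ends up with two
  // RowGroups (3 rows + 2 rows).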
  PARQUET_THROW_NOT_OK(
      parquet::arrow::WriteTable(table, arrow::default_memory_pool(), outfile, 3));
}
// #2: Fully read in the file
void read_whole_file() {
  std::cout << "Reading parquet-arrow-example.parquet at once" << std::endl;
  std::shared_ptr<arrow::io::ReadableFile> infile;
  PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open(
      "parquet-arrow-example.parquet", arrow::default_memory_pool(), &infile));
  std::unique_ptr<parquet::arrow::FileReader> reader;
  PARQUET_THROW_NOT_OK(
      parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
  std::shared_ptr<arrow::Table> table;
  PARQUET_THROW_NOT_OK(reader->ReadTable(&table));
  std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns()
            << " columns." << std::endl;
}
// #3: Read only a single RowGroup of the parquet file
void read_single_rowgroup() {
  std::cout << "Reading first RowGroup of parquet-arrow-example.parquet" << std::endl;
  std::shared_ptr<arrow::io::ReadableFile> infile;
  PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open(
      "parquet-arrow-example.parquet", arrow::default_memory_pool(), &infile));
  std::unique_ptr<parquet::arrow::FileReader> reader;
  PARQUET_THROW_NOT_OK(
      parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
  std::shared_ptr<arrow::Table> table;
  PARQUET_THROW_NOT_OK(reader->RowGroup(0)->ReadTable(&table));
  std::cout << "Loaded " << table->num_rows() << " rows in " << table->num_columns()
            << " columns." << std::endl;
}
// #4: Read only a single column of the whole parquet file
void read_single_column() {
  std::cout << "Reading first column of parquet-arrow-example.parquet" << std::endl;
  std::shared_ptr<arrow::io::ReadableFile> infile;
  PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open(
      "parquet-arrow-example.parquet", arrow::default_memory_pool(), &infile));
  std::unique_ptr<parquet::arrow::FileReader> reader;
  PARQUET_THROW_NOT_OK(
      parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
  std::shared_ptr<arrow::ChunkedArray> array;
  PARQUET_THROW_NOT_OK(reader->ReadColumn(0, &array));
  PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout));
  std::cout << std::endl;
}
// #5: Read only a single column of a RowGroup (this is known as ColumnChunk)
// from the Parquet file.
void read_single_column_chunk() {
  std::cout << "Reading first ColumnChunk of the first RowGroup of "
               "parquet-arrow-example.parquet"
            << std::endl;
  std::shared_ptr<arrow::io::ReadableFile> infile;
  PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open(
      "parquet-arrow-example.parquet", arrow::default_memory_pool(), &infile));
  std::unique_ptr<parquet::arrow::FileReader> reader;
  PARQUET_THROW_NOT_OK(
      parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
  std::shared_ptr<arrow::ChunkedArray> array;
  PARQUET_THROW_NOT_OK(reader->RowGroup(0)->Column(0)->Read(&array));
  PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout));
  std::cout << std::endl;
}
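// #6: Write a single-column file, then stream that column back one value at a
// time with parquet::arrow::ColumnReader::NextBatch.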
void read_column_iterative() {
  int num_rows = 100;
  std::vector<int64_t> values(num_rows);
  std::iota(values.begin(), values.end(), 0);
  arrow::Int64Builder builder;
  PARQUET_THROW_NOT_OK(builder.AppendValues(values));
  std::shared_ptr<arrow::Array> array;
  PARQUET_THROW_NOT_OK(builder.Finish(&array));
  auto schema = arrow::schema({arrow::field("int64", arrow::int64())});
  auto table = arrow::Table::Make(schema, {array});
  std::shared_ptr<arrow::io::FileOutputStream> outfile;
  PARQUET_THROW_NOT_OK(
      arrow::io::FileOutputStream::Open("experiment.parquet", &outfile));
  PARQUET_THROW_NOT_OK(parquet::arrow::WriteTable(*table, arrow::default_memory_pool(),
                                                  outfile, num_rows));
  std::cout << "Reading first column of experiment.parquet iteratively" << std::endl;
  std::shared_ptr<arrow::io::ReadableFile> infile;
  PARQUET_THROW_NOT_OK(arrow::io::ReadableFile::Open(
      "experiment.parquet", arrow::default_memory_pool(), &infile));
  std::unique_ptr<parquet::arrow::FileReader> reader;
  PARQUET_THROW_NOT_OK(
      parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
  std::unique_ptr<parquet::arrow::ColumnReader> col_reader;
  PARQUET_THROW_NOT_OK(reader->GetColumn(0, &col_reader));
  for (int i = 0; i < num_rows; ++i) {
    std::shared_ptr<arrow::ChunkedArray> array;
    PARQUET_THROW_NOT_OK(col_reader->NextBatch(1, &array));
    PARQUET_THROW_NOT_OK(arrow::PrettyPrint(*array, 4, &std::cout));
  }
}
int main(int argc, char** argv) {
  std::shared_ptr<arrow::Table> table = generate_table();
  write_parquet_file(*table);
  read_whole_file();
  read_single_rowgroup();
  read_single_column();
  read_single_column_chunk();
  read_column_iterative();
}