Skip to content

Instantly share code, notes, and snippets.

Created January 31, 2022 06:01
Show Gist options
  • Save h7kanna/211719590a5c9ed11c8a5535d2d5da6d to your computer and use it in GitHub Desktop.
Save h7kanna/211719590a5c9ed11c8a5535d2d5da6d to your computer and use it in GitHub Desktop.
Arrow Parquet and Go
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
// usage is the docopt-format help text for the parquet_reader command; main
// passes it to docopt.ParseDoc to derive the accepted flags and arguments.
//
// The layout is significant: docopt only recognises the invocation patterns
// when they are indented beneath a "Usage:" heading, and it separates each
// option from its description on a run of two or more spaces. Without that
// gap the description words are read as part of the option spec.
const usage = `Parquet Reader.
Usage:
  parquet_reader -h | --help
  parquet_reader [--only-metadata] [--no-memory-map] [--json]
                 [--print-key-value-metadata] [--columns=COLUMNS] <file>
Options:
  -h --help                     Show this screen.
  --print-key-value-metadata    Print out the key-value metadata [default: false]
  --only-metadata               Stop after printing metadata, no values.
  --no-memory-map               Disable memory mapping the file.
  --json                        Format output as JSON instead of text.
  --columns=COLUMNS             Specify a subset of columns to print, comma delimited indexes.`
// main parses the command line described by usage, opens the named Parquet
// file, and prints file-level metadata, the selected column descriptors, and
// per-row-group column-chunk metadata, followed by a "--- Values ---" section
// that pulls values from one Dumper per selected column.
//
// NOTE(review): this copy of the function looks truncated. Closing braces, the
// bodies of several error branches, and whatever consumes the buffer built in
// the value loop are not visible, so the exact nesting and exit behaviour
// cannot be confirmed from this text. Restore from the original file before
// building.
func main() {
// The second result of ParseDoc is discarded, so a parse failure is not
// inspected here.
opts, _ := docopt.ParseDoc(usage)
// config holds the command-line settings read by the rest of main.
// NOTE(review): the call that fills it from opts is not visible in this copy.
var config struct {
PrintKeyValueMetadata bool
OnlyMetadata bool
NoMemoryMap bool
// The struct tag ties this field to the --json flag explicitly.
JSON bool `docopt:"--json"`
Columns string
File string
// JSON output is not implemented; only a warning is written to stderr.
if config.JSON {
fmt.Fprintln(os.Stderr, "error: json output not implemented yet! falling back to regular")
// selectedColumns collects the column indexes parsed from config.Columns,
// which is split on commas and converted with strconv.Atoi.
selectedColumns := []int{}
if config.Columns != "" {
for _, c := range strings.Split(config.Columns, ",") {
cval, err := strconv.Atoi(c)
if err != nil {
// NOTE(review): no exit or continue is visible after this message —
// presumably one was lost; confirm against the original.
fmt.Fprintln(os.Stderr, "error: --columns needs to be comma-delimited integers")
selectedColumns = append(selectedColumns, cval)
// Open the file; the second argument is the negation of config.NoMemoryMap.
rdr, err := file.OpenParquetFile(config.File, !config.NoMemoryMap, nil, nil)
if err != nil {
fmt.Fprintln(os.Stderr, "error opening parquet file: ", err)
// File-level summary.
fileMetadata := rdr.MetaData()
fmt.Println("File name:", config.File)
fmt.Println("Version:", fileMetadata.Version())
fmt.Println("Created By:", fileMetadata.GetCreatedBy())
fmt.Println("Num Rows:", rdr.NumRows())
// Key/value metadata is printed only when requested and present.
keyvaluemeta := fileMetadata.KeyValueMetadata()
if config.PrintKeyValueMetadata && keyvaluemeta != nil {
fmt.Println("Key Value File Metadata:", keyvaluemeta.Len(), "entries")
keys := keyvaluemeta.Keys()
values := keyvaluemeta.Values()
for i := 0; i < keyvaluemeta.Len(); i++ {
fmt.Printf("Key nr %d %s: %s\n", i, keys[i], values[i])
fmt.Println("Number of RowGroups:", rdr.NumRowGroups())
fmt.Println("Number of Real Columns:", fileMetadata.Schema.Root().NumFields())
fmt.Println("Number of Columns:", fileMetadata.Schema.NumColumns())
// With no explicit selection every column index is selected; otherwise each
// requested index is checked against the schema's column count.
if len(selectedColumns) == 0 {
for i := 0; i < fileMetadata.Schema.NumColumns(); i++ {
selectedColumns = append(selectedColumns, i)
} else {
for _, c := range selectedColumns {
if c < 0 || c >= fileMetadata.Schema.NumColumns() {
fmt.Fprintln(os.Stderr, "selected column is out of range")
fmt.Println("Number of Selected Columns:", len(selectedColumns))
// Describe each selected column: index, path and physical type, plus the
// converted type when it is not None, and precision/scale for decimals.
for _, c := range selectedColumns {
descr := fileMetadata.Schema.Column(c)
// The format string opens a parenthesis that is not closed on this line.
fmt.Printf("Column %d: %s (%s", c, descr.Path(), descr.PhysicalType())
if descr.ConvertedType() != schema.ConvertedTypes.None {
fmt.Printf("/%s", descr.ConvertedType())
if descr.ConvertedType() == schema.ConvertedTypes.Decimal {
// Unchecked type assertion: panics if the logical type is not
// *schema.DecimalLogicalType.
dec := descr.LogicalType().(*schema.DecimalLogicalType)
fmt.Printf("(%d,%d)", dec.Precision(), dec.Scale())
// Walk the row groups, printing each one's size and row count.
for r := 0; r < rdr.NumRowGroups(); r++ {
fmt.Println("--- Row Group:", r, " ---")
rgr := rdr.RowGroup(r)
rowGroupMeta := rgr.MetaData()
fmt.Println("--- Total Bytes:", rowGroupMeta.TotalByteSize(), " ---")
fmt.Println("--- Rows:", rgr.NumRows(), " ---")
// Column-chunk metadata for every selected column in this row group.
for _, c := range selectedColumns {
chunkMeta, err := rowGroupMeta.ColumnChunk(c)
if err != nil {
// NOTE(review): the body of this error branch is not visible in this copy.
fmt.Println("Column", c)
// The second result of StatsSet is ignored; only the boolean decides the branch.
if set, _ := chunkMeta.StatsSet(); set {
stats, err := chunkMeta.Statistics()
if err != nil {
// NOTE(review): the body of this error branch is not visible in this copy.
// Printed without a newline so the optional statistics below extend the same line.
fmt.Printf(" Values: %d", chunkMeta.NumValues())
if stats.HasMinMax() {
fmt.Printf(", Min: %v, Max: %v",
metadata.GetStatValue(stats.Type(), stats.EncodeMin()),
metadata.GetStatValue(stats.Type(), stats.EncodeMax()))
if stats.HasNullCount() {
fmt.Printf(", Null Values: %d", stats.NullCount())
if stats.HasDistinctCount() {
fmt.Printf(", Distinct Values: %d", stats.DistinctCount())
} else {
fmt.Println(" Values:", chunkMeta.NumValues(), "Statistics Not Set")
fmt.Print(" Compression: ", chunkMeta.Compression())
fmt.Print(", Encodings:")
for _, enc := range chunkMeta.Encodings() {
fmt.Print(" ", enc)
fmt.Print(" Uncompressed Size: ", chunkMeta.TotalUncompressedSize())
fmt.Println(", Compressed Size:", chunkMeta.TotalCompressedSize())
// NOTE(review): presumably skips the value dump for this row group when
// config.OnlyMetadata is set; the branch body is not visible — confirm.
if config.OnlyMetadata {
fmt.Println("--- Values ---")
// colwidth is the width handed to the header format and to FormatValue.
const colwidth = 1
// One Dumper per selected column; the column name is printed as a header
// using a left-justified "%-<colwidth>s|" format built with Sprintf.
scanners := make([]*Dumper, len(selectedColumns))
for idx, c := range selectedColumns {
scanners[idx] = createDumper(rgr.Column(c))
fmt.Printf(fmt.Sprintf("%%-%ds|", colwidth), rgr.Column(c).Descriptor().Name())
// b accumulates the formatted output.
// NOTE(review): nothing in the visible lines writes b out — confirm where it is consumed.
var b bytes.Buffer
for {
// data records whether any scanner produced a value during this pass.
data := false
for _, s := range scanners {
if val, ok := s.Next(); ok {
// A produced value is formatted and followed by a comma.
fmt.Fprint(&b, s.FormatValue(val, colwidth), ",")
data = true
} else {
// When Next reports no value, whatever val it returned is still
// formatted, with no trailing separator.
fmt.Fprint(&b, s.FormatValue(val, colwidth), "")
fmt.Fprint(&b, "\n")
// NOTE(review): presumably ends the loop once no scanner yields data; the
// branch body is not visible in this copy — confirm.
if !data {
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment