Skip to content

Instantly share code, notes, and snippets.

@ananis25
Created November 27, 2020 05:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ananis25/0b645ef94a70a0834fd23177e8721be9 to your computer and use it in GitHub Desktop.
Save ananis25/0b645ef94a70a0834fd23177e8721be9 to your computer and use it in GitHub Desktop.
Adding metadata to an Arrow field using the C API
#include <stdlib.h>
#include <arrow-glib/arrow-glib.h>
int main(int argc, char **argv)
{
int success = EXIT_FAILURE;
GError *error = NULL;
GArrowMemoryMappedInputStream *input;
GArrowRecordBatchFileReader *reader;
const char *input_path = "ext.arrow";
input = garrow_memory_mapped_input_stream_new(input_path, &error);
if (!input) {
g_print("failed to open file: %s\n", error->message);
g_error_free(error);
goto cleanup;
}
reader = garrow_record_batch_file_reader_new(GARROW_SEEKABLE_INPUT_STREAM(input), &error);
if (!reader) {
g_print("failed to open file reader: %s\n", error->message);
g_error_free(error);
goto cleanup;
}
GArrowRecordBatch *record_batch = garrow_record_batch_file_reader_read_record_batch(reader, 0, &error);
if (!record_batch) {
g_print("failed to open file reader: %s\n", error->message);
g_error_free(error);
goto cleanup;
}
GArrowSchema *schema = garrow_record_batch_get_schema(record_batch);
gboolean true_val = 1;
g_print(garrow_schema_to_string_metadata(schema, true_val));
GHashTable * metadata = garrow_schema_get_metadata(schema);
g_print("size of schema metadata: %d", g_hash_table_size(metadata));
g_object_unref(record_batch);
success = EXIT_SUCCESS;
goto cleanup;
cleanup:
g_object_unref(reader);
g_object_unref(input);
return success;
}
import pyarrow as pa
import pandas as pd
import numpy as np
# copied from pyarrow tests
class PeriodType(pa.ExtensionType):
    """Extension type named 'test.period' that stores int64 values and
    carries a ``freq`` parameter.

    The parameter is serialized as the bytes ``b"freq=<value>"`` and parsed
    back from that form on deserialization.
    """

    def __init__(self, freq):
        # Set the attribute before initializing the base class.
        # NOTE(review): pyarrow may call __arrow_ext_serialize__ during base
        # initialization, so this order should be kept — confirm.
        self._freq = freq
        super().__init__(pa.int64(), 'test.period')

    @property
    def freq(self):
        """The frequency value given at construction."""
        return self._freq

    def __arrow_ext_serialize__(self):
        """Return the type parameters encoded as bytes."""
        text = "freq={}".format(self.freq)
        return text.encode()

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        """Rebuild a PeriodType from the bytes produced by serialization."""
        text = serialized.decode()
        assert text.startswith("freq=")
        # Take the piece between the first and second '='.
        pieces = text.split('=')
        return PeriodType(pieces[1])
# Build a single-column table ("ext") of one million int64 values wrapped in
# PeriodType('D') and write it to "ext.arrow" with the Arrow IPC file writer.
period_type = PeriodType('D')
storage = pa.array(np.arange(1000000), pa.int64())
arr = pa.ExtensionArray.from_storage(period_type, storage)
batch = pa.RecordBatch.from_arrays([arr], ["ext"])
table = pa.Table.from_batches([batch])

filename = "ext.arrow"
# Context managers close the writer first and the stream second — the same
# order as the explicit close() calls they replace — and also do so when
# write_table raises, so the file handle is not left open on error.
with pa.OSFile(filename, "wb") as stream:
    with pa.ipc.new_file(stream, table.schema) as writer:
        writer.write_table(table)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment