Last active
April 28, 2018 17:01
-
-
Save DavisVaughan/fbef89e5b511b25d4698813a6ee0eb30 to your computer and use it in GitHub Desktop.
Exploring Apache Arrow with pyarrow and reticulate
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(reticulate)

# Import pyarrow with convert = FALSE. Later in this script the Python objects
# are inspected as-is and only turned into R objects by an explicit py_to_r().
pa <- import("pyarrow", convert = FALSE)

# Create a list of pyarrow arrays.
# This is our data: each array will become a column, but isn't quite one yet.
# Notice an array doesn't even have a column name; they are kind of like
# vectors.
data <- list(
  pa$array(list(1, 2, 3, 4)),
  pa$array(list("foo", "bar", "baz", "hi")),
  pa$array(list(TRUE, TRUE, FALSE, TRUE))
)
# Create a Record Batch from these arrays.
# It's kind of like a table, but not quite.
# Notice we also add column names at this point! Yay metadata.
batch <- pa$RecordBatch$from_arrays(data, list("f0", "f1", "f2"))

# This is structured like a table, and knows about cols and rows:
# three arrays went in (columns), each holding four values (rows).
batch$num_columns
#> 3
batch$num_rows
#> 4
# Make 5 of them so we can test making 1 Table from multiple Record Batches.
# The same batch is simply repeated five times.
batches <- list(batch, batch, batch, batch, batch)

# Convert the 5 Record Batches into 1 Arrow Table.
table <- pa$Table$from_batches(batches)

# The print method here shows the Schema for the Table
# (the metadata for the table: column names and types).
table
#> pyarrow.Table
#> f0: double
#> f1: string
#> f2: bool

# Still a Python object on the R side, not an R data structure.
class(table)
#> [1] "pyarrow.lib.Table" "python.builtin.object"
# Now convert the Arrow Table -> Pandas DataFrame.
table_pd <- table$to_pandas()

# Printing shows 20 rows: the 4-row batch repeated 5 times, with pandas'
# 0-based row index.
table_pd
#> f0 f1 f2
#> 0 1.0 foo True
#> 1 2.0 bar True
#> 2 3.0 baz False
#> 3 4.0 hi True
#> 4 1.0 foo True
#> 5 2.0 bar True
#> 6 3.0 baz False
#> 7 4.0 hi True
#> 8 1.0 foo True
#> 9 2.0 bar True
#> 10 3.0 baz False
#> 11 4.0 hi True
#> 12 1.0 foo True
#> 13 2.0 bar True
#> 14 3.0 baz False
#> 15 4.0 hi True
#> 16 1.0 foo True
#> 17 2.0 bar True
#> 18 3.0 baz False
#> 19 4.0 hi True

# The result is still a Python (pandas) object, not an R data.frame.
class(table_pd)
#> [1] "pandas.core.frame.DataFrame"
#> [2] "pandas.core.generic.NDFrame"
#> [3] "pandas.core.base.PandasObject"
#> [4] "pandas.core.base.StringMixin"
#> [5] "pandas.core.accessor.DirNamesMixin"
#> [6] "pandas.core.base.SelectionMixin"
#> [7] "python.builtin.object"
# reticulate knows how to turn a Pandas DF into an R data.frame!
table_df <- py_to_r(table_pd)

# Same 20 rows as the pandas frame, now with R's 1-based row names and
# TRUE/FALSE logicals.
table_df
#> f0 f1 f2
#> 1 1 foo TRUE
#> 2 2 bar TRUE
#> 3 3 baz FALSE
#> 4 4 hi TRUE
#> 5 1 foo TRUE
#> 6 2 bar TRUE
#> 7 3 baz FALSE
#> 8 4 hi TRUE
#> 9 1 foo TRUE
#> 10 2 bar TRUE
#> 11 3 baz FALSE
#> 12 4 hi TRUE
#> 13 1 foo TRUE
#> 14 2 bar TRUE
#> 15 3 baz FALSE
#> 16 4 hi TRUE
#> 17 1 foo TRUE
#> 18 2 bar TRUE
#> 19 3 baz FALSE
#> 20 4 hi TRUE

# A plain R data.frame at this point.
class(table_df)
#> [1] "data.frame"

#' Created on 2018-04-19 by the [reprex package](http://reprex.tidyverse.org) (v0.2.0).
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment