Skip to content

Instantly share code, notes, and snippets.

@DavisVaughan
Last active April 28, 2018 17:01
Show Gist options
  • Save DavisVaughan/fbef89e5b511b25d4698813a6ee0eb30 to your computer and use it in GitHub Desktop.
Save DavisVaughan/fbef89e5b511b25d4698813a6ee0eb30 to your computer and use it in GitHub Desktop.
Exploring Apache Arrow with pyarrow and reticulate
library(reticulate)
# Keep Python objects unconverted so we can work with the Arrow objects
# directly and only convert to R explicitly at the very end.
pa <- import("pyarrow", convert = FALSE)

# Build one pyarrow array per future column.
# At this stage they behave more like bare vectors than columns: each one
# carries values only, with no column name attached.
num_values <- list(1, 2, 3, 4)
chr_values <- list("foo", "bar", "baz", "hi")
lgl_values <- list(TRUE, TRUE, FALSE, TRUE)
arrays <- list(
  pa$array(num_values),
  pa$array(chr_values),
  pa$array(lgl_values)
)

# Assemble the arrays into a Record Batch.
# It's kind of like a table, but not quite.
# The column names (metadata) are attached at this step.
col_names <- list("f0", "f1", "f2")
batch <- pa$RecordBatch$from_arrays(arrays, col_names)

# The batch is structured like a table and knows about columns and rows
batch$num_columns
#> 3
batch$num_rows
#> 4

# Repeat the same batch 5 times so we can test building 1 Table from
# multiple Record Batches
batches <- rep(list(batch), times = 5L)

# Combine the 5 Record Batches into 1 Arrow Table
arrow_table <- pa$Table$from_batches(batches)

# Printing the Table shows its Schema
# (the metadata for the table: column names and types)
arrow_table
#> pyarrow.Table
#> f0: double
#> f1: string
#> f2: bool
class(arrow_table)
#> [1] "pyarrow.lib.Table" "python.builtin.object"

# Arrow Table -> Pandas DataFrame
table_pd <- arrow_table$to_pandas()
table_pd
#> f0 f1 f2
#> 0 1.0 foo True
#> 1 2.0 bar True
#> 2 3.0 baz False
#> 3 4.0 hi True
#> 4 1.0 foo True
#> 5 2.0 bar True
#> 6 3.0 baz False
#> 7 4.0 hi True
#> 8 1.0 foo True
#> 9 2.0 bar True
#> 10 3.0 baz False
#> 11 4.0 hi True
#> 12 1.0 foo True
#> 13 2.0 bar True
#> 14 3.0 baz False
#> 15 4.0 hi True
#> 16 1.0 foo True
#> 17 2.0 bar True
#> 18 3.0 baz False
#> 19 4.0 hi True
class(table_pd)
#> [1] "pandas.core.frame.DataFrame"
#> [2] "pandas.core.generic.NDFrame"
#> [3] "pandas.core.base.PandasObject"
#> [4] "pandas.core.base.StringMixin"
#> [5] "pandas.core.accessor.DirNamesMixin"
#> [6] "pandas.core.base.SelectionMixin"
#> [7] "python.builtin.object"

# Pandas DataFrame -> R data.frame (reticulate knows how to do this)
table_df <- py_to_r(table_pd)
table_df
#> f0 f1 f2
#> 1 1 foo TRUE
#> 2 2 bar TRUE
#> 3 3 baz FALSE
#> 4 4 hi TRUE
#> 5 1 foo TRUE
#> 6 2 bar TRUE
#> 7 3 baz FALSE
#> 8 4 hi TRUE
#> 9 1 foo TRUE
#> 10 2 bar TRUE
#> 11 3 baz FALSE
#> 12 4 hi TRUE
#> 13 1 foo TRUE
#> 14 2 bar TRUE
#> 15 3 baz FALSE
#> 16 4 hi TRUE
#> 17 1 foo TRUE
#> 18 2 bar TRUE
#> 19 3 baz FALSE
#> 20 4 hi TRUE
class(table_df)
#> [1] "data.frame"
#' Created on 2018-04-19 by the [reprex package](http://reprex.tidyverse.org) (v0.2.0).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment