# DavisVaughan/pyarrow-python-r.r

## pyarrow-python-r.r
library(reticulate)
pa <- import("pyarrow", convert = FALSE)

# Step 1: build three unnamed pyarrow arrays (numeric, string, logical), one
# per future column. At this point they are bare, vector-like arrays with no
# column names attached.
data <- list(
  pa$array(list(1, 2, 3, 4)),
  pa$array(list("foo", "bar", "baz", "hi")),
  pa$array(list(TRUE, TRUE, FALSE, TRUE))
)

# Step 2: combine the arrays into a single Record Batch. The column names
# "f0", "f1", "f2" are supplied here, as the second argument.
batch <- pa$RecordBatch$from_arrays(
  data,
  list("f0", "f1", "f2")
)

# The Record Batch is table-like: it reports its own column and row counts.
batch$num_columns
#> 3
batch$num_rows
#> 4

# Step 3: repeat the same Record Batch five times so there are several
# batches to stitch together.
batches <- rep(list(batch), times = 5L)

# Step 4: build one Arrow Table out of the five Record Batches.
table <- pa$Table$from_batches(batches)

# Printing the Table displays its schema, i.e. each column's name and type.
table
#> pyarrow.Table
#> f0: double
#> f1: string
#> f2: bool


class(table)
#> [1] "pyarrow.lib.Table"     "python.builtin.object"

# Step 5: convert the Arrow Table into a pandas DataFrame by calling the
# Table's to_pandas() method. pyarrow was imported with `convert = FALSE`, and
# the result is still a Python object (see the class() output below).
table_pd <- table$to_pandas()

# Printed pandas-style with a 0-based row index: 5 batches x 4 rows = 20 rows.
table_pd
#>      f0   f1     f2
#> 0   1.0  foo   True
#> 1   2.0  bar   True
#> 2   3.0  baz  False
#> 3   4.0   hi   True
#> 4   1.0  foo   True
#> 5   2.0  bar   True
#> 6   3.0  baz  False
#> 7   4.0   hi   True
#> 8   1.0  foo   True
#> 9   2.0  bar   True
#> 10  3.0  baz  False
#> 11  4.0   hi   True
#> 12  1.0  foo   True
#> 13  2.0  bar   True
#> 14  3.0  baz  False
#> 15  4.0   hi   True
#> 16  1.0  foo   True
#> 17  2.0  bar   True
#> 18  3.0  baz  False
#> 19  4.0   hi   True


# The class vector lists the pandas DataFrame class hierarchy as seen from R.
class(table_pd)
#> [1] "pandas.core.frame.DataFrame"
#> [2] "pandas.core.generic.NDFrame"
#> [3] "pandas.core.base.PandasObject"
#> [4] "pandas.core.base.StringMixin"
#> [5] "pandas.core.accessor.DirNamesMixin"
#> [6] "pandas.core.base.SelectionMixin"
#> [7] "python.builtin.object"


# Step 6: hand the pandas DataFrame to reticulate's py_to_r(), which returns
# a native R data.frame (confirmed by the class() output below).
table_df <- py_to_r(table_pd)

# The same 20 rows, now printed R-style with 1-based row names.
table_df
#>    f0  f1    f2
#> 1   1 foo  TRUE
#> 2   2 bar  TRUE
#> 3   3 baz FALSE
#> 4   4  hi  TRUE
#> 5   1 foo  TRUE
#> 6   2 bar  TRUE
#> 7   3 baz FALSE
#> 8   4  hi  TRUE
#> 9   1 foo  TRUE
#> 10  2 bar  TRUE
#> 11  3 baz FALSE
#> 12  4  hi  TRUE
#> 13  1 foo  TRUE
#> 14  2 bar  TRUE
#> 15  3 baz FALSE
#> 16  4  hi  TRUE
#> 17  1 foo  TRUE
#> 18  2 bar  TRUE
#> 19  3 baz FALSE
#> 20  4  hi  TRUE

# No Python classes remain: the object is a plain R data.frame.
class(table_df)
#> [1] "data.frame"

#' Created on 2018-04-19 by the [reprex package](http://reprex.tidyverse.org) (v0.2.0).
	library(reticulate)
	pa <- import("pyarrow", convert = FALSE)

	# Step 1: build three unnamed pyarrow arrays (numeric, string, logical), one
	# per future column. At this point they are bare, vector-like arrays with no
	# column names attached.
	data <- list(
	  pa$array(list(1, 2, 3, 4)),
	  pa$array(list("foo", "bar", "baz", "hi")),
	  pa$array(list(TRUE, TRUE, FALSE, TRUE))
	)

	# Step 2: combine the arrays into a single Record Batch. The column names
	# "f0", "f1", "f2" are supplied here, as the second argument.
	batch <- pa$RecordBatch$from_arrays(
	  data,
	  list("f0", "f1", "f2")
	)

	# The Record Batch is table-like: it reports its own column and row counts.
	batch$num_columns
	#> 3
	batch$num_rows
	#> 4

	# Step 3: repeat the same Record Batch five times so there are several
	# batches to stitch together.
	batches <- rep(list(batch), times = 5L)

	# Step 4: build one Arrow Table out of the five Record Batches.
	table <- pa$Table$from_batches(batches)

	# Printing the Table displays its schema, i.e. each column's name and type.
	table
	#> pyarrow.Table
	#> f0: double
	#> f1: string
	#> f2: bool


	class(table)
	#> [1] "pyarrow.lib.Table"     "python.builtin.object"

	# Step 5: convert the Arrow Table into a pandas DataFrame by calling the
	# Table's to_pandas() method. pyarrow was imported with `convert = FALSE`, and
	# the result is still a Python object (see the class() output below).
	table_pd <- table$to_pandas()

	# Printed pandas-style with a 0-based row index: 5 batches x 4 rows = 20 rows.
	table_pd
	#>      f0   f1     f2
	#> 0   1.0  foo   True
	#> 1   2.0  bar   True
	#> 2   3.0  baz  False
	#> 3   4.0   hi   True
	#> 4   1.0  foo   True
	#> 5   2.0  bar   True
	#> 6   3.0  baz  False
	#> 7   4.0   hi   True
	#> 8   1.0  foo   True
	#> 9   2.0  bar   True
	#> 10  3.0  baz  False
	#> 11  4.0   hi   True
	#> 12  1.0  foo   True
	#> 13  2.0  bar   True
	#> 14  3.0  baz  False
	#> 15  4.0   hi   True
	#> 16  1.0  foo   True
	#> 17  2.0  bar   True
	#> 18  3.0  baz  False
	#> 19  4.0   hi   True


	# The class vector lists the pandas DataFrame class hierarchy as seen from R.
	class(table_pd)
	#> [1] "pandas.core.frame.DataFrame"
	#> [2] "pandas.core.generic.NDFrame"
	#> [3] "pandas.core.base.PandasObject"
	#> [4] "pandas.core.base.StringMixin"
	#> [5] "pandas.core.accessor.DirNamesMixin"
	#> [6] "pandas.core.base.SelectionMixin"
	#> [7] "python.builtin.object"


	# Step 6: hand the pandas DataFrame to reticulate's py_to_r(), which returns
	# a native R data.frame (confirmed by the class() output below).
	table_df <- py_to_r(table_pd)

	# The same 20 rows, now printed R-style with 1-based row names.
	table_df
	#>    f0  f1    f2
	#> 1   1 foo  TRUE
	#> 2   2 bar  TRUE
	#> 3   3 baz FALSE
	#> 4   4  hi  TRUE
	#> 5   1 foo  TRUE
	#> 6   2 bar  TRUE
	#> 7   3 baz FALSE
	#> 8   4  hi  TRUE
	#> 9   1 foo  TRUE
	#> 10  2 bar  TRUE
	#> 11  3 baz FALSE
	#> 12  4  hi  TRUE
	#> 13  1 foo  TRUE
	#> 14  2 bar  TRUE
	#> 15  3 baz FALSE
	#> 16  4  hi  TRUE
	#> 17  1 foo  TRUE
	#> 18  2 bar  TRUE
	#> 19  3 baz FALSE
	#> 20  4  hi  TRUE

	# No Python classes remain: the object is a plain R data.frame.
	class(table_df)
	#> [1] "data.frame"

	#' Created on 2018-04-19 by the [reprex package](http://reprex.tidyverse.org) (v0.2.0).