knapply/json-benching.Rmd

## json-benching.md

      
    Raw
  

              json-benching.md
            
          
    JSON Benching

Setup Python

library(reticulate)
library(RcppSimdJson)
py_install(packages = "pysimdjson", pip = TRUE)
Get Data

json_url <- "https://gist.githubusercontent.com/vizowl/f9f2ef6c6221e28b103c66d7afc77985/raw/11b05a5cc921373d56f7d9b13b4f88f32aed3c4f/sample.json"

temp_file <- tempfile(fileext = ".json")
download.file(json_url, temp_file)
R

#' Load part of a JSON file and flatten each element of the result
#'
#' @param file_path Path of the JSON file handed to `fload()`.
#' @param query Passed through as `fload()`'s `query` argument.
#' @return The `fload()` result with every element replaced by its unlisted,
#'   unnamed contents; assigning through `[]` keeps the container's own
#'   attributes.
load_and_unlist <- function(file_path, query = "Items") {
  parsed <- fload(file_path, query = query)
  flattened <- lapply(
    parsed,
    function(column) unlist(column, use.names = FALSE)
  )
  parsed[] <- flattened
  parsed
}
Python

import simdjson
import pandas as pd

def load_and_deserialize(file_path):
  """Parse a JSON file and return the value under "/Items" as a list.

  Args:
    file_path: Path of the JSON file passed to ``simdjson.Parser.load``.

  Returns:
    The result of ``as_list()`` on the value found at JSON pointer ``/Items``.
  """
  parser = simdjson.Parser()
  document = parser.load(file_path)
  items = document.at_pointer("/Items")
  return items.as_list()

load_and_deserialize(r.temp_file)[0]
## {'what': {'S': 'HOUSING'}, 'mobile': {'S': 'false'}, 'rating': {'N': '5'}, 'client': {'S': '258acb41-5dc2-4d74-9213-3045c17fb5ec'}, 'timestamp': {'N': '1601452859025'}, 'sourceIp': {'S': '---'}, 'tag': {'S': 'debate-20200930'}, 'id': {'S': '37eff5a8-a9c1-42b4-af4a-ee11096a95df'}, 'who': {'S': 'Judith Collins'}}

def load_deserialize_and_normalize(file_path):
  """Parse a JSON file and flatten the value under "/Items" into a DataFrame.

  Args:
    file_path: Path of the JSON file passed to ``simdjson.Parser.load``.

  Returns:
    ``pd.json_normalize`` applied, with ``max_level=1``, to the list obtained
    from the value at JSON pointer ``/Items``.
  """
  parser = simdjson.Parser()
  items = parser.load(file_path).at_pointer("/Items")
  records = items.as_list()
  return pd.json_normalize(records, max_level=1)

py_df = load_deserialize_and_normalize(r.temp_file)
py_df
##              what.S  ...           who.S
## 0           HOUSING  ...  Judith Collins
## 1           ECONOMY  ...  Jacinda Ardern
## 2            HEALTH  ...  Jacinda Ardern
## 3            HEALTH  ...  Judith Collins
## 4           ECONOMY  ...  Jacinda Ardern
## ..              ...  ...             ...
## 995       EDUCATION  ...  Judith Collins
## 996  COVID RESPONSE  ...  Judith Collins
## 997          HEALTH  ...  Judith Collins
## 998          HEALTH  ...  Jacinda Ardern
## 999  COVID RESPONSE  ...  Jacinda Ardern
## 
## [1000 rows x 9 columns]

Send py_df to R

py_df <- py$py_df
tibble::as_tibble(py_df) # {tibble}'s just to pretty print
## # A tibble: 1,000 x 9
##    what.S    mobile.S rating.N client.S            timestamp.N  sourceIp.S tag.S     id.S             who.S   
##    <chr>     <chr>    <chr>    <chr>               <chr>        <chr>      <chr>     <chr>            <chr>   
##  1 HOUSING   false    5        258acb41-5dc2-4d74… 16014528590… ---        debate-2… 37eff5a8-a9c1-4… Judith …
##  2 ECONOMY   false    1        4453930e-46df-4694… 16014513004… ---        debate-2… c4604411-6605-4… Jacinda…
##  3 HEALTH    false    2        e643ce11-6228-4f24… 16014551903… ---        debate-2… 1f3fdfb4-fed0-4… Jacinda…
##  4 HEALTH    false    5        8e5c1606-4c9d-40a3… 16014511616… ---        debate-2… d565df23-e6da-4… Judith …
##  5 ECONOMY   false    5        8af58b60-1241-43a8… 16014683275… ---        debate-2… 94a0a462-129e-4… Jacinda…
##  6 COVID RE… false    5        1d5b7748-4251-49f9… 16014558115… ---        debate-2… 86edfca2-16bd-4… Judith …
##  7 EDUCATION false    1        d5e79713-7c77-4f38… 16014961883… ---        debate-2… aa3fcd3f-b070-4… Judith …
##  8 COVID RE… false    3        744291c7-6a8c-4f9e… 16014759904… ---        debate-2… 6e390d17-9b20-4… Judith …
##  9 ECONOMY   false    5        34d9a2b2-d230-485d… 16014521415… ---        debate-2… c2db5910-5831-4… Judith …
## 10 HOUSING   false    1        b27f66be-93cf-4b4c… 16014576553… ---        debate-2… 15f9f1e2-c164-4… Jacinda…
## # … with 990 more rows

Benchmarks

Only Python

import timeit

temp_file = r.temp_file


# just load and deserialize JSON
timeit.Timer('load_and_deserialize(temp_file)', 
             setup="from __main__ import load_and_deserialize, temp_file") \
             .timeit(number=1) * 1000 # milliseconds
## 3.349987993715331

# load, deserialize, and "normalize" into a data frame
timeit.Timer('load_deserialize_and_normalize(temp_file)', 
             setup="from __main__ import load_deserialize_and_normalize, temp_file") \
             .timeit(number=1) * 1000
## 71.6570360091282

Combo

res <- microbenchmark::microbenchmark(
  R_clean = load_and_unlist(temp_file),
  R = fload(temp_file, query = "Items")
  ,
  Py_clean = py_run_string("load_deserialize_and_normalize(temp_file)", convert = FALSE),
  Py = py_run_string("load_and_deserialize(temp_file)", convert = FALSE)
)

print(res, order = "median")
## Unit: milliseconds
##      expr       min        lq      mean    median        uq      max neval
##         R  2.924479  3.297000  3.898203  3.445144  3.733756 16.17030   100
##   R_clean  3.309015  3.640663  4.135215  3.763053  4.141021 16.65119   100
##        Py  3.860174  4.363253  5.287088  4.773239  5.513256 13.49291   100
##  Py_clean 62.174967 64.821467 67.286170 66.720669 69.029512 76.82542   100

print(res, order = "median", unit = "relative")
## Unit: relative
##      expr       min        lq      mean    median        uq       max neval
##         R  1.000000  1.000000  1.000000  1.000000  1.000000 1.0000000   100
##   R_clean  1.131489  1.104235  1.060800  1.092277  1.109076 1.0297392   100
##        Py  1.319953  1.323401  1.356289  1.385498  1.476598 0.8344259   100
##  Py_clean 21.260186 19.660742 17.260819 19.366584 18.487955 4.7510214   100

ggplot2::autoplot(res, log = TRUE)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.


Sanity Check

r_df <- load_and_unlist(temp_file)
py_df <- py$py_df

attributes(py_df) <- attributes(r_df)

stopifnot(
  identical(py_df, r_df)
)

  
## json-benching.Rmd
---
title: "JSON Benching"
output: github_document
editor_options:
  chunk_output_type: console
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
options(width = 110)
```

## Setup Python

```{r}
library(reticulate)
library(RcppSimdJson)
```


```{r, eval=FALSE}
py_install(packages = "pysimdjson", pip = TRUE)
```

## Get Data

```{r}
# URL of the sample JSON file used throughout this document.
json_url <- "https://gist.githubusercontent.com/vizowl/f9f2ef6c6221e28b103c66d7afc77985/raw/11b05a5cc921373d56f7d9b13b4f88f32aed3c4f/sample.json"

# Download it once into a temporary ".json" file. `temp_file` is the path the
# later R chunks read from, and the Python chunks access it as `r.temp_file`.
temp_file <- tempfile(fileext = ".json")
download.file(json_url, temp_file)
```

## R

```{r}
#' Load part of a JSON file and flatten each element of the result
#'
#' @param file_path Path of the JSON file handed to `fload()`.
#' @param query Passed through as `fload()`'s `query` argument.
#' @return The `fload()` result with every element replaced by its unlisted,
#'   unnamed contents; assigning through `[]` keeps the container's own
#'   attributes.
load_and_unlist <- function(file_path, query = "Items") {
  parsed <- fload(file_path, query = query)
  flattened <- lapply(
    parsed,
    function(column) unlist(column, use.names = FALSE)
  )
  parsed[] <- flattened
  parsed
}
```

## Python

```{python}
import simdjson
import pandas as pd

def load_and_deserialize(file_path):
  """Parse a JSON file and return the value under "/Items" as a list.

  Args:
    file_path: Path of the JSON file passed to ``simdjson.Parser.load``.

  Returns:
    The result of ``as_list()`` on the value found at JSON pointer ``/Items``.
  """
  parser = simdjson.Parser()
  document = parser.load(file_path)
  items = document.at_pointer("/Items")
  return items.as_list()

load_and_deserialize(r.temp_file)[0]


def load_deserialize_and_normalize(file_path):
  """Parse a JSON file and flatten the value under "/Items" into a DataFrame.

  Args:
    file_path: Path of the JSON file passed to ``simdjson.Parser.load``.

  Returns:
    ``pd.json_normalize`` applied, with ``max_level=1``, to the list obtained
    from the value at JSON pointer ``/Items``.
  """
  parser = simdjson.Parser()
  items = parser.load(file_path).at_pointer("/Items")
  records = items.as_list()
  return pd.json_normalize(records, max_level=1)

py_df = load_deserialize_and_normalize(r.temp_file)
py_df
```

### Send `py_df` to R

```{r}
# Fetch the Python-side global `py_df` into the R session through
# reticulate's `py` object, then print it.
py_df <- py$py_df
tibble::as_tibble(py_df) # {tibble}'s just to pretty print
```

## Benchmarks

### Only Python

```{python}
import timeit

# Copy the R variable `temp_file` into a Python global so the timed statements
# and the `from __main__ import ...` setup strings can refer to it by name.
temp_file = r.temp_file


# just load and deserialize JSON
# `number=1` executes the statement a single time; timeit reports seconds,
# hence the multiplication by 1000.
timeit.Timer('load_and_deserialize(temp_file)',
             setup="from __main__ import load_and_deserialize, temp_file") \
             .timeit(number=1) * 1000 # milliseconds
```

```{python}
# load, deserialize, and "normalize" into a data frame
# Single execution (`number=1`); the result in seconds is scaled to
# milliseconds by the trailing `* 1000`.
timeit.Timer('load_deserialize_and_normalize(temp_file)',
             setup="from __main__ import load_deserialize_and_normalize, temp_file") \
             .timeit(number=1) * 1000
```

### Combo

```{r}
# Time four expressions side by side with microbenchmark:
#   R_clean  - parse in R and unlist every element (load_and_unlist)
#   R        - parse in R only (fload)
#   Py_clean - parse and json_normalize in Python, run through reticulate
#   Py       - parse in Python only, run through reticulate
# The Python snippets refer to the Python-side global `temp_file`, which is
# assigned from `r.temp_file` in the "Only Python" chunk.
# NOTE(review): `convert = FALSE` presumably keeps Python-to-R conversion out
# of the Py timings -- confirm against the reticulate documentation.
res <- microbenchmark::microbenchmark(
  R_clean = load_and_unlist(temp_file),
  R = fload(temp_file, query = "Items")
  ,
  Py_clean = py_run_string("load_deserialize_and_normalize(temp_file)", convert = FALSE),
  Py = py_run_string("load_and_deserialize(temp_file)", convert = FALSE)
)

# Print the summary ordered by median, first in time units and then in
# relative units, and finally plot the timings on a log scale.
print(res, order = "median")
print(res, order = "median", unit = "relative")
ggplot2::autoplot(res, log = TRUE)
```

## Sanity Check

```{r}
# r_df is parsed afresh in R; py_df is the data frame already built in the
# Python session, fetched through reticulate's `py` object.
r_df <- load_and_unlist(temp_file)
py_df <- py$py_df

# Replace py_df's attributes wholesale with r_df's so the identical() check
# below cannot fail on attribute differences and effectively compares only the
# underlying contents.
# NOTE(review): this also hides any mismatch in column names between the two
# results -- confirm that is intended.
attributes(py_df) <- attributes(r_df)

# Raise an error if the R and Python results differ.
stopifnot(
  identical(py_df, r_df)
)
```
	---
	title: "JSON Benching"
	output: github_document
	editor_options:
	chunk_output_type: console
	---

	```{r setup, include=FALSE}
	knitr::opts_chunk$set(echo = TRUE)
	options(width = 110)
	```

	## Setup Python

	```{r}
	library(reticulate)
	library(RcppSimdJson)
	```


	```{r, eval=FALSE}
	py_install(packages = "pysimdjson", pip = TRUE)
	```

	## Get Data

	```{r}
	json_url <- "https://gist.githubusercontent.com/vizowl/f9f2ef6c6221e28b103c66d7afc77985/raw/11b05a5cc921373d56f7d9b13b4f88f32aed3c4f/sample.json"

	temp_file <- tempfile(fileext = ".json")
	download.file(json_url, temp_file)
	```

	## R

	```{r}
	#' Load part of a JSON file and flatten each element of the result
	#'
	#' @param file_path Path of the JSON file handed to `fload()`.
	#' @param query Passed through as `fload()`'s `query` argument.
	#' @return The `fload()` result with every element replaced by its unlisted,
	#'   unnamed contents; assigning through `[]` keeps the container's own
	#'   attributes.
	load_and_unlist <- function(file_path, query = "Items") {
	  parsed <- fload(file_path, query = query)
	  flattened <- lapply(
	    parsed,
	    function(column) unlist(column, use.names = FALSE)
	  )
	  parsed[] <- flattened
	  parsed
	}
	```

	## Python

	```{python}
	import simdjson
	import pandas as pd

	def load_and_deserialize(file_path):
	  """Parse a JSON file and return the value under "/Items" as a list.

	  Args:
	    file_path: Path of the JSON file passed to ``simdjson.Parser.load``.

	  Returns:
	    The result of ``as_list()`` on the value found at JSON pointer ``/Items``.
	  """
	  # The body must be indented below `def`; without it Python rejects the
	  # function with an IndentationError.
	  parser = simdjson.Parser()
	  document = parser.load(file_path)
	  items = document.at_pointer("/Items")
	  return items.as_list()

	load_and_deserialize(r.temp_file)[0]


	def load_deserialize_and_normalize(file_path):
	  """Parse a JSON file and flatten the value under "/Items" into a DataFrame.

	  Args:
	    file_path: Path of the JSON file passed to ``simdjson.Parser.load``.

	  Returns:
	    ``pd.json_normalize`` applied, with ``max_level=1``, to the list obtained
	    from the value at JSON pointer ``/Items``.
	  """
	  # The body must be indented below `def`; without it Python rejects the
	  # function with an IndentationError.
	  parser = simdjson.Parser()
	  items = parser.load(file_path).at_pointer("/Items")
	  records = items.as_list()
	  return pd.json_normalize(records, max_level=1)

	py_df = load_deserialize_and_normalize(r.temp_file)
	py_df
	```

	### Send `py_df` to R

	```{r}
	py_df <- py$py_df
	tibble::as_tibble(py_df) # {tibble}'s just to pretty print
	```

	## Benchmarks

	### Only Python

	```{python}
	import timeit

	temp_file = r.temp_file


	# just load and deserialize JSON
	timeit.Timer('load_and_deserialize(temp_file)',
	setup="from __main__ import load_and_deserialize, temp_file") \
	.timeit(number=1) * 1000 # milliseconds
	```

	```{python}
	# load, deserialize, and "normalize" into a data frame
	timeit.Timer('load_deserialize_and_normalize(temp_file)',
	setup="from __main__ import load_deserialize_and_normalize, temp_file") \
	.timeit(number=1) * 1000
	```

	### Combo

	```{r}
	res <- microbenchmark::microbenchmark(
	R_clean = load_and_unlist(temp_file),
	R = fload(temp_file, query = "Items")
	,
	Py_clean = py_run_string("load_deserialize_and_normalize(temp_file)", convert = FALSE),
	Py = py_run_string("load_and_deserialize(temp_file)", convert = FALSE)
	)

	print(res, order = "median")
	print(res, order = "median", unit = "relative")
	ggplot2::autoplot(res, log = TRUE)
	```

	## Sanity Check

	```{r}
	r_df <- load_and_unlist(temp_file)
	py_df <- py$py_df

	attributes(py_df) <- attributes(r_df)

	stopifnot(
	identical(py_df, r_df)
	)
	```