Skip to content

Instantly share code, notes, and snippets.

@knapply
Last active October 2, 2020 22:32
Show Gist options
  • Save knapply/6166872cc86806c0192e5d68582b8488 to your computer and use it in GitHub Desktop.
Save knapply/6166872cc86806c0192e5d68582b8488 to your computer and use it in GitHub Desktop.

JSON Benching

Setup Python

# reticulate supplies the R <-> Python bridge used below (py_install, py,
# py_run_string); RcppSimdJson supplies fload().
library(reticulate)
library(RcppSimdJson)
# Install the pysimdjson Python package via pip so the Python snippets can
# `import simdjson`.
py_install(packages = "pysimdjson", pip = TRUE)

Get Data

# Raw gist URL of the sample JSON document used for every benchmark below.
json_url <- "https://gist.githubusercontent.com/vizowl/f9f2ef6c6221e28b103c66d7afc77985/raw/11b05a5cc921373d56f7d9b13b4f88f32aed3c4f/sample.json"

# Download once into a temp file with a .json extension; the same path is
# read from R (temp_file) and from Python (r.temp_file).
temp_file <- tempfile(fileext = ".json")
download.file(json_url, temp_file)

R

# Parse the part of a JSON file selected by `query` and flatten each element.
#
# file_path: path to the JSON file, passed straight to fload().
# query:     forwarded to fload() as its `query` argument (default "Items").
#
# Returns the object produced by fload() with every element replaced by its
# unlist()-ed, unnamed contents.
load_and_unlist <- function(file_path, query = "Items") {
  parsed <- fload(file_path, query = query)
  flattened <- lapply(parsed, function(column) unlist(column, use.names = FALSE))
  # `[]<-` swaps the elements while keeping the container itself (and the
  # attributes it carries) intact.
  parsed[] <- flattened
  parsed
}

Python

import simdjson
import pandas as pd

def load_and_deserialize(file_path):
    """Parse the JSON file at ``file_path`` and return its "/Items" value as a list.

    A new ``simdjson.Parser`` is created on every call; the value found at the
    JSON pointer "/Items" is converted with ``as_list()`` before returning.
    """
    json_parser = simdjson.Parser()
    document = json_parser.load(file_path)
    items = document.at_pointer("/Items")
    return items.as_list()

load_and_deserialize(r.temp_file)[0]
## {'what': {'S': 'HOUSING'}, 'mobile': {'S': 'false'}, 'rating': {'N': '5'}, 'client': {'S': '258acb41-5dc2-4d74-9213-3045c17fb5ec'}, 'timestamp': {'N': '1601452859025'}, 'sourceIp': {'S': '---'}, 'tag': {'S': 'debate-20200930'}, 'id': {'S': '37eff5a8-a9c1-42b4-af4a-ee11096a95df'}, 'who': {'S': 'Judith Collins'}}
def load_deserialize_and_normalize(file_path):
    """Parse the JSON file at ``file_path`` and flatten "/Items" into a DataFrame.

    The value at the JSON pointer "/Items" is converted to a list and handed to
    ``pd.json_normalize`` with ``max_level=1``, so one level of nesting is
    flattened into dotted column names (e.g. ``what.S``).
    """
    json_parser = simdjson.Parser()
    items = json_parser.load(file_path).at_pointer("/Items")
    records = items.as_list()
    return pd.json_normalize(records, max_level=1)

py_df = load_deserialize_and_normalize(r.temp_file)
py_df
##              what.S  ...           who.S
## 0           HOUSING  ...  Judith Collins
## 1           ECONOMY  ...  Jacinda Ardern
## 2            HEALTH  ...  Jacinda Ardern
## 3            HEALTH  ...  Judith Collins
## 4           ECONOMY  ...  Jacinda Ardern
## ..              ...  ...             ...
## 995       EDUCATION  ...  Judith Collins
## 996  COVID RESPONSE  ...  Judith Collins
## 997          HEALTH  ...  Judith Collins
## 998          HEALTH  ...  Jacinda Ardern
## 999  COVID RESPONSE  ...  Jacinda Ardern
## 
## [1000 rows x 9 columns]

Send py_df to R

py_df <- py$py_df
tibble::as_tibble(py_df) # {tibble}'s just to pretty print
## # A tibble: 1,000 x 9
##    what.S    mobile.S rating.N client.S            timestamp.N  sourceIp.S tag.S     id.S             who.S   
##    <chr>     <chr>    <chr>    <chr>               <chr>        <chr>      <chr>     <chr>            <chr>   
##  1 HOUSING   false    5        258acb41-5dc2-4d74… 16014528590… ---        debate-2… 37eff5a8-a9c1-4… Judith …
##  2 ECONOMY   false    1        4453930e-46df-4694… 16014513004… ---        debate-2… c4604411-6605-4… Jacinda…
##  3 HEALTH    false    2        e643ce11-6228-4f24… 16014551903… ---        debate-2… 1f3fdfb4-fed0-4… Jacinda…
##  4 HEALTH    false    5        8e5c1606-4c9d-40a3… 16014511616… ---        debate-2… d565df23-e6da-4… Judith …
##  5 ECONOMY   false    5        8af58b60-1241-43a8… 16014683275… ---        debate-2… 94a0a462-129e-4… Jacinda…
##  6 COVID RE… false    5        1d5b7748-4251-49f9… 16014558115… ---        debate-2… 86edfca2-16bd-4… Judith …
##  7 EDUCATION false    1        d5e79713-7c77-4f38… 16014961883… ---        debate-2… aa3fcd3f-b070-4… Judith …
##  8 COVID RE… false    3        744291c7-6a8c-4f9e… 16014759904… ---        debate-2… 6e390d17-9b20-4… Judith …
##  9 ECONOMY   false    5        34d9a2b2-d230-485d… 16014521415… ---        debate-2… c2db5910-5831-4… Judith …
## 10 HOUSING   false    1        b27f66be-93cf-4b4c… 16014576553… ---        debate-2… 15f9f1e2-c164-4… Jacinda…
## # … with 990 more rows

Benchmarks

Only Python

import timeit

# Copy the R-side path into a Python global so the timeit setup string below
# can import it from __main__.
temp_file = r.temp_file


# just load and deserialize JSON
# number=1 times a single call; timeit reports seconds, hence the * 1000.
timeit.Timer('load_and_deserialize(temp_file)', 
             setup="from __main__ import load_and_deserialize, temp_file") \
             .timeit(number=1) * 1000 # milliseconds
## 3.349987993715331
# load, deserialize, and "normalize" into a data frame
# number=1 times a single call; timeit reports seconds, so * 1000 gives
# milliseconds.
timeit.Timer('load_deserialize_and_normalize(temp_file)', 
             setup="from __main__ import load_deserialize_and_normalize, temp_file") \
             .timeit(number=1) * 1000
## 71.6570360091282

Combo

# Benchmark the R and Python loaders side by side from the R session.
# The Python entries run through py_run_string() and rely on the Python-side
# `temp_file` global assigned in the "Only Python" snippet.
res <- microbenchmark::microbenchmark(
  # R: parse "Items", then unlist every element.
  R_clean = load_and_unlist(temp_file),
  # R: parse "Items" only.
  R = fload(temp_file, query = "Items"),
  # Python: parse "/Items" and normalize into a pandas DataFrame.
  Py_clean = py_run_string(
    "load_deserialize_and_normalize(temp_file)",
    convert = FALSE
  ),
  # Python: parse "/Items" into a list only.
  Py = py_run_string(
    "load_and_deserialize(temp_file)",
    convert = FALSE
  )
)

print(res, order = "median")
## Unit: milliseconds
##      expr       min        lq      mean    median        uq      max neval
##         R  2.924479  3.297000  3.898203  3.445144  3.733756 16.17030   100
##   R_clean  3.309015  3.640663  4.135215  3.763053  4.141021 16.65119   100
##        Py  3.860174  4.363253  5.287088  4.773239  5.513256 13.49291   100
##  Py_clean 62.174967 64.821467 67.286170 66.720669 69.029512 76.82542   100
print(res, order = "median", unit = "relative")
## Unit: relative
##      expr       min        lq      mean    median        uq       max neval
##         R  1.000000  1.000000  1.000000  1.000000  1.000000 1.0000000   100
##   R_clean  1.131489  1.104235  1.060800  1.092277  1.109076 1.0297392   100
##        Py  1.319953  1.323401  1.356289  1.385498  1.476598 0.8344259   100
##  Py_clean 21.260186 19.660742 17.260819 19.366584 18.487955 4.7510214   100
ggplot2::autoplot(res, log = TRUE)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.

Sanity Check

# Rebuild the R result and fetch the Python `py_df` object through reticulate.
r_df <- load_and_unlist(temp_file)
py_df <- py$py_df

# Replace all of py_df's attributes with r_df's (e.g. names), so the check
# below compares only the underlying values, not the attributes.
attributes(py_df) <- attributes(r_df)

# Abort if the two loaders disagree.
stopifnot(
  identical(py_df, r_df)
)
---
title: "JSON Benching"
output: github_document
editor_options:
  chunk_output_type: console
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
options(width = 110)
```
## Setup Python
```{r}
library(reticulate)
library(RcppSimdJson)
```
```{r, eval=FALSE}
py_install(packages = "pysimdjson", pip = TRUE)
```
## Get Data
```{r}
# Raw gist URL of the sample JSON document used for every benchmark below.
json_url <- "https://gist.githubusercontent.com/vizowl/f9f2ef6c6221e28b103c66d7afc77985/raw/11b05a5cc921373d56f7d9b13b4f88f32aed3c4f/sample.json"
# Download once into a temp file with a .json extension; the same path is
# read from R (temp_file) and from Python (r.temp_file).
temp_file <- tempfile(fileext = ".json")
download.file(json_url, temp_file)
```
## R
```{r}
# Load the `query` portion of a JSON file with fload() and flatten each
# element with unlist(use.names = FALSE).
#
# file_path: path to the JSON file, passed straight to fload().
# query:     forwarded to fload() as its `query` argument (default "Items").
#
# Returns the fload() result with its elements replaced by unnamed,
# unlist()-ed values.
load_and_unlist <- function(file_path, query = "Items") {
  loaded <- fload(file_path, query = query)
  drop_nesting <- function(element) unlist(element, use.names = FALSE)
  # Assigning through `[]` keeps the original container and its attributes.
  loaded[] <- lapply(loaded, drop_nesting)
  loaded
}
```
## Python
```{python}
import simdjson
import pandas as pd
def load_and_deserialize(file_path):
    """Parse the JSON file at ``file_path`` and return its "/Items" value as a list.

    A new ``simdjson.Parser`` is created per call; the value at the JSON
    pointer "/Items" is converted with ``as_list()``.
    """
    # The function body must be indented under the def; without it Python
    # raises IndentationError.
    parser = simdjson.Parser()
    return parser.load(file_path).at_pointer("/Items").as_list()
load_and_deserialize(r.temp_file)[0]
def load_deserialize_and_normalize(file_path):
    """Parse the JSON file at ``file_path`` and flatten "/Items" into a DataFrame.

    The value at the JSON pointer "/Items" is converted to a list and passed to
    ``pd.json_normalize`` with ``max_level=1``, so only one level of nesting is
    expanded into dotted column names.
    """
    # The function body must be indented under the def; without it Python
    # raises IndentationError.
    parser = simdjson.Parser()
    parsed = parser.load(file_path).at_pointer("/Items").as_list()
    return pd.json_normalize(parsed, max_level=1)
py_df = load_deserialize_and_normalize(r.temp_file)
py_df
```
### Send `py_df` to R
```{r}
py_df <- py$py_df
tibble::as_tibble(py_df) # {tibble}'s just to pretty print
```
## Benchmarks
### Only Python
```{python}
import timeit
# Copy the R-side path into a Python global so the timeit setup string below
# can import it from __main__.
temp_file = r.temp_file
# just load and deserialize JSON
# number=1 times a single call; timeit reports seconds, hence the * 1000.
timeit.Timer('load_and_deserialize(temp_file)',
setup="from __main__ import load_and_deserialize, temp_file") \
.timeit(number=1) * 1000 # milliseconds
```
```{python}
# load, deserialize, and "normalize" into a data frame
# number=1 times a single call; timeit reports seconds, so * 1000 gives
# milliseconds.
timeit.Timer('load_deserialize_and_normalize(temp_file)',
setup="from __main__ import load_deserialize_and_normalize, temp_file") \
.timeit(number=1) * 1000
```
### Combo
```{r}
# Benchmark the R and Python loaders side by side from the R session.
# The Python entries run through py_run_string() and rely on the Python-side
# `temp_file` global assigned in the "Only Python" chunk.
res <- microbenchmark::microbenchmark(
  # R: parse "Items", then unlist every element.
  R_clean = load_and_unlist(temp_file),
  # R: parse "Items" only.
  R = fload(temp_file, query = "Items"),
  # Python: parse "/Items" and normalize into a pandas DataFrame.
  Py_clean = py_run_string(
    "load_deserialize_and_normalize(temp_file)",
    convert = FALSE
  ),
  # Python: parse "/Items" into a list only.
  Py = py_run_string(
    "load_and_deserialize(temp_file)",
    convert = FALSE
  )
)
print(res, order = "median")
print(res, order = "median", unit = "relative")
ggplot2::autoplot(res, log = TRUE)
```
## Sanity Check
```{r}
# Rebuild the R result and fetch the Python `py_df` object through reticulate.
r_df <- load_and_unlist(temp_file)
py_df <- py$py_df
# Replace all of py_df's attributes with r_df's (e.g. names), so the check
# below compares only the underlying values, not the attributes.
attributes(py_df) <- attributes(r_df)
# Abort if the two loaders disagree.
stopifnot(
identical(py_df, r_df)
)
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment