library(reticulate)
library(RcppSimdJson)
# Install the Python simdjson bindings with pip so that the Python chunks
# can `import simdjson`.
py_install("pysimdjson", pip = TRUE)

# Download the sample JSON into a temporary file; both the R and the Python
# code read from this path.
json_url <- "https://gist.githubusercontent.com/vizowl/f9f2ef6c6221e28b103c66d7afc77985/raw/11b05a5cc921373d56f7d9b13b4f88f32aed3c4f/sample.json"
temp_file <- tempfile(fileext = ".json")
download.file(url = json_url, destfile = temp_file)
#' Load part of a JSON file and flatten each of its elements
#'
#' @param file_path Path to the JSON file; passed straight to `fload()`.
#' @param query Query handed to `fload()` to select what is loaded.
#'   Defaults to `"Items"`.
#' @return The object returned by `fload()`, with every element replaced by
#'   its `unlist()`ed value (inner names dropped).
load_and_unlist <- function(file_path, query = "Items") {
  parsed <- fload(file_path, query = query)
  flatten_column <- function(column) unlist(column, use.names = FALSE)
  # Assigning through `[]` fills the existing object in place, so its own
  # attributes (class, names, ...) are retained.
  parsed[] <- lapply(parsed, flatten_column)
  parsed
}
import simdjson
import pandas as pd
def load_and_deserialize(file_path):
    """Parse the JSON file at ``file_path`` and return its ``/Items`` array.

    A fresh ``simdjson.Parser`` is created on every call. The document is
    loaded from disk, the value at JSON pointer ``/Items`` is selected, and
    that value is converted with ``as_list()``.

    Args:
        file_path: Path to the JSON file to load.

    Returns:
        The result of ``as_list()`` on the ``/Items`` value.
    """
    parser = simdjson.Parser()
    return parser.load(file_path).at_pointer("/Items").as_list()
# Show the first deserialized record. `r.temp_file` is the path held in the
# R session's `temp_file` variable.
load_and_deserialize(r.temp_file)[0]
## {'what': {'S': 'HOUSING'}, 'mobile': {'S': 'false'}, 'rating': {'N': '5'}, 'client': {'S': '258acb41-5dc2-4d74-9213-3045c17fb5ec'}, 'timestamp': {'N': '1601452859025'}, 'sourceIp': {'S': '---'}, 'tag': {'S': 'debate-20200930'}, 'id': {'S': '37eff5a8-a9c1-42b4-af4a-ee11096a95df'}, 'who': {'S': 'Judith Collins'}}
def load_deserialize_and_normalize(file_path):
    """Parse ``file_path`` and flatten its ``/Items`` array into a DataFrame.

    The JSON is loaded with a fresh ``simdjson.Parser``, the value at JSON
    pointer ``/Items`` is converted with ``as_list()``, and the result is
    passed to ``pandas.json_normalize`` with ``max_level=1``.

    Args:
        file_path: Path to the JSON file to load.

    Returns:
        The ``pandas.DataFrame`` produced by ``json_normalize``.
    """
    parser = simdjson.Parser()
    items = parser.load(file_path).at_pointer("/Items").as_list()
    # max_level=1 flattens one level of nesting into dotted column names.
    return pd.json_normalize(items, max_level=1)
# Build the normalized DataFrame from the same file and display it.
py_df = load_deserialize_and_normalize(r.temp_file)
py_df
## what.S ... who.S
## 0 HOUSING ... Judith Collins
## 1 ECONOMY ... Jacinda Ardern
## 2 HEALTH ... Jacinda Ardern
## 3 HEALTH ... Judith Collins
## 4 ECONOMY ... Jacinda Ardern
## .. ... ... ...
## 995 EDUCATION ... Judith Collins
## 996 COVID RESPONSE ... Judith Collins
## 997 HEALTH ... Judith Collins
## 998 HEALTH ... Jacinda Ardern
## 999 COVID RESPONSE ... Jacinda Ardern
##
## [1000 rows x 9 columns]
# Fetch the Python-side `py_df` into the R session through the `py` object.
py_df <- py$py_df
tibble::as_tibble(py_df) # {tibble}'s just to pretty print
## # A tibble: 1,000 x 9
## what.S mobile.S rating.N client.S timestamp.N sourceIp.S tag.S id.S who.S
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 HOUSING false 5 258acb41-5dc2-4d74… 16014528590… --- debate-2… 37eff5a8-a9c1-4… Judith …
## 2 ECONOMY false 1 4453930e-46df-4694… 16014513004… --- debate-2… c4604411-6605-4… Jacinda…
## 3 HEALTH false 2 e643ce11-6228-4f24… 16014551903… --- debate-2… 1f3fdfb4-fed0-4… Jacinda…
## 4 HEALTH false 5 8e5c1606-4c9d-40a3… 16014511616… --- debate-2… d565df23-e6da-4… Judith …
## 5 ECONOMY false 5 8af58b60-1241-43a8… 16014683275… --- debate-2… 94a0a462-129e-4… Jacinda…
## 6 COVID RE… false 5 1d5b7748-4251-49f9… 16014558115… --- debate-2… 86edfca2-16bd-4… Judith …
## 7 EDUCATION false 1 d5e79713-7c77-4f38… 16014961883… --- debate-2… aa3fcd3f-b070-4… Judith …
## 8 COVID RE… false 3 744291c7-6a8c-4f9e… 16014759904… --- debate-2… 6e390d17-9b20-4… Judith …
## 9 ECONOMY false 5 34d9a2b2-d230-485d… 16014521415… --- debate-2… c2db5910-5831-4… Judith …
## 10 HOUSING false 1 b27f66be-93cf-4b4c… 16014576553… --- debate-2… 15f9f1e2-c164-4… Jacinda…
## # … with 990 more rows
import timeit

# Copy the path into a Python global so the timeit setup strings below can
# import it from __main__ by name.
temp_file = r.temp_file

# just load and deserialize JSON
timeit.Timer(
    'load_and_deserialize(temp_file)',
    setup="from __main__ import load_and_deserialize, temp_file",
).timeit(number=1) * 1000  # milliseconds
## 3.349987993715331
# load, deserialize, and "normalize" into a data frame
timeit.Timer(
    'load_deserialize_and_normalize(temp_file)',
    setup="from __main__ import load_deserialize_and_normalize, temp_file",
).timeit(number=1) * 1000  # milliseconds
## 71.6570360091282
# Time four variants: plain parsing and parse-plus-flatten, in R and in
# Python. The Python functions are run via py_run_string() with
# convert = FALSE, i.e. without automatic conversion of Python objects to R.
res <- microbenchmark::microbenchmark(
  R_clean = load_and_unlist(temp_file),
  R = fload(temp_file, query = "Items"),
  Py_clean = py_run_string(
    "load_deserialize_and_normalize(temp_file)",
    convert = FALSE
  ),
  Py = py_run_string("load_and_deserialize(temp_file)", convert = FALSE)
)

# Summarise the timings, sorted by median.
print(res, order = "median")
## Unit: milliseconds
## expr min lq mean median uq max neval
## R 2.924479 3.297000 3.898203 3.445144 3.733756 16.17030 100
## R_clean 3.309015 3.640663 4.135215 3.763053 4.141021 16.65119 100
## Py 3.860174 4.363253 5.287088 4.773239 5.513256 13.49291 100
## Py_clean 62.174967 64.821467 67.286170 66.720669 69.029512 76.82542 100
# Same summary with unit = "relative"; in the table below the values are
# ratios to the `R` row.
print(res, order = "median", unit = "relative")
## Unit: relative
## expr min lq mean median uq max neval
## R 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000 100
## R_clean 1.131489 1.104235 1.060800 1.092277 1.109076 1.0297392 100
## Py 1.319953 1.323401 1.356289 1.385498 1.476598 0.8344259 100
## Py_clean 21.260186 19.660742 17.260819 19.366584 18.487955 4.7510214 100
# Plot the timing distribution of each expression, with log = TRUE passed to
# the autoplot method for the time axis.
ggplot2::autoplot(res, log = TRUE)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
![Benchmark timing distributions for the R and Python loaders (log scale)](json-benching_files/figure-gfm/unnamed-chunk-9-1.png)
# Sanity check: the R and Python pipelines must produce the same data.
r_df <- load_and_unlist(temp_file)
py_df <- py$py_df

# Replace every attribute of the Python-derived object with those of the R
# result, so identical() below effectively compares only the underlying
# values.
attributes(py_df) <- attributes(r_df)
stopifnot(identical(py_df, r_df))