Created
August 21, 2017 00:13
-
-
Save szilard/b82635fa9060227514af3423b3225a29 to your computer and use it in GitHub Desktop.
Dataset sizes in OpenML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# OpenML Benchmarking Suites and the OpenML100
# https://arxiv.org/abs/1708.03731
# https://www.openml.org/s/14/data
# Summarise the sizes (number of instances) of the data sets in the
# OpenML100 benchmarking study. Requires the OpenML package and a
# connection to the OpenML server.
library(OpenML)

# Data-set ids that belong to the OpenML100 study.
ids <- getOMLStudy("OpenML100")$data$data.id

# Data-set listing returned by the server, one row per data set.
dsall <- listOMLDataSets()

# Number of study data sets that show up in the listing. The recorded run
# found only 96 of them.
# NOTE(review): presumably listOMLDataSets() filters or truncates the
# listing by default -- confirm against its arguments.
in_study <- dsall$data.id %in% ids
sum(in_study)

# Study ids that are absent from the listing (explains any shortfall above).
setdiff(ids, dsall$data.id)

# Restrict the listing to the study and pull out the instance counts once.
ds <- dsall[in_study, ]
n_instances <- ds$number.of.instances

median(n_instances)
sum(n_instances >= 10000)
sum(n_instances >= 40000)

# Show the largest data sets. Columns are selected by name rather than by
# position so the result does not depend on the listing's column order.
ds[
  n_instances >= 40000,
  c("name", "number.of.classes", "number.of.features", "number.of.instances")
]
#> median(ds$number.of.instances)
#[1] 2054.5
#> sum(ds$number.of.instances>=10000)
#[1] 21
#> sum(ds$number.of.instances>=40000)
#[1] 8
#> ds[ds$number.of.instances>=40000,c(2,10:12)]
#                      name number.of.classes number.of.features
#101            electricity                 2                  9
#413              mnist_784                10                785
#955         KDDCup09_churn                 2                231
#957     KDDCup09_upselling                 2                231
#1165        bank-marketing                 2                 17
#1207 tamilnadu-electricity                20                  4
#1287                 adult                 2                 15
#2550                 higgs                 2                 29
#     number.of.instances
#101                45312
#413                70000
#955                50000
#957                50000
#1165               45211
#1207               45781
#1287               48842
#2550               98050
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment