Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
I think this is closest to what was described; however, it still holds all of the data in memory.
``` r
library(ipumsr)

# Read the DDI for the example CPS extract that ships with ipumsr and pull out
# the full vector of variable names once.
ddi <- read_ipums_ddi(ipums_example("cps_00006.xml"))
all_vars <- ddi$var_info$var_name

# Split the variables into four named groups.
# (These are just the names for an extract included with ipumsr - they stand in
# for REPWT1-REPWT40, REPWT41-REPWT80, etc.)
var_groups <- list(
  HH_RE1 = tidyselect::vars_select(all_vars, YEAR:SERIAL),
  HH_RE2 = tidyselect::vars_select(all_vars, HWTSUPP:STATEFIP),
  PER_RE1 = tidyselect::vars_select(all_vars, MONTH:PERNUM),
  PER_RE2 = tidyselect::vars_select(all_vars, WTSUPP:INCTOT)
)
# Callback applied to every chunk: split the chunk `x` into one data frame per
# variable group, returned as a list named like `var_groups`.
# `pos` is accepted to match the callback signature but is not used here.
f <- function(x, pos) {
  lapply(var_groups, function(group_vars) {
    dplyr::select(x, one_of(group_vars))
  })
}
# A *list* callback is used because every chunk yields several data frames
# (one per variable group) that we don't want to combine with each other.
chunked_data <- read_ipums_micro_chunked(
  ddi = ddi,
  callback = IpumsListCallback$new(f),
  vars = one_of(unlist(var_groups)),
  # Small chunk size just to make sure this small example has multiple chunks
  chunk_size = 1000
)
#> Use of data from IPUMS-CPS is subject to conditions including that users should
#> cite the data appropriately. Use command `ipums_conditions()` for more details.
# Here's a peek at what the list looks like: one element per chunk
str(chunked_data, max.level = 1)
#> List of 8
#> $ :List of 4
#> $ :List of 4
#> $ :List of 4
#> $ :List of 4
#> $ :List of 4
#> $ :List of 4
#> $ :List of 4
#> $ :List of 4
# The first chunk's element holds one data frame per variable group
str(chunked_data[[1]], max.level = 1)
#> List of 4
#> $ HH_RE1 :Classes 'tbl_df', 'tbl' and 'data.frame': 1000 obs. of 2 variables:
#> $ HH_RE2 :Classes 'tbl_df', 'tbl' and 'data.frame': 1000 obs. of 2 variables:
#> $ PER_RE1:Classes 'tbl_df', 'tbl' and 'data.frame': 1000 obs. of 2 variables:
#> $ PER_RE2:Classes 'tbl_df', 'tbl' and 'data.frame': 1000 obs. of 2 variables:
# For each variable group, pull that group's data frame out of every chunk,
# stack the pieces into a single data frame and save it as "<group>.Rds".
# ipums_bind_rows() is used for the stacking so the haven labels are preserved.
purrr::walk(names(var_groups), function(group_name) {
  group_chunks <- purrr::map(chunked_data, function(chunk) chunk[[group_name]])
  combined <- ipums_bind_rows(group_chunks)
  saveRDS(combined, paste0(group_name, ".Rds"))
})
# Have 4 Rds files saved with haven labels.
# `pattern` is a regular expression, so the dot is escaped and the extension is
# anchored to the end of the name; a bare ".Rds" would also match names that
# merely contain any character followed by "Rds".
dir(pattern = "\\.Rds$")
#> [1] "HH_RE1.Rds" "HH_RE2.Rds" "PER_RE1.Rds" "PER_RE2.Rds"
# Read one file back to check that the labels are still attached
readRDS("HH_RE2.Rds")
#> # A tibble: 7,668 x 2
#> HWTSUPP STATEFIP
#> <dbl> <int+lbl>
#> 1 1476. 55 [Wisconsin]
#> 2 1476. 55 [Wisconsin]
#> 3 1476. 55 [Wisconsin]
#> 4 1598. 27 [Minnesota]
#> 5 1707. 27 [Minnesota]
#> 6 1790. 27 [Minnesota]
#> 7 4355. 19 [Iowa]
#> 8 4355. 19 [Iowa]
#> 9 4355. 19 [Iowa]
#> 10 4355. 19 [Iowa]
#> # ... with 7,658 more rows
```
<sup>Created on 2019-02-11 by the [reprex package](https://reprex.tidyverse.org) (v0.2.1)</sup>
This doesn't use chunks, but it uses less memory by reading only the variables needed for each file (it may take longer because it reads more from disk).
``` r
library(ipumsr)

# Read the DDI for the example CPS extract that ships with ipumsr and pull out
# the full vector of variable names once.
ddi <- read_ipums_ddi(ipums_example("cps_00006.xml"))
all_vars <- ddi$var_info$var_name

# Split the variables into four named groups.
# (These are just the names for an extract included with ipumsr - they stand in
# for REPWT1-REPWT40, REPWT41-REPWT80, etc.)
var_groups <- list(
  HH_RE1 = tidyselect::vars_select(all_vars, YEAR:SERIAL),
  HH_RE2 = tidyselect::vars_select(all_vars, HWTSUPP:STATEFIP),
  PER_RE1 = tidyselect::vars_select(all_vars, MONTH:PERNUM),
  PER_RE2 = tidyselect::vars_select(all_vars, WTSUPP:INCTOT)
)
# For each variable group, read only that group's variables from the extract
# and save the result as "<group>.Rds". Only one group's data is read per call.
purrr::walk2(var_groups, names(var_groups), function(group_vars, group_name) {
  group_data <- read_ipums_micro(ddi, vars = one_of(group_vars))
  saveRDS(group_data, paste0(group_name, ".Rds"))
})
#> Use of data from IPUMS-CPS is subject to conditions including that users should
#> cite the data appropriately. Use command `ipums_conditions()` for more details.
#>
#> Use of data from IPUMS-CPS is subject to conditions including that users should
#> cite the data appropriately. Use command `ipums_conditions()` for more details.
#>
#> Use of data from IPUMS-CPS is subject to conditions including that users should
#> cite the data appropriately. Use command `ipums_conditions()` for more details.
#>
#> Use of data from IPUMS-CPS is subject to conditions including that users should
#> cite the data appropriately. Use command `ipums_conditions()` for more details.
# Have 4 Rds files saved with haven labels.
# `pattern` is a regular expression, so the dot is escaped and the extension is
# anchored to the end of the name; a bare ".Rds" would also match names that
# merely contain any character followed by "Rds".
dir(pattern = "\\.Rds$")
#> [1] "HH_RE1.Rds" "HH_RE2.Rds" "PER_RE1.Rds" "PER_RE2.Rds"
# Read one file back to check that the labels are still attached
readRDS("HH_RE2.Rds")
#> # A tibble: 7,668 x 2
#> HWTSUPP STATEFIP
#> <dbl> <int+lbl>
#> 1 1476. 55 [Wisconsin]
#> 2 1476. 55 [Wisconsin]
#> 3 1476. 55 [Wisconsin]
#> 4 1598. 27 [Minnesota]
#> 5 1707. 27 [Minnesota]
#> 6 1790. 27 [Minnesota]
#> 7 4355. 19 [Iowa]
#> 8 4355. 19 [Iowa]
#> 9 4355. 19 [Iowa]
#> 10 4355. 19 [Iowa]
#> # ... with 7,658 more rows
```
<sup>Created on 2019-02-11 by the [reprex package](https://reprex.tidyverse.org) (v0.2.1)</sup>
This uses chunks to take the minimum amount of memory. Since Rds files cannot be appended to, it writes csv files instead.
``` r
library(ipumsr)

# Read the DDI for the example CPS extract that ships with ipumsr and pull out
# the full vector of variable names once.
ddi <- read_ipums_ddi(ipums_example("cps_00006.xml"))
all_vars <- ddi$var_info$var_name

# Split the variables into four named groups.
# (These are just the names for an extract included with ipumsr - they stand in
# for REPWT1-REPWT40, REPWT41-REPWT80, etc.)
var_groups <- list(
  HH_RE1 = tidyselect::vars_select(all_vars, YEAR:SERIAL),
  HH_RE2 = tidyselect::vars_select(all_vars, HWTSUPP:STATEFIP),
  PER_RE1 = tidyselect::vars_select(all_vars, MONTH:PERNUM),
  PER_RE2 = tidyselect::vars_select(all_vars, WTSUPP:INCTOT)
)
# Open one csv connection in write mode per variable group and key the list by
# group name, so each chunk can be written to the matching open file.
file_connections <- stats::setNames(
  lapply(
    names(var_groups),
    function(vg_name) file(paste0(vg_name, ".csv"), "w")
  ),
  names(var_groups)
)
# Callback run on every chunk: split the chunk `x` by variable group and write
# each piece to that group's open csv connection.
# Column names are written only when `pos` is not greater than 1 (presumably
# the first chunk); every later chunk is appended without a header.
f <- function(x, pos) {
  append_rows <- pos > 1
  for (vg_name in names(var_groups)) {
    group_vars <- var_groups[[vg_name]]
    readr::write_csv(
      dplyr::select(x, one_of(group_vars)),
      file_connections[[vg_name]],
      append = append_rows
    )
  }
  # Same invisible return value as the iterate-for-side-effects form
  invisible(var_groups)
}
# Use *SideEffect* callback because we're saving to csv during each chunk
# and aren't returning the data.
# The connections are closed in `finally` so they are released even if the
# read fails partway through, instead of being left open by an early error.
chunked_data <- tryCatch(
  read_ipums_micro_chunked(
    ddi,
    IpumsSideEffectCallback$new(f),
    vars = one_of(unlist(var_groups)),
    chunk_size = 1000 # Just to make sure we have multiple chunks for this small example
  ),
  finally = purrr::walk(file_connections, ~close(.))
)
#> Use of data from IPUMS-CPS is subject to conditions including that users should
#> cite the data appropriately. Use command `ipums_conditions()` for more details.
# Have 4 csv files, can use `set_ipums_var_attributes()` to add metadata.
# `pattern` is a regular expression, so the dot is escaped and the extension is
# anchored to the end of the name; a bare ".csv" would also match names that
# merely contain any character followed by "csv".
dir(pattern = "\\.csv$")
#> [1] "HH_RE1.csv" "HH_RE2.csv" "PER_RE1.csv" "PER_RE2.csv"
data <- readr::read_csv("HH_RE2.csv")
#> Parsed with column specification:
#> cols(
#> HWTSUPP = col_double(),
#> STATEFIP = col_double()
#> )
# Re-attach the metadata from the DDI to the plain csv data; the labelled
# result is only printed here, not assigned back to `data`.
set_ipums_var_attributes(data, ddi)
#> # A tibble: 7,668 x 2
#> HWTSUPP STATEFIP
#> <dbl> <dbl+lbl>
#> 1 1476. 55 [Wisconsin]
#> 2 1476. 55 [Wisconsin]
#> 3 1476. 55 [Wisconsin]
#> 4 1598. 27 [Minnesota]
#> 5 1707. 27 [Minnesota]
#> 6 1790. 27 [Minnesota]
#> 7 4355. 19 [Iowa]
#> 8 4355. 19 [Iowa]
#> 9 4355. 19 [Iowa]
#> 10 4355. 19 [Iowa]
#> # ... with 7,658 more rows
```
<sup>Created on 2019-02-11 by the [reprex package](https://reprex.tidyverse.org) (v0.2.1)</sup>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.