gergness/survey_arrange.Rmd

## survey_arrange.Rmd
``` r
# Demonstration: a survey design object keeps the data (`$variables`) and
# the weighting information (`$prob`) in separate slots that must stay in
# the same row order. Sorting only the data gives wrong estimates.
suppressPackageStartupMessages({
  library(survey)
  library(srvyr)
  library(dplyr)
})

# Stratified sample (`apistrat`) from the `api` example data
data(api)
dstrata <- apistrat %>%
  as_survey(strata = stype, weights = pw)

# The data is now stored in dstrata$variables
dstrata$variables %>%
  select(stype, api99, pw)
#> # A tibble: 200 x 3
#>     stype api99    pw
#>  * <fctr> <int> <dbl>
#>  1      E   816 44.21
#>  2      E   476 44.21
#>  3      E   544 44.21
#>  4      E   457 44.21
#>  5      E   659 44.21
#>  6      E   780 44.21
#>  7      E   787 44.21
#>  8      E   731 44.21
#>  9      E   508 44.21
#> 10      E   658 44.21
#> # ... with 190 more rows

# And the weighting information in dstrata$prob
head(dstrata$prob)
#>          1          2          3          4          5          6
#> 0.02261932 0.02261932 0.02261932 0.02261932 0.02261932 0.02261932

# prob is equal to 1/weight.
# Compare with a numeric tolerance rather than `==`, because exact
# floating-point equality is fragile. Attributes are ignored because
# `prob` is a named vector (see the names printed by head() above).
isTRUE(all.equal(
  dstrata$prob,
  1 / dstrata$variables$pw,
  check.attributes = FALSE
))
#> [1] TRUE


# We get the same results if we arrange before creating object
dstrata_rearranged <- apistrat %>%
  arrange(api99) %>%
  as_survey(strata = stype, weights = pw)

dstrata_rearranged %>%
  group_by(stype) %>%
  summarize(api99 = survey_mean(api99))
#> # A tibble: 3 x 3
#>    stype  api99 api99_se
#>   <fctr>  <dbl>    <dbl>
#> 1      E 635.87 13.33941
#> 2      H 617.36 15.80575
#> 3      M 610.20 16.75894

dstrata %>%
  group_by(stype) %>%
  summarize(api99 = survey_mean(api99))
#> # A tibble: 3 x 3
#>    stype  api99 api99_se
#>   <fctr>  <dbl>    <dbl>
#> 1      E 635.87 13.33941
#> 2      H 617.36 15.80575
#> 3      M 610.20 16.75894


# But if we arrange only the data without also rearranging the weighting
# information we get wrong results: the rows of `$variables` no longer line
# up with the entries of `$prob`.
dstrata_bad <- dstrata
dstrata_bad$variables <- dstrata_bad$variables %>%
  arrange(api99)

dstrata_bad %>%
  group_by(stype) %>%
  summarize(api99 = survey_mean(api99))
#> # A tibble: 3 x 3
#>    stype    api99 api99_se
#>   <fctr>    <dbl>    <dbl>
#> 1      E 624.0336 14.98117
#> 2      H 611.6568 17.38391
#> 3      M 608.0925 17.86467


# When I wrote that vignette, I didn't understand that the survey package
# has a method for `[` that allows for this kind of rearrangement.
# I think I should add it to srvyr.
# Subsetting the design object itself (rather than only `$variables`)
# reproduces the correct estimates shown above.
dstrata_fixed <- dstrata[order(dstrata$variables$api99)]

dstrata_fixed %>%
  group_by(stype) %>%
  summarize(api99 = survey_mean(api99))
#> # A tibble: 3 x 3
#>    stype  api99 api99_se
#>   <fctr>  <dbl>    <dbl>
#> 1      E 635.87 13.33941
#> 2      H 617.36 15.80575
#> 3      M 610.20 16.75894
```
	``` r
	# Demonstration: a survey design object keeps the data (`$variables`) and
	# the weighting information (`$prob`) in separate slots that must stay in
	# the same row order. Sorting only the data gives wrong estimates.
	suppressPackageStartupMessages({
	  library(survey)
	  library(srvyr)
	  library(dplyr)
	})

	# Stratified sample (`apistrat`) from the `api` example data
	data(api)
	dstrata <- apistrat %>%
	  as_survey(strata = stype, weights = pw)

	# The data is now stored in dstrata$variables
	dstrata$variables %>%
	  select(stype, api99, pw)
	#> # A tibble: 200 x 3
	#>     stype api99    pw
	#>  * <fctr> <int> <dbl>
	#>  1      E   816 44.21
	#>  2      E   476 44.21
	#>  3      E   544 44.21
	#>  4      E   457 44.21
	#>  5      E   659 44.21
	#>  6      E   780 44.21
	#>  7      E   787 44.21
	#>  8      E   731 44.21
	#>  9      E   508 44.21
	#> 10      E   658 44.21
	#> # ... with 190 more rows

	# And the weighting information in dstrata$prob
	head(dstrata$prob)
	#>          1          2          3          4          5          6
	#> 0.02261932 0.02261932 0.02261932 0.02261932 0.02261932 0.02261932

	# prob is equal to 1/weight.
	# Compare with a numeric tolerance rather than `==`, because exact
	# floating-point equality is fragile. Attributes are ignored because
	# `prob` is a named vector (see the names printed by head() above).
	isTRUE(all.equal(
	  dstrata$prob,
	  1 / dstrata$variables$pw,
	  check.attributes = FALSE
	))
	#> [1] TRUE


	# We get the same results if we arrange before creating object
	dstrata_rearranged <- apistrat %>%
	  arrange(api99) %>%
	  as_survey(strata = stype, weights = pw)

	dstrata_rearranged %>%
	  group_by(stype) %>%
	  summarize(api99 = survey_mean(api99))
	#> # A tibble: 3 x 3
	#>    stype  api99 api99_se
	#>   <fctr>  <dbl>    <dbl>
	#> 1      E 635.87 13.33941
	#> 2      H 617.36 15.80575
	#> 3      M 610.20 16.75894

	dstrata %>%
	  group_by(stype) %>%
	  summarize(api99 = survey_mean(api99))
	#> # A tibble: 3 x 3
	#>    stype  api99 api99_se
	#>   <fctr>  <dbl>    <dbl>
	#> 1      E 635.87 13.33941
	#> 2      H 617.36 15.80575
	#> 3      M 610.20 16.75894


	# But if we arrange only the data without also rearranging the weighting
	# information we get wrong results: the rows of `$variables` no longer line
	# up with the entries of `$prob`.
	dstrata_bad <- dstrata
	dstrata_bad$variables <- dstrata_bad$variables %>%
	  arrange(api99)

	dstrata_bad %>%
	  group_by(stype) %>%
	  summarize(api99 = survey_mean(api99))
	#> # A tibble: 3 x 3
	#>    stype    api99 api99_se
	#>   <fctr>    <dbl>    <dbl>
	#> 1      E 624.0336 14.98117
	#> 2      H 611.6568 17.38391
	#> 3      M 608.0925 17.86467


	# When I wrote that vignette, I didn't understand that the survey package
	# has a method for `[` that allows for this kind of rearrangement.
	# I think I should add it to srvyr.
	# Subsetting the design object itself (rather than only `$variables`)
	# reproduces the correct estimates shown above.
	dstrata_fixed <- dstrata[order(dstrata$variables$api99)]

	dstrata_fixed %>%
	  group_by(stype) %>%
	  summarize(api99 = survey_mean(api99))
	#> # A tibble: 3 x 3
	#>    stype  api99 api99_se
	#>   <fctr>  <dbl>    <dbl>
	#> 1      E 635.87 13.33941
	#> 2      H 617.36 15.80575
	#> 3      M 610.20 16.75894
	```