# get some large EIGENSTRAT format data from
# https://edmond.mpdl.mpg.de/imeji/collection/OBNPDy16CfZWPhz0?q=
ind_file <- tempfile()
geno_file <- tempfile()
download.file(
  "https://edmond.mpdl.mpg.de/imeji/file/d7/32/34/70-96ac-4a64-8eb2-c7d6db355d4a/0/original/77fa944dc4369018c6478dcec3f6ced6.txt?filename=Wang2019Caucasus.ind.txt",
  destfile = ind_file
)
download.file(
  "https://edmond.mpdl.mpg.de/imeji/file/ea/ea/02/b4-4124-43f4-a458-3cf728f8e7bc/0/original/2f35c3d890e8456ac634d18ee45e6c27.txt?filename=Wang2019Caucasus.geno.txt",
  destfile = geno_file
)
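# optional sanity check (not in the original gist): make sure both downloads
# actually produced non-empty files before parsing them
stopifnot(file.size(ind_file) > 0, file.size(geno_file) > 0)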
# read the small .ind file to extract the individual names
inds <- read.csv(ind_file, sep = "\t", header = FALSE)
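# optional sketch: an EIGENSTRAT .ind file has three columns (individual ID,
# sex, group label); a quick look confirms the parsing and shows how many
# individuals (= columns in the .geno file) to expect
head(inds)
nrow(inds)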
# create a LaF connection (https://github.com/djvanderlaan/LaF) to the large .geno file
laf_geno <- LaF::laf_open_fwf(
  geno_file,
  column_types = rep("integer", nrow(inds)),
  column_widths = rep(1, nrow(inds)),
  column_names = inds[, 1]
)
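# optional sketch, assuming LaF's bracket-indexing interface: peek at a few
# SNPs (rows) for the first individuals (columns); EIGENSTRAT genotypes are
# 0/1/2 allele counts, with 9 marking missing data
laf_geno[1:5, 1:3]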
# read the .geno file in chunks of 5000 lines
chunked::read_laf_chunkwise(laf_geno, chunk_size = 5000) |>
  # fold chunk-wise to count non-missing SNPs (!= 9) per individual
  # dplyr::summarise is actually overwritten by chunked:::summarise.chunkwise
  dplyr::summarise(
    dplyr::across(dplyr::everything(), \(x) sum(x != 9))
  ) |>
  # transform the intermediate result to a tibble
  # (chunked is lazy, so the calculation starts only here)
  tibble::as_tibble() |>
  # complete the chunk-wise fold to a total sum per individual
  dplyr::summarise(
    dplyr::across(dplyr::everything(), \(x) sum(x))
  )
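
# optional follow-up sketch: the pipeline above yields a one-row tibble with
# one column per individual; assuming its result were assigned to a variable,
# e.g. `snp_counts` (hypothetical name), it could be reshaped to one row per
# individual with tidyr:
# snp_counts |>
#   tidyr::pivot_longer(
#     dplyr::everything(),
#     names_to = "individual", values_to = "n_snps_covered"
#   )

# remove the temporary files when done
unlink(c(ind_file, geno_file))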