liangyy/load_mixqtl_data_bundle.R

## load_mixqtl_data_bundle.R
load_mixqtl_data_bundle = function(data_dir = 'sample_data/', gene_id = 'ENSG00000198951.6', cis_window = 10000){
  if(!dir.exists(data_dir)) {
    message('data_dir: ', data_dir, ' does not exist! Exit!')
    quit()
  }
  haplotype1 = 'geuvadis_22_hap1.txt.gz'
  haplotype2 = 'geuvadis_22_hap2.txt.gz'
  total_count = 'expression_count.txt.gz'
  allele_specific_count_hap1 = 'geuvadis.asc.h1.tsv.gz'
  allele_specific_count_hap2 = 'geuvadis.asc.h2.tsv.gz'
  covariate = 'geuvadis.covariate.txt.gz'
  library_size = 'geuvadis.library_size.tsv.gz'
  gene_annot = 'gencode.v12.txt.gz'
  variant_annot = 'geuvadis_22_variant_annotation.txt.gz'

  # gene_id = 'ENSG00000198951.6'
  # cis_window = 10000

  .load_one = function(data_dir, filename, gene_name) {
    line = data.table::fread(cmd = paste0('zcat < ', data_dir, '/', filename, ' 2>/dev/null | ', 'grep ', gene_id), header = FALSE, sep = '\t', data.table = FALSE)
    header = data.table::fread(cmd = paste0('zcat < ', data_dir, '/', filename, ' 2>/dev/null | ', 'head -n 1'), header = FALSE, sep = '\t', data.table = FALSE)
    colnames(line) = header[1, ]
    return(line)
  }
  .quick_read = function(data_dir, filename) {
    line = data.table::fread(cmd = paste0('zcat < ', data_dir, '/', filename), header = TRUE, sep = '\t', data.table = FALSE)
    return(line)
  }
  .get_tss = function(s, e, st) {
    o = s
    if(!is.null(st)) {
      o[st == '-'] = e[st == '-']
    }
    return(o)
  }
  .rearrage_count_row = function(rr, col_name) {
    rr = data.frame(indiv = colnames(rr)[-1], value = as.numeric(rr[1, -1]))
    colnames(rr)[2] = col_name
    return(rr)
  }
  .rearrange_haplotype = function(hap, missing_rate_threshold, indiv_list) {
    hap_m = t(hap[, -1])
    colnames(hap_m) = hap[, 1]
    high_missing_rate = rowMeans(is.na(hap_m)) > missing_rate_threshold | rowMeans(is.na(hap_m)) > missing_rate_threshold
    hap_m = hap_m[!high_missing_rate, ]
    hap_m = hap_m[rownames(hap_m) %in% indiv_list, ]
    return(hap_m)
  }

  # gene meta information
  message('Loading gene annotation')
  annot_ = .load_one(data_dir, gene_annot, gene_id)
  if(annot_$chromosome != 22) {
    message('The gene is not on chromosome 22. Please use gene on chromosome 22 instead.')
    quit()
  }
  tss_ = .get_tss(annot_$start, annot_$end, annot_$strand)

  # variant meta information
  message('Loading variant annotation')
  var_ = .quick_read(data_dir, variant_annot)
  cis_var_ = var_[ var_$position >= tss_ - cis_window & var_$position <= tss_ + cis_window, ]

  # load genotype
  message('Loading genotype')
  hap1_ = .quick_read(data_dir, haplotype1)
  hap2_ = .quick_read(data_dir, haplotype2)
  hap1_ = hap1_[ hap1_$variant %in% cis_var_$variant, ]
  hap2_ = hap2_[ hap2_$variant %in% cis_var_$variant, ]

  # load library size
  message('Loading library size')
  libsize_ = .quick_read(data_dir, library_size)

  # load total count
  message('Loading total count')
  trc_ = .load_one(data_dir, total_count, gene_id)
  trc_ = .rearrage_count_row(trc_, 'trc')

  # load allele-specific count
  message('Loading allele-specific count')
  asc1_ = .load_one(data_dir, allele_specific_count_hap1, gene_id)
  asc2_ = .load_one(data_dir, allele_specific_count_hap2, gene_id)
  asc1_ = .rearrage_count_row(asc1_, 'asc1')
  asc2_ = .rearrage_count_row(asc2_, 'asc2')

  # counts data.frame
  message('Composing counts into data.frame')
  dd = dplyr::inner_join(asc1_, asc2_, by = 'indiv')
  dd = dplyr::inner_join(dd, trc_, by = 'indiv')
  dd$indiv = as.character(dd$indiv)
  dd = dplyr::inner_join(dd, libsize_[, 1:2], by = 'indiv')

  # covariate
  message('Loading covariate')
  covariate_ = .quick_read(data_dir, covariate)

  # individuals
  indiv_ids = intersect(
    colnames(covariate_)[-1],
    dd$indiv
  )
  indiv_ids = intersect(
    indiv_ids,
    colnames(hap1_)[-1]
  )

  # rearrange genotype
  message('Rearranging genotype')
  hap1_mat = .rearrange_haplotype(hap1_, missing_rate_threshold = 0.5, indiv_list = indiv_ids)
  hap2_mat = .rearrange_haplotype(hap2_, missing_rate_threshold = 0.5, indiv_list = indiv_ids)
  maf = (colMeans(hap1_mat) + colMeans(hap2_mat)) / 2
  hap1_mat = hap1_mat[, maf > 0.01]
  hap2_mat = hap2_mat[, maf > 0.01]
  indiv_ids = rownames(hap1_mat)

  # rearrange covariate
  message('Rearranging covariate')
  covariate_ = covariate_[, c(1, match(indiv_ids, colnames(covariate_)[-1]) + 1)]

  # rearrange count data.frame
  message('Rearranging count data.frame')
  dd = dd[match(indiv_ids, dd$indiv), ]

  return(list(count_df = dd, haplotype1 = hap1_mat, haplotype2 = hap2_mat, covariate_df = covariate_))
}

## tutorial_mixqtl_r_functions.R
# rm(list = ls())
source('https://gist.githubusercontent.com/liangyy/eae953bfd8af2bab89e18fa870bbabd7/raw/a4739882718301138a2e7def1de696e2bc38bad2/load_mixqtl_data_bundle.R')

data_bundle = load_mixqtl_data_bundle('~/Downloads/sample_data/')

library(dplyr)

head(data_bundle$count_df, 3)
head(data_bundle$haplotype1[, 1:3], 3)
head(data_bundle$haplotype2[, 1:3], 3)
head(data_bundle$covariate_df[, 1:3], 3)

# mixQTL
covariate_offset = mixqtl::regress_against_covariate(
  trc = data_bundle$count_df$trc,
  lib_size = data_bundle$count_df$lib_size,
  covariates = data_bundle$covariate_df
)
head(covariate_offset, 3)

output_mixqtl = mixqtl::mixqtl(
  geno1 = data_bundle$haplotype1,
  geno2 = data_bundle$haplotype2,
  y1 = data_bundle$count_df$asc1,
  y2 = data_bundle$count_df$asc2,
  ytotal = data_bundle$count_df$trc,
  lib_size = data_bundle$count_df$lib_size,
  cov_offset = covariate_offset
)
head(output_mixqtl, 3)

output_mixfine = mixqtl::mixfine(
  geno1 = data_bundle$haplotype1,
  geno2 = data_bundle$haplotype2,
  y1 = data_bundle$count_df$asc1,
  y2 = data_bundle$count_df$asc2,
  ytotal = data_bundle$count_df$trc,
  lib_size = data_bundle$count_df$lib_size,
  cov_offset = covariate_offset
)
head(summary(output_mixfine)$vars, 3)
head(summary(output_mixfine)$cs, 3)

output_mixpred = mixqtl::mixpred(
  geno1 = data_bundle$haplotype1,
  geno2 = data_bundle$haplotype2,
  y1 = data_bundle$count_df$asc1,
  y2 = data_bundle$count_df$asc2,
  ytotal = data_bundle$count_df$trc,
  lib_size = data_bundle$count_df$lib_size,
  cov_offset = covariate_offset
)
head(output_mixpred$beta, 3)
y_predict = (data_bundle$haplotype1 + data_bundle$haplotype2) %*% output_mixpred$beta[-1]
y_observe = data_bundle$count_df %>% mutate(y = log(trc / lib_size / 2)) %>% pull(y)
cor(
  y_predict[!is.infinite(y_observe)],
  y_observe[!is.infinite(y_observe)]
)
	load_mixqtl_data_bundle = function(data_dir = 'sample_data/', gene_id = 'ENSG00000198951.6', cis_window = 10000){
	if(!dir.exists(data_dir)) {
	message('data_dir: ', data_dir, ' does not exist! Exit!')
	quit()
	}
	haplotype1 = 'geuvadis_22_hap1.txt.gz'
	haplotype2 = 'geuvadis_22_hap2.txt.gz'
	total_count = 'expression_count.txt.gz'
	allele_specific_count_hap1 = 'geuvadis.asc.h1.tsv.gz'
	allele_specific_count_hap2 = 'geuvadis.asc.h2.tsv.gz'
	covariate = 'geuvadis.covariate.txt.gz'
	library_size = 'geuvadis.library_size.tsv.gz'
	gene_annot = 'gencode.v12.txt.gz'
	variant_annot = 'geuvadis_22_variant_annotation.txt.gz'

	# gene_id = 'ENSG00000198951.6'
	# cis_window = 10000

	.load_one = function(data_dir, filename, gene_name) {
	line = data.table::fread(cmd = paste0('zcat < ', data_dir, '/', filename, ' 2>/dev/null \| ', 'grep ', gene_id), header = FALSE, sep = '\t', data.table = FALSE)
	header = data.table::fread(cmd = paste0('zcat < ', data_dir, '/', filename, ' 2>/dev/null \| ', 'head -n 1'), header = FALSE, sep = '\t', data.table = FALSE)
	colnames(line) = header[1, ]
	return(line)
	}
	.quick_read = function(data_dir, filename) {
	line = data.table::fread(cmd = paste0('zcat < ', data_dir, '/', filename), header = TRUE, sep = '\t', data.table = FALSE)
	return(line)
	}
	.get_tss = function(s, e, st) {
	o = s
	if(!is.null(st)) {
	o[st == '-'] = e[st == '-']
	}
	return(o)
	}
	.rearrage_count_row = function(rr, col_name) {
	rr = data.frame(indiv = colnames(rr)[-1], value = as.numeric(rr[1, -1]))
	colnames(rr)[2] = col_name
	return(rr)
	}
	.rearrange_haplotype = function(hap, missing_rate_threshold, indiv_list) {
	hap_m = t(hap[, -1])
	colnames(hap_m) = hap[, 1]
	high_missing_rate = rowMeans(is.na(hap_m)) > missing_rate_threshold \| rowMeans(is.na(hap_m)) > missing_rate_threshold
	hap_m = hap_m[!high_missing_rate, ]
	hap_m = hap_m[rownames(hap_m) %in% indiv_list, ]
	return(hap_m)
	}

	# gene meta information
	message('Loading gene annotation')
	annot_ = .load_one(data_dir, gene_annot, gene_id)
	if(annot_$chromosome != 22) {
	message('The gene is not on chromosome 22. Please use gene on chromosome 22 instead.')
	quit()
	}
	tss_ = .get_tss(annot_$start, annot_$end, annot_$strand)

	# variant meta information
	message('Loading variant annotation')
	var_ = .quick_read(data_dir, variant_annot)
	cis_var_ = var_[ var_$position >= tss_ - cis_window & var_$position <= tss_ + cis_window, ]

	# load genotype
	message('Loading genotype')
	hap1_ = .quick_read(data_dir, haplotype1)
	hap2_ = .quick_read(data_dir, haplotype2)
	hap1_ = hap1_[ hap1_$variant %in% cis_var_$variant, ]
	hap2_ = hap2_[ hap2_$variant %in% cis_var_$variant, ]

	# load library size
	message('Loading library size')
	libsize_ = .quick_read(data_dir, library_size)

	# load total count
	message('Loading total count')
	trc_ = .load_one(data_dir, total_count, gene_id)
	trc_ = .rearrage_count_row(trc_, 'trc')

	# load allele-specific count
	message('Loading allele-specific count')
	asc1_ = .load_one(data_dir, allele_specific_count_hap1, gene_id)
	asc2_ = .load_one(data_dir, allele_specific_count_hap2, gene_id)
	asc1_ = .rearrage_count_row(asc1_, 'asc1')
	asc2_ = .rearrage_count_row(asc2_, 'asc2')

	# counts data.frame
	message('Composing counts into data.frame')
	dd = dplyr::inner_join(asc1_, asc2_, by = 'indiv')
	dd = dplyr::inner_join(dd, trc_, by = 'indiv')
	dd$indiv = as.character(dd$indiv)
	dd = dplyr::inner_join(dd, libsize_[, 1:2], by = 'indiv')

	# covariate
	message('Loading covariate')
	covariate_ = .quick_read(data_dir, covariate)

	# individuals
	indiv_ids = intersect(
	colnames(covariate_)[-1],
	dd$indiv
	)
	indiv_ids = intersect(
	indiv_ids,
	colnames(hap1_)[-1]
	)

	# rearrange genotype
	message('Rearranging genotype')
	hap1_mat = .rearrange_haplotype(hap1_, missing_rate_threshold = 0.5, indiv_list = indiv_ids)
	hap2_mat = .rearrange_haplotype(hap2_, missing_rate_threshold = 0.5, indiv_list = indiv_ids)
	maf = (colMeans(hap1_mat) + colMeans(hap2_mat)) / 2
	hap1_mat = hap1_mat[, maf > 0.01]
	hap2_mat = hap2_mat[, maf > 0.01]
	indiv_ids = rownames(hap1_mat)

	# rearrange covariate
	message('Rearranging covariate')
	covariate_ = covariate_[, c(1, match(indiv_ids, colnames(covariate_)[-1]) + 1)]

	# rearrange count data.frame
	message('Rearranging count data.frame')
	dd = dd[match(indiv_ids, dd$indiv), ]

	return(list(count_df = dd, haplotype1 = hap1_mat, haplotype2 = hap2_mat, covariate_df = covariate_))
	}
	# rm(list = ls())
	source('https://gist.githubusercontent.com/liangyy/eae953bfd8af2bab89e18fa870bbabd7/raw/a4739882718301138a2e7def1de696e2bc38bad2/load_mixqtl_data_bundle.R')

	data_bundle = load_mixqtl_data_bundle('~/Downloads/sample_data/')

	library(dplyr)

	head(data_bundle$count_df, 3)
	head(data_bundle$haplotype1[, 1:3], 3)
	head(data_bundle$haplotype2[, 1:3], 3)
	head(data_bundle$covariate_df[, 1:3], 3)

	# mixQTL
	covariate_offset = mixqtl::regress_against_covariate(
	trc = data_bundle$count_df$trc,
	lib_size = data_bundle$count_df$lib_size,
	covariates = data_bundle$covariate_df
	)
	head(covariate_offset, 3)

	output_mixqtl = mixqtl::mixqtl(
	geno1 = data_bundle$haplotype1,
	geno2 = data_bundle$haplotype2,
	y1 = data_bundle$count_df$asc1,
	y2 = data_bundle$count_df$asc2,
	ytotal = data_bundle$count_df$trc,
	lib_size = data_bundle$count_df$lib_size,
	cov_offset = covariate_offset
	)
	head(output_mixqtl, 3)

	output_mixfine = mixqtl::mixfine(
	geno1 = data_bundle$haplotype1,
	geno2 = data_bundle$haplotype2,
	y1 = data_bundle$count_df$asc1,
	y2 = data_bundle$count_df$asc2,
	ytotal = data_bundle$count_df$trc,
	lib_size = data_bundle$count_df$lib_size,
	cov_offset = covariate_offset
	)
	head(summary(output_mixfine)$vars, 3)
	head(summary(output_mixfine)$cs, 3)

	output_mixpred = mixqtl::mixpred(
	geno1 = data_bundle$haplotype1,
	geno2 = data_bundle$haplotype2,
	y1 = data_bundle$count_df$asc1,
	y2 = data_bundle$count_df$asc2,
	ytotal = data_bundle$count_df$trc,
	lib_size = data_bundle$count_df$lib_size,
	cov_offset = covariate_offset
	)
	head(output_mixpred$beta, 3)
	y_predict = (data_bundle$haplotype1 + data_bundle$haplotype2) %*% output_mixpred$beta[-1]
	y_observe = data_bundle$count_df %>% mutate(y = log(trc / lib_size / 2)) %>% pull(y)
	cor(
	y_predict[!is.infinite(y_observe)],
	y_observe[!is.infinite(y_observe)]
	)