Skip to content

Instantly share code, notes, and snippets.

@liangyy
Last active April 13, 2020 19:30
Show Gist options
  • Save liangyy/eae953bfd8af2bab89e18fa870bbabd7 to your computer and use it in GitHub Desktop.
Save liangyy/eae953bfd8af2bab89e18fa870bbabd7 to your computer and use it in GitHub Desktop.
Load GEUVADIS data into R (data url: https://zenodo.org/record/3749838#.XpS15FP0nJ8)
load_mixqtl_data_bundle = function(data_dir = 'sample_data/', gene_id = 'ENSG00000198951.6', cis_window = 10000){
if(!dir.exists(data_dir)) {
message('data_dir: ', data_dir, ' does not exist! Exit!')
quit()
}
haplotype1 = 'geuvadis_22_hap1.txt.gz'
haplotype2 = 'geuvadis_22_hap2.txt.gz'
total_count = 'expression_count.txt.gz'
allele_specific_count_hap1 = 'geuvadis.asc.h1.tsv.gz'
allele_specific_count_hap2 = 'geuvadis.asc.h2.tsv.gz'
covariate = 'geuvadis.covariate.txt.gz'
library_size = 'geuvadis.library_size.tsv.gz'
gene_annot = 'gencode.v12.txt.gz'
variant_annot = 'geuvadis_22_variant_annotation.txt.gz'
# gene_id = 'ENSG00000198951.6'
# cis_window = 10000
.load_one = function(data_dir, filename, gene_name) {
line = data.table::fread(cmd = paste0('zcat < ', data_dir, '/', filename, ' 2>/dev/null | ', 'grep ', gene_id), header = FALSE, sep = '\t', data.table = FALSE)
header = data.table::fread(cmd = paste0('zcat < ', data_dir, '/', filename, ' 2>/dev/null | ', 'head -n 1'), header = FALSE, sep = '\t', data.table = FALSE)
colnames(line) = header[1, ]
return(line)
}
.quick_read = function(data_dir, filename) {
line = data.table::fread(cmd = paste0('zcat < ', data_dir, '/', filename), header = TRUE, sep = '\t', data.table = FALSE)
return(line)
}
.get_tss = function(s, e, st) {
o = s
if(!is.null(st)) {
o[st == '-'] = e[st == '-']
}
return(o)
}
.rearrage_count_row = function(rr, col_name) {
rr = data.frame(indiv = colnames(rr)[-1], value = as.numeric(rr[1, -1]))
colnames(rr)[2] = col_name
return(rr)
}
.rearrange_haplotype = function(hap, missing_rate_threshold, indiv_list) {
hap_m = t(hap[, -1])
colnames(hap_m) = hap[, 1]
high_missing_rate = rowMeans(is.na(hap_m)) > missing_rate_threshold | rowMeans(is.na(hap_m)) > missing_rate_threshold
hap_m = hap_m[!high_missing_rate, ]
hap_m = hap_m[rownames(hap_m) %in% indiv_list, ]
return(hap_m)
}
# gene meta information
message('Loading gene annotation')
annot_ = .load_one(data_dir, gene_annot, gene_id)
if(annot_$chromosome != 22) {
message('The gene is not on chromosome 22. Please use gene on chromosome 22 instead.')
quit()
}
tss_ = .get_tss(annot_$start, annot_$end, annot_$strand)
# variant meta information
message('Loading variant annotation')
var_ = .quick_read(data_dir, variant_annot)
cis_var_ = var_[ var_$position >= tss_ - cis_window & var_$position <= tss_ + cis_window, ]
# load genotype
message('Loading genotype')
hap1_ = .quick_read(data_dir, haplotype1)
hap2_ = .quick_read(data_dir, haplotype2)
hap1_ = hap1_[ hap1_$variant %in% cis_var_$variant, ]
hap2_ = hap2_[ hap2_$variant %in% cis_var_$variant, ]
# load library size
message('Loading library size')
libsize_ = .quick_read(data_dir, library_size)
# load total count
message('Loading total count')
trc_ = .load_one(data_dir, total_count, gene_id)
trc_ = .rearrage_count_row(trc_, 'trc')
# load allele-specific count
message('Loading allele-specific count')
asc1_ = .load_one(data_dir, allele_specific_count_hap1, gene_id)
asc2_ = .load_one(data_dir, allele_specific_count_hap2, gene_id)
asc1_ = .rearrage_count_row(asc1_, 'asc1')
asc2_ = .rearrage_count_row(asc2_, 'asc2')
# counts data.frame
message('Composing counts into data.frame')
dd = dplyr::inner_join(asc1_, asc2_, by = 'indiv')
dd = dplyr::inner_join(dd, trc_, by = 'indiv')
dd$indiv = as.character(dd$indiv)
dd = dplyr::inner_join(dd, libsize_[, 1:2], by = 'indiv')
# covariate
message('Loading covariate')
covariate_ = .quick_read(data_dir, covariate)
# individuals
indiv_ids = intersect(
colnames(covariate_)[-1],
dd$indiv
)
indiv_ids = intersect(
indiv_ids,
colnames(hap1_)[-1]
)
# rearrange genotype
message('Rearranging genotype')
hap1_mat = .rearrange_haplotype(hap1_, missing_rate_threshold = 0.5, indiv_list = indiv_ids)
hap2_mat = .rearrange_haplotype(hap2_, missing_rate_threshold = 0.5, indiv_list = indiv_ids)
maf = (colMeans(hap1_mat) + colMeans(hap2_mat)) / 2
hap1_mat = hap1_mat[, maf > 0.01]
hap2_mat = hap2_mat[, maf > 0.01]
indiv_ids = rownames(hap1_mat)
# rearrange covariate
message('Rearranging covariate')
covariate_ = covariate_[, c(1, match(indiv_ids, colnames(covariate_)[-1]) + 1)]
# rearrange count data.frame
message('Rearranging count data.frame')
dd = dd[match(indiv_ids, dd$indiv), ]
return(list(count_df = dd, haplotype1 = hap1_mat, haplotype2 = hap2_mat, covariate_df = covariate_))
}
# rm(list = ls())
source('https://gist.githubusercontent.com/liangyy/eae953bfd8af2bab89e18fa870bbabd7/raw/a4739882718301138a2e7def1de696e2bc38bad2/load_mixqtl_data_bundle.R')
data_bundle = load_mixqtl_data_bundle('~/Downloads/sample_data/')
library(dplyr)
head(data_bundle$count_df, 3)
head(data_bundle$haplotype1[, 1:3], 3)
head(data_bundle$haplotype2[, 1:3], 3)
head(data_bundle$covariate_df[, 1:3], 3)
# mixQTL
covariate_offset = mixqtl::regress_against_covariate(
trc = data_bundle$count_df$trc,
lib_size = data_bundle$count_df$lib_size,
covariates = data_bundle$covariate_df
)
head(covariate_offset, 3)
output_mixqtl = mixqtl::mixqtl(
geno1 = data_bundle$haplotype1,
geno2 = data_bundle$haplotype2,
y1 = data_bundle$count_df$asc1,
y2 = data_bundle$count_df$asc2,
ytotal = data_bundle$count_df$trc,
lib_size = data_bundle$count_df$lib_size,
cov_offset = covariate_offset
)
head(output_mixqtl, 3)
output_mixfine = mixqtl::mixfine(
geno1 = data_bundle$haplotype1,
geno2 = data_bundle$haplotype2,
y1 = data_bundle$count_df$asc1,
y2 = data_bundle$count_df$asc2,
ytotal = data_bundle$count_df$trc,
lib_size = data_bundle$count_df$lib_size,
cov_offset = covariate_offset
)
head(summary(output_mixfine)$vars, 3)
head(summary(output_mixfine)$cs, 3)
output_mixpred = mixqtl::mixpred(
geno1 = data_bundle$haplotype1,
geno2 = data_bundle$haplotype2,
y1 = data_bundle$count_df$asc1,
y2 = data_bundle$count_df$asc2,
ytotal = data_bundle$count_df$trc,
lib_size = data_bundle$count_df$lib_size,
cov_offset = covariate_offset
)
head(output_mixpred$beta, 3)
y_predict = (data_bundle$haplotype1 + data_bundle$haplotype2) %*% output_mixpred$beta[-1]
y_observe = data_bundle$count_df %>% mutate(y = log(trc / lib_size / 2)) %>% pull(y)
cor(
y_predict[!is.infinite(y_observe)],
y_observe[!is.infinite(y_observe)]
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment