Last active
April 13, 2020 19:30
-
-
Save liangyy/eae953bfd8af2bab89e18fa870bbabd7 to your computer and use it in GitHub Desktop.
Load GEUVADIS data into R (data url: https://zenodo.org/record/3749838#.XpS15FP0nJ8)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
load_mixqtl_data_bundle = function(data_dir = 'sample_data/', gene_id = 'ENSG00000198951.6', cis_window = 10000){ | |
if(!dir.exists(data_dir)) { | |
message('data_dir: ', data_dir, ' does not exist! Exit!') | |
quit() | |
} | |
haplotype1 = 'geuvadis_22_hap1.txt.gz' | |
haplotype2 = 'geuvadis_22_hap2.txt.gz' | |
total_count = 'expression_count.txt.gz' | |
allele_specific_count_hap1 = 'geuvadis.asc.h1.tsv.gz' | |
allele_specific_count_hap2 = 'geuvadis.asc.h2.tsv.gz' | |
covariate = 'geuvadis.covariate.txt.gz' | |
library_size = 'geuvadis.library_size.tsv.gz' | |
gene_annot = 'gencode.v12.txt.gz' | |
variant_annot = 'geuvadis_22_variant_annotation.txt.gz' | |
# gene_id = 'ENSG00000198951.6' | |
# cis_window = 10000 | |
.load_one = function(data_dir, filename, gene_name) { | |
line = data.table::fread(cmd = paste0('zcat < ', data_dir, '/', filename, ' 2>/dev/null | ', 'grep ', gene_id), header = FALSE, sep = '\t', data.table = FALSE) | |
header = data.table::fread(cmd = paste0('zcat < ', data_dir, '/', filename, ' 2>/dev/null | ', 'head -n 1'), header = FALSE, sep = '\t', data.table = FALSE) | |
colnames(line) = header[1, ] | |
return(line) | |
} | |
.quick_read = function(data_dir, filename) { | |
line = data.table::fread(cmd = paste0('zcat < ', data_dir, '/', filename), header = TRUE, sep = '\t', data.table = FALSE) | |
return(line) | |
} | |
.get_tss = function(s, e, st) { | |
o = s | |
if(!is.null(st)) { | |
o[st == '-'] = e[st == '-'] | |
} | |
return(o) | |
} | |
.rearrage_count_row = function(rr, col_name) { | |
rr = data.frame(indiv = colnames(rr)[-1], value = as.numeric(rr[1, -1])) | |
colnames(rr)[2] = col_name | |
return(rr) | |
} | |
.rearrange_haplotype = function(hap, missing_rate_threshold, indiv_list) { | |
hap_m = t(hap[, -1]) | |
colnames(hap_m) = hap[, 1] | |
high_missing_rate = rowMeans(is.na(hap_m)) > missing_rate_threshold | rowMeans(is.na(hap_m)) > missing_rate_threshold | |
hap_m = hap_m[!high_missing_rate, ] | |
hap_m = hap_m[rownames(hap_m) %in% indiv_list, ] | |
return(hap_m) | |
} | |
# gene meta information | |
message('Loading gene annotation') | |
annot_ = .load_one(data_dir, gene_annot, gene_id) | |
if(annot_$chromosome != 22) { | |
message('The gene is not on chromosome 22. Please use gene on chromosome 22 instead.') | |
quit() | |
} | |
tss_ = .get_tss(annot_$start, annot_$end, annot_$strand) | |
# variant meta information | |
message('Loading variant annotation') | |
var_ = .quick_read(data_dir, variant_annot) | |
cis_var_ = var_[ var_$position >= tss_ - cis_window & var_$position <= tss_ + cis_window, ] | |
# load genotype | |
message('Loading genotype') | |
hap1_ = .quick_read(data_dir, haplotype1) | |
hap2_ = .quick_read(data_dir, haplotype2) | |
hap1_ = hap1_[ hap1_$variant %in% cis_var_$variant, ] | |
hap2_ = hap2_[ hap2_$variant %in% cis_var_$variant, ] | |
# load library size | |
message('Loading library size') | |
libsize_ = .quick_read(data_dir, library_size) | |
# load total count | |
message('Loading total count') | |
trc_ = .load_one(data_dir, total_count, gene_id) | |
trc_ = .rearrage_count_row(trc_, 'trc') | |
# load allele-specific count | |
message('Loading allele-specific count') | |
asc1_ = .load_one(data_dir, allele_specific_count_hap1, gene_id) | |
asc2_ = .load_one(data_dir, allele_specific_count_hap2, gene_id) | |
asc1_ = .rearrage_count_row(asc1_, 'asc1') | |
asc2_ = .rearrage_count_row(asc2_, 'asc2') | |
# counts data.frame | |
message('Composing counts into data.frame') | |
dd = dplyr::inner_join(asc1_, asc2_, by = 'indiv') | |
dd = dplyr::inner_join(dd, trc_, by = 'indiv') | |
dd$indiv = as.character(dd$indiv) | |
dd = dplyr::inner_join(dd, libsize_[, 1:2], by = 'indiv') | |
# covariate | |
message('Loading covariate') | |
covariate_ = .quick_read(data_dir, covariate) | |
# individuals | |
indiv_ids = intersect( | |
colnames(covariate_)[-1], | |
dd$indiv | |
) | |
indiv_ids = intersect( | |
indiv_ids, | |
colnames(hap1_)[-1] | |
) | |
# rearrange genotype | |
message('Rearranging genotype') | |
hap1_mat = .rearrange_haplotype(hap1_, missing_rate_threshold = 0.5, indiv_list = indiv_ids) | |
hap2_mat = .rearrange_haplotype(hap2_, missing_rate_threshold = 0.5, indiv_list = indiv_ids) | |
maf = (colMeans(hap1_mat) + colMeans(hap2_mat)) / 2 | |
hap1_mat = hap1_mat[, maf > 0.01] | |
hap2_mat = hap2_mat[, maf > 0.01] | |
indiv_ids = rownames(hap1_mat) | |
# rearrange covariate | |
message('Rearranging covariate') | |
covariate_ = covariate_[, c(1, match(indiv_ids, colnames(covariate_)[-1]) + 1)] | |
# rearrange count data.frame | |
message('Rearranging count data.frame') | |
dd = dd[match(indiv_ids, dd$indiv), ] | |
return(list(count_df = dd, haplotype1 = hap1_mat, haplotype2 = hap2_mat, covariate_df = covariate_)) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# rm(list = ls()) | |
source('https://gist.githubusercontent.com/liangyy/eae953bfd8af2bab89e18fa870bbabd7/raw/a4739882718301138a2e7def1de696e2bc38bad2/load_mixqtl_data_bundle.R') | |
data_bundle = load_mixqtl_data_bundle('~/Downloads/sample_data/') | |
library(dplyr) | |
head(data_bundle$count_df, 3) | |
head(data_bundle$haplotype1[, 1:3], 3) | |
head(data_bundle$haplotype2[, 1:3], 3) | |
head(data_bundle$covariate_df[, 1:3], 3) | |
# mixQTL | |
covariate_offset = mixqtl::regress_against_covariate( | |
trc = data_bundle$count_df$trc, | |
lib_size = data_bundle$count_df$lib_size, | |
covariates = data_bundle$covariate_df | |
) | |
head(covariate_offset, 3) | |
output_mixqtl = mixqtl::mixqtl( | |
geno1 = data_bundle$haplotype1, | |
geno2 = data_bundle$haplotype2, | |
y1 = data_bundle$count_df$asc1, | |
y2 = data_bundle$count_df$asc2, | |
ytotal = data_bundle$count_df$trc, | |
lib_size = data_bundle$count_df$lib_size, | |
cov_offset = covariate_offset | |
) | |
head(output_mixqtl, 3) | |
output_mixfine = mixqtl::mixfine( | |
geno1 = data_bundle$haplotype1, | |
geno2 = data_bundle$haplotype2, | |
y1 = data_bundle$count_df$asc1, | |
y2 = data_bundle$count_df$asc2, | |
ytotal = data_bundle$count_df$trc, | |
lib_size = data_bundle$count_df$lib_size, | |
cov_offset = covariate_offset | |
) | |
head(summary(output_mixfine)$vars, 3) | |
head(summary(output_mixfine)$cs, 3) | |
output_mixpred = mixqtl::mixpred( | |
geno1 = data_bundle$haplotype1, | |
geno2 = data_bundle$haplotype2, | |
y1 = data_bundle$count_df$asc1, | |
y2 = data_bundle$count_df$asc2, | |
ytotal = data_bundle$count_df$trc, | |
lib_size = data_bundle$count_df$lib_size, | |
cov_offset = covariate_offset | |
) | |
head(output_mixpred$beta, 3) | |
y_predict = (data_bundle$haplotype1 + data_bundle$haplotype2) %*% output_mixpred$beta[-1] | |
y_observe = data_bundle$count_df %>% mutate(y = log(trc / lib_size / 2)) %>% pull(y) | |
cor( | |
y_predict[!is.infinite(y_observe)], | |
y_observe[!is.infinite(y_observe)] | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment