Last active
December 5, 2017 12:58
-
-
Save DSuveges/c4d5d77aa2bdb3ed6b50cab33c1692a0 to your computer and use it in GitHub Desktop.
Analyzing burden test results
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Looking up p-values in the decasted table: | |
get_pv_cond(){ | |
trait="$1" | |
gene="$2" | |
fgrep -e ${trait}-${gene} -e tp /lustre/scratch115/realdata/mdt0/projects/t144_helic_15x/analysis/HP/burdentesting/alltraits/merged_missing/Pooled.dcast.txt | ~ag15/transpose.sh | sort -k2,2nr | column -t; | |
} | |
# Testing if two variables are in LD in our dataset | |
# 1) a bcftools view is piped into a plink command, as there is no plink files generated from the POMAK dataset so far. | |
# 2) the output of the bcftools has to be adjusted as in the vcf file there is no IDs for the variants required by plink. | |
# 3) plink output is filterd by perl to give the highest r2: | |
get_r2(){ | |
var1=$1 | |
var2=$2 | |
# Folder for the vcf files: | |
vcfDir=/lustre/scratch115/realdata/mdt0/projects/t144_helic_15x/analysis/HP/release/postrelease_missingnessfilter | |
# Get chromosome: | |
chr=$(echo $var1 | cut -d: -f1 | sed -e 's/chr//') | |
# Get r2 for the two variants: | |
r2=$(plink -vcf <(bcftools view -r ${var1},${var2} -Ov -o - ${vcfDir}/chr${chr}.missingfiltered-0.01_consequences.vcf.gz | awk 'BEGIN{IFS=OFS="\t"}{if($0 ~ "#"){print $0} else {$3 = sprintf("%s:%s", $1, $2); print $0}}') --ld $var1 $var2 | grep -i -e "R-sq" | perl -lane '($r2) = $_ =~ /R-sq = (\S+)/; push(@a, $r2) if $r2; END{ print join "/", @a}') | |
# Output: | |
echo -e "${var1}/${var2}\t${r2:-NA}" | |
} | |
# Get the lowest p-values for a given gene across all traits in the POMAK dataset (excluding linsight.): | |
get_HP_signif_gene(){ | |
HPSource=/lustre/scratch115/realdata/mdt0/projects/t144_helic_15x/analysis/HP/burdentesting/alltraits/merged_missing/Pooled.dcast.txt | |
gene=${1} | |
grep -e tp -e ${gene} ${HPSource} | perl -lane 'if ($. == 1){@b = @F; print "Trait-gene\tCondition\tpval"}else{$mi = 0; $mv = 0; for ($i = 1; $i <= scalar(@F); $i ++){next if $F[$i] == "NA"; if($F[$i] > $mv){ $mi = $i; $mv = $F[$i]}}; print join "\t", $F[0], $b[$mi], sprintf("%.2e", 10**(-1 * $mv))}' | sort -k3,3g | grep -v sight | |
} | |
# Get the lowest p-values for a given gene across all traits in the MANOLIS dataset (excluding linsight.): | |
get_HA_signif_gene(){ | |
HASource=/lustre/scratch115/realdata/mdt0/projects/t144_helic_15x/analysis/HA/burdentesting/arthur_rerun/dcasted_results_andoldtraits.header_fixed.txt | |
gene=${1} | |
grep -e tp -e ${gene} ${HASource} | perl -lane 'if ($. == 1){@b = @F; print "Trait-gene\tCondition\tpval"}else{$mi = 0; $mv = 0; for ($i = 1; $i <= scalar(@F); $i ++){next if $F[$i] == "NA"; if($F[$i] > $mv){ $mi = $i; $mv = $F[$i]}}; print join "\t", $F[0], $b[$mi], sprintf("%.2e", 10**(-1 * $mv))}' | sort -k3,3g | grep -v sight | |
} | |
# Compare the the most significant traits in HA and HP dataset: | |
gene=HNRNPH1 | |
cat <(echo "HP_trait_gene HP_con HP_pval HA_trait_gene HA_cond HA_pval") <(join <(grep -w -f <(cat <( get_HP_signif_gene ${gene} ) <(get_HA_signif_gene ${gene} ) | awk '$3 < 0.05 {print $1}' | sort -u | tr '[:lower:]' '[:upper:]') <(get_HP_signif_gene ${gene} | tr '[:lower:]' '[:upper:]') | sort) <(grep -w -f <(cat <( get_HP_signif_gene ${gene} ) <(get_HA_signif_gene ${gene} ) | awk '$3 < 0.05 {print $1}' | sort -u | tr '[:lower:]' '[:upper:]') <(get_HA_signif_gene ${gene} | tr '[:lower:]' '[:upper:]') | sort) -e "NA" -o '1.1,1.2,1.3,2.1,2.2,2.3' -a1 -a2 | sort -k3,3g) | column -t | |
# Extracting all consequences of a variant that belongs to a specific gene. | |
# It is important if multiple genes are overlapping. This quiery does not take the appris annotation into consideration | |
echo "FAM189B chr1:155247925 G/A" | sed -e 's/chr//; s/\:/ /; s/\// /' | while read gene chr pos ref alt; do wget -q --header='Content-type:application/json' --header='Accept:application/json' http://rest.ensembl.org/vep/human/region/${chr}:${pos}-${pos}:1/${alt} -O - | jq -r --arg gene "$gene" ' .[] | "\(.id) \(.colocated_variants[0].id) \([.transcript_consequences[] | select(.gene_symbol == $gene) | .consequence_terms[] ] | unique | join(","))"'; done |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment