Skip to content

Instantly share code, notes, and snippets.

@DSuveges
Last active December 5, 2017 12:58
Show Gist options
  • Save DSuveges/c4d5d77aa2bdb3ed6b50cab33c1692a0 to your computer and use it in GitHub Desktop.
Save DSuveges/c4d5d77aa2bdb3ed6b50cab33c1692a0 to your computer and use it in GitHub Desktop.
Analyzing burden test results
# Looking up p-values in the decasted table:
get_pv_cond(){
trait="$1"
gene="$2"
fgrep -e ${trait}-${gene} -e tp /lustre/scratch115/realdata/mdt0/projects/t144_helic_15x/analysis/HP/burdentesting/alltraits/merged_missing/Pooled.dcast.txt | ~ag15/transpose.sh | sort -k2,2nr | column -t;
}
# Testing if two variables are in LD in our dataset
# 1) a bcftools view is piped into a plink command, as there is no plink files generated from the POMAK dataset so far.
# 2) the output of the bcftools has to be adjusted as in the vcf file there is no IDs for the variants required by plink.
# 3) plink output is filterd by perl to give the highest r2:
get_r2(){
var1=$1
var2=$2
# Folder for the vcf files:
vcfDir=/lustre/scratch115/realdata/mdt0/projects/t144_helic_15x/analysis/HP/release/postrelease_missingnessfilter
# Get chromosome:
chr=$(echo $var1 | cut -d: -f1 | sed -e 's/chr//')
# Get r2 for the two variants:
r2=$(plink -vcf <(bcftools view -r ${var1},${var2} -Ov -o - ${vcfDir}/chr${chr}.missingfiltered-0.01_consequences.vcf.gz | awk 'BEGIN{IFS=OFS="\t"}{if($0 ~ "#"){print $0} else {$3 = sprintf("%s:%s", $1, $2); print $0}}') --ld $var1 $var2 | grep -i -e "R-sq" | perl -lane '($r2) = $_ =~ /R-sq = (\S+)/; push(@a, $r2) if $r2; END{ print join "/", @a}')
# Output:
echo -e "${var1}/${var2}\t${r2:-NA}"
}
# Get the lowest p-values for a given gene across all traits in the POMAK dataset (excluding linsight.):
get_HP_signif_gene(){
HPSource=/lustre/scratch115/realdata/mdt0/projects/t144_helic_15x/analysis/HP/burdentesting/alltraits/merged_missing/Pooled.dcast.txt
gene=${1}
grep -e tp -e ${gene} ${HPSource} | perl -lane 'if ($. == 1){@b = @F; print "Trait-gene\tCondition\tpval"}else{$mi = 0; $mv = 0; for ($i = 1; $i <= scalar(@F); $i ++){next if $F[$i] == "NA"; if($F[$i] > $mv){ $mi = $i; $mv = $F[$i]}}; print join "\t", $F[0], $b[$mi], sprintf("%.2e", 10**(-1 * $mv))}' | sort -k3,3g | grep -v sight
}
# Get the lowest p-values for a given gene across all traits in the MANOLIS dataset (excluding linsight.):
get_HA_signif_gene(){
HASource=/lustre/scratch115/realdata/mdt0/projects/t144_helic_15x/analysis/HA/burdentesting/arthur_rerun/dcasted_results_andoldtraits.header_fixed.txt
gene=${1}
grep -e tp -e ${gene} ${HASource} | perl -lane 'if ($. == 1){@b = @F; print "Trait-gene\tCondition\tpval"}else{$mi = 0; $mv = 0; for ($i = 1; $i <= scalar(@F); $i ++){next if $F[$i] == "NA"; if($F[$i] > $mv){ $mi = $i; $mv = $F[$i]}}; print join "\t", $F[0], $b[$mi], sprintf("%.2e", 10**(-1 * $mv))}' | sort -k3,3g | grep -v sight
}
# Compare the the most significant traits in HA and HP dataset:
gene=HNRNPH1
cat <(echo "HP_trait_gene HP_con HP_pval HA_trait_gene HA_cond HA_pval") <(join <(grep -w -f <(cat <( get_HP_signif_gene ${gene} ) <(get_HA_signif_gene ${gene} ) | awk '$3 < 0.05 {print $1}' | sort -u | tr '[:lower:]' '[:upper:]') <(get_HP_signif_gene ${gene} | tr '[:lower:]' '[:upper:]') | sort) <(grep -w -f <(cat <( get_HP_signif_gene ${gene} ) <(get_HA_signif_gene ${gene} ) | awk '$3 < 0.05 {print $1}' | sort -u | tr '[:lower:]' '[:upper:]') <(get_HA_signif_gene ${gene} | tr '[:lower:]' '[:upper:]') | sort) -e "NA" -o '1.1,1.2,1.3,2.1,2.2,2.3' -a1 -a2 | sort -k3,3g) | column -t
# Extracting all consequences of a variant that belongs to a specific gene.
# It is important if multiple genes are overlapping. This quiery does not take the appris annotation into consideration
echo "FAM189B chr1:155247925 G/A" | sed -e 's/chr//; s/\:/ /; s/\// /' | while read gene chr pos ref alt; do wget -q --header='Content-type:application/json' --header='Accept:application/json' http://rest.ensembl.org/vep/human/region/${chr}:${pos}-${pos}:1/${alt} -O - | jq -r --arg gene "$gene" ' .[] | "\(.id) \(.colocated_variants[0].id) \([.transcript_consequences[] | select(.gene_symbol == $gene) | .consequence_terms[] ] | unique | join(","))"'; done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment