Adam X. Maihofer nievergeltlab

## missing_a2
zcat /mnt/ukbb/adam/ptsd/ehr_reformat/74_dai2/daner_iPSYCH2015_PTSDbroad_chr1to22_HRC_MAF01.gz | awk '{print $2,$4,$5,$7}' | LC_ALL=C sort -k1b,1 > /mnt/ukbb/adam/ptsd/ehr_reformat/74_dai2/daner_iPSYCH2015_PTSDbroad_chr1to22_HRC_MAF01.afs


LC_ALL=C join -1 1 -2 2 <(cat /mnt/ukbb/adam/ptsd/ehr_reformat/74_dai2/daner_iPSYCH2015_PTSDbroad_chr1to22_HRC_MAF01.afs) <(LC_ALL=C sort -k2b,2 95_biom/PTSD_broad_EA_clean.head.assoc.logistic) > 95_biom/PTSD_broad_EA_clean.head.assoc.logistic.merged


awk '{FRQ=$4; A1=$7; if(A1==$2) { A2=$3}  else if(A1 != $2){ A2=$2; FRQ=1-$4}; print $0,A2,FRQ}'   95_biom/PTSD_broad_EA_clean.head.assoc.logistic.merged |  grep -v -E "A T$|T A$|C G$|G C$" > 95_biom/PTSD_broad_EA_clean.head.assoc.logistic.merged.gwas1


## contrastrg.r
library(data.table)

d1 <- fread('rgdifferences.csv',data.table=F)

#772 traits, so 0.05/772 = 6.476684e-05
#I like it better with all correlatios shown. I';ll grey out the NS ones!
d1$color <- "grey"
d1[which(d1$p <= 6.476684e-05 | d1$p_trauma <= 6.476684e-05),]$color <- "black" #significant in at least one


## annotate.sh
#LC_ALL=C join  <(awk '{print $1}' w_hm3.snplist.sorted)  <(awk '{print $22,$1}' eur_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.txt.annotated | LC_ALL=C sort -k1b,1) | awk '{print $2,$1}' | LC_ALL=C sort -k1b,1 > w_hm3.snplist.linked

for anc in his # eur aam asn his #  "$anc"
do

#important to use ref and alt1 for this!
cat vcf_header1b.txt vcf_header2.txt <(awk '{OFS="\t"}{if (NR>1) print $1,$2, ".", $4,$5 , "100", "PASS", "MVP="$3}' "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa |  sort    -n -k1r,1 -k2,2  ) > "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf

bcftools view  "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf -Oz -o "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf.gz

## mvp annotate
#LC_ALL=C join  <(awk '{print $1}' w_hm3.snplist.sorted)  <(awk '{print $22,$1}' eur_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.txt.annotated | LC_ALL=C sort -k1b,1) | awk '{print $2,$1}' | LC_ALL=C sort -k1b,1 > w_hm3.snplist.linked

for anc in eur aam asn his #  "$anc"
do

#important to use ref and alt1 for this!
cat vcf_header1b.txt vcf_header2.txt <(awk '{OFS="\t"}{if (NR>1) print $1,$2, ".", $4,$5 , "100", "PASS", "MVP="$3}' "$anc"_broad_mar172020_allchr.hearing.maf01.resultsa |  sort    -n -k1r,1 -k2,2  ) > "$anc"_broad_mar172020_allchr.hearing.maf01.resultsa.vcf

bcftools view  "$anc"_broad_mar172020_allchr.hearing.maf01.resultsa.vcf -Oz -o "$anc"_broad_mar172020_allchr.hearing.maf01.resultsa.vcf.gz

## lanc.r
#Do a kcolumn split of local ancestry
#Do a delimiting of | then kcolumn split of haplotypes vcf

 zcat gtpc_phased_chr22.vcf.gz | tail -n+11 | sed 's/|/\t/g' > gtpc_phased_chr22.vcf.haps

 zcat gtpc_phased_chr22.vcf.gz  | head -n10 | tail -n1   | sed 's/#//g' | awk '{$1=$1; print}'  > gtpc_phased_chr22.vcf.haps.header

#Notice that the odd/even selection REMOVES the columns, rather than selecting them, hence the file names are haps1 and haps0
awk '{ for (i=10;i<=NF;i+=2)  $i="" } 1' gtpc_phased_chr22.vcf.haps | cat gtpc_phased_chr22.vcf.haps.header - > gtpc_phased_chr22.vcf.haps1
awk '{ for (i=11;i<=NF;i+=2)  $i="" } 1' gtpc_phased_chr22.vcf.haps | cat gtpc_phased_chr22.vcf.haps.header - > gtpc_phased_chr22.vcf.haps0

## dosage_merge.sh
#!/bin/bash


#THe first step may be to just filter down the list of subjects.

#cat  betr/qc1/dos_pts_betr_mix_am-qc.hg19.ch.fl.chr10_009_012.out.dosage.fam vets/qc1/dos_pts_vets_mix_am-qc.hg19.ch.fl.chr10_009_012.out.dosage.fam > test.fam


## combine_sexes_x_chromsome_2.txt
#!/bin/bash

#Big note:
#The plink --fam d.fam --dosage c.txt list sepheader Zin --write-dosage  command doesnt seem to work to combine male and female files
#Doing this will make each SNP appear to be a different SNP for the male and female files (in other words, N SNPs doubles, and will be ALL NA for
#either men or women
#Instead do the traditional command saying that the subjects are split but not hte SNPs, but make a temporary file where the SNPs are subset to onyl the
#intersecting set!

#Instead

## find_ams.sh
### Find ancestry informative markers, using 1000 Genomes reference data as an example

#Use global ancestry as a phenotype for association analysis in unrelated KGP subjects
#Perform LD based clumping so that in each LD block, the strongest AIMs are selected
#Making sure that no indels or ambiguous markers are used
#Liftover markers (if you care)

#Get subject populations

#Extract only AFR and EA samples from the 1000G VCF data, on non monomorphic markers

## rename_dups.r
#Rename duplicate rs-ids in a .bim file
#Sometimes you get .bim files where rs-ids are not unique. This renames the duplicates. Preference to not rename is given in order of first marker listed.
#V1 - Oct 28, 2016
args <- commandArgs(trailingOnly = TRUE)
qced_bim <- args[1]


print("Reading .bim file")
 dat1 <- read.table(qced_bim,stringsAsFactors=F,header=F) #Load the QCed SNP list
 names(dat1) <- c("CHR","SNP","cM","BP","A1","A2")

## derive_virtual.sh

study=prom

for chr in {1..22}
do
plink2 --bfile pts_PROM_mix_am-qc --chr $chr --make-bed --out data/"$study"_$chr
done
	zcat /mnt/ukbb/adam/ptsd/ehr_reformat/74_dai2/daner_iPSYCH2015_PTSDbroad_chr1to22_HRC_MAF01.gz \| awk '{print $2,$4,$5,$7}' \| LC_ALL=C sort -k1b,1 > /mnt/ukbb/adam/ptsd/ehr_reformat/74_dai2/daner_iPSYCH2015_PTSDbroad_chr1to22_HRC_MAF01.afs


	LC_ALL=C join -1 1 -2 2 <(cat /mnt/ukbb/adam/ptsd/ehr_reformat/74_dai2/daner_iPSYCH2015_PTSDbroad_chr1to22_HRC_MAF01.afs) <(LC_ALL=C sort -k2b,2 95_biom/PTSD_broad_EA_clean.head.assoc.logistic) > 95_biom/PTSD_broad_EA_clean.head.assoc.logistic.merged


	awk '{FRQ=$4; A1=$7; if(A1==$2) { A2=$3} else if(A1 != $2){ A2=$2; FRQ=1-$4}; print $0,A2,FRQ}' 95_biom/PTSD_broad_EA_clean.head.assoc.logistic.merged \| grep -v -E "A T$\|T A$\|C G$\|G C$" > 95_biom/PTSD_broad_EA_clean.head.assoc.logistic.merged.gwas1
	library(data.table)

	d1 <- fread('rgdifferences.csv',data.table=F)

	#772 traits, so 0.05/772 = 6.476684e-05
	#I like it better with all correlatios shown. I';ll grey out the NS ones!
	d1$color <- "grey"
	d1[which(d1$p <= 6.476684e-05 \| d1$p_trauma <= 6.476684e-05),]$color <- "black" #significant in at least one
	#LC_ALL=C join <(awk '{print $1}' w_hm3.snplist.sorted) <(awk '{print $22,$1}' eur_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.txt.annotated \| LC_ALL=C sort -k1b,1) \| awk '{print $2,$1}' \| LC_ALL=C sort -k1b,1 > w_hm3.snplist.linked

	for anc in his # eur aam asn his # "$anc"
	do

	#important to use ref and alt1 for this!
	cat vcf_header1b.txt vcf_header2.txt <(awk '{OFS="\t"}{if (NR>1) print $1,$2, ".", $4,$5 , "100", "PASS", "MVP="$3}' "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa \| sort -n -k1r,1 -k2,2 ) > "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf

	bcftools view "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf -Oz -o "$anc"_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.vcf.gz
	#LC_ALL=C join <(awk '{print $1}' w_hm3.snplist.sorted) <(awk '{print $22,$1}' eur_broad_mar172020_allchr.broad_tinnitus.maf01.resultsa.txt.annotated \| LC_ALL=C sort -k1b,1) \| awk '{print $2,$1}' \| LC_ALL=C sort -k1b,1 > w_hm3.snplist.linked

	for anc in eur aam asn his # "$anc"
	do

	#important to use ref and alt1 for this!
	cat vcf_header1b.txt vcf_header2.txt <(awk '{OFS="\t"}{if (NR>1) print $1,$2, ".", $4,$5 , "100", "PASS", "MVP="$3}' "$anc"_broad_mar172020_allchr.hearing.maf01.resultsa \| sort -n -k1r,1 -k2,2 ) > "$anc"_broad_mar172020_allchr.hearing.maf01.resultsa.vcf

	bcftools view "$anc"_broad_mar172020_allchr.hearing.maf01.resultsa.vcf -Oz -o "$anc"_broad_mar172020_allchr.hearing.maf01.resultsa.vcf.gz
	#Do a kcolumn split of local ancestry
	#Do a delimiting of \| then kcolumn split of haplotypes vcf

	zcat gtpc_phased_chr22.vcf.gz \| tail -n+11 \| sed 's/\|/\t/g' > gtpc_phased_chr22.vcf.haps

	zcat gtpc_phased_chr22.vcf.gz \| head -n10 \| tail -n1 \| sed 's/#//g' \| awk '{$1=$1; print}' > gtpc_phased_chr22.vcf.haps.header

	#Notice that the odd/even selection REMOVES the columns, rather than selecting them, hence the file names are haps1 and haps0
	awk '{ for (i=10;i<=NF;i+=2) $i="" } 1' gtpc_phased_chr22.vcf.haps \| cat gtpc_phased_chr22.vcf.haps.header - > gtpc_phased_chr22.vcf.haps1
	awk '{ for (i=11;i<=NF;i+=2) $i="" } 1' gtpc_phased_chr22.vcf.haps \| cat gtpc_phased_chr22.vcf.haps.header - > gtpc_phased_chr22.vcf.haps0
	#!/bin/bash




	#THe first step may be to just filter down the list of subjects.

	#cat betr/qc1/dos_pts_betr_mix_am-qc.hg19.ch.fl.chr10_009_012.out.dosage.fam vets/qc1/dos_pts_vets_mix_am-qc.hg19.ch.fl.chr10_009_012.out.dosage.fam > test.fam
	#!/bin/bash

	#Big note:
	#The plink --fam d.fam --dosage c.txt list sepheader Zin --write-dosage command doesnt seem to work to combine male and female files
	#Doing this will make each SNP appear to be a different SNP for the male and female files (in other words, N SNPs doubles, and will be ALL NA for
	#either men or women
	#Instead do the traditional command saying that the subjects are split but not hte SNPs, but make a temporary file where the SNPs are subset to onyl the
	#intersecting set!

	#Instead
	### Find ancestry informative markers, using 1000 Genomes reference data as an example

	#Use global ancestry as a phenotype for association analysis in unrelated KGP subjects
	#Perform LD based clumping so that in each LD block, the strongest AIMs are selected
	#Making sure that no indels or ambiguous markers are used
	#Liftover markers (if you care)

	#Get subject populations

	#Extract only AFR and EA samples from the 1000G VCF data, on non monomorphic markers
	#Rename duplicate rs-ids in a .bim file
	#Sometimes you get .bim files where rs-ids are not unique. This renames the duplicates. Preference to not rename is given in order of first marker listed.
	#V1 - Oct 28, 2016
	args <- commandArgs(trailingOnly = TRUE)
	qced_bim <- args[1]


	print("Reading .bim file")
	dat1 <- read.table(qced_bim,stringsAsFactors=F,header=F) #Load the QCed SNP list
	names(dat1) <- c("CHR","SNP","cM","BP","A1","A2")

	study=prom

	for chr in {1..22}
	do
	plink2 --bfile pts_PROM_mix_am-qc --chr $chr --make-bed --out data/"$study"_$chr
	done