janxkoci/fasta2tsv.r

## fasta2tsv.r
#!/usr/bin/env Rscript

# library(seqinr)

## ARGS
args <- commandArgs(trailingOnly=TRUE)
if (length(args) != 2) stop("Two input files must be supplied.", call.=FALSE)

## INPUT
fastafile <- grep(".fa", args, val=T) # args[1] # "mez1_chr21-22.fa"
snpfile <- grep(".tsv", args, val=T) # args[2] # "hgdp_chr21-22_pos.tsv"

if (length(fastafile) != 1) stop("Check your fastafile! [.fa|.fasta]", call.=FALSE)
if (length(snpfile) != 1) stop("Check your snpfile! [.tsv]", call.=FALSE)

fastaname <- tools::file_path_sans_ext(fastafile) # basename(fastafile)
# snpname <- tools::file_path_sans_ext(snpfile)

if(grepl(".gz", fastafile)) {
	fastaname <- tools::file_path_sans_ext(fastafile, compression = T) # basename(fastafile)
	fastafile <- gzfile(fastafile)
}
if(grepl(".gz", snpfile)) {
	snpfile <- gzfile(snpfile)
	# snpname <- tools::file_path_sans_ext(snpfile, compression = T)
}

## READ DATA
fas <- seqinr::read.fasta(fastafile, forceDNAtolower = F)
sites <- read.table(snpfile, sep="\t")

sites[,3] <- NA
colnames(sites) <- c("CHROM", "POS", basename(fastaname))

## LOOP
chroms <- unique(sites[,1])

for(chr in chroms) {
    chrs <- as.character(chr)
    pos <- sites[sites[,1]==chrs, 2]

    sites[which(sites[,1]==chr),3] <- fas[[chrs]][pos]
}

## MASK lowercase BASES
# sites[which(grepl("[acgtn-]", sites[,3])),] <- "N" # TEST THIS

## alternatively mask later with awk
## awk '$3~/[acgtn-]/ {$3="N"}1'

## WRITE OUTPUT
# write.csv(sites, paste0(fastaname, ".csv"))
write.table(sites, paste0(fastaname, ".tsv"), quote=F, sep="\t", row.names=F)

## tsv2vcf.1sample.awk
#!/usr/bin/awk -f

## PROCESS LowCov ARCHAICS
## reads file with 4 columns:
## chrom pos hg19.REF archaic.GT

BEGIN {
OFS="\t"

## VCF HEADER
print "##fileformat=VCFv4.2"
print "##source=tsv2vcf.1sample.awk"
print "##INFO=<ID=HG,Number=0,Type=Flag,Description=\"REF allele based on hg19 genome assembly. Repeats are lowercase.\">"
print "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"
# TODO "getline FILENAME" to extract contig names
# actually bcftools merge will do it for you, so don't bother
}

## VCF COLUMNS
NR==1 {print "#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT",$4}

## MASK LQ bases with "."
NR>1 {sub(/[acgtn-]/,".",$4)}

## CONVERT GTs to VCF style format
NR>1 {
	if($4==".")
		gt="./."
	else if($4==toupper($3))
		{gt="0/0"; $4="."}
	else
		gt="1/1"
}

## PRINT VCF-like format
# CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
NR>1 {print $1,$2, ".", $3,$4,".",".","HG","GT",gt}

## tsv2vcf.2apes.awk
#!/usr/bin/awk -f

## PROCESS apes
## reads file with 5 columns:
## chrom pos hg19.REF chimp.GT gorilla.GT
BEGIN {
	OFS = "\t"

	## VCF HEADER
	print "##fileformat=VCFv4.2"
	print "##source=tsv2vcf.2apes.awk"
	print "##INFO=<ID=HG,Number=0,Type=Flag,Description=\"REF allele based on hg19 genome assembly. Repeats are lowercase.\">"
	print "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"
	# TODO "getline FILENAME" to extract contig names
	# actually bcftools merge will do it for you, so don't bother
}

## VCF COLUMNS
NR == 1 {
	print "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", $4, $5
	next
}

## MASK MISSING bases with "."
NR > 1 {
	for (i = 4; i <= NF; i++) {
		sub(/[n-]/, ".", $i)
	}
}

## MASK LQ bases with "."
## but only at mismatch sites
toupper($4) != toupper($5) {
	for (i = 4; i <= NF; i++) {
		sub(/[acgt]/, ".", $i)
	}
}

## CONVERT GTs to VCF style format
## matching GTs
toupper($4) == toupper($5) {
	gts = toupper($4$5)
	if (gts ~ /\./) {
		chimp = "./."
		gorilla = "./."
		alt = "."
	} else if (gts ~ toupper($3)) {
		chimp = "0/0"
		gorilla = "0/0"
		alt = "."
	} else {
		chimp = "1/1"
		gorilla = "1/1"
		alt = toupper($4)
	}
}

## non-matching GTs
toupper($4) != toupper($5) {
	gts = toupper($4$5)
	if (gts ~ /\./ && gts ~ toupper($3)) {
		for (i = 4; i <= 5; i++) {
			sub(/\./, "./.", $i)
			sub(/[a-zA-Z]/, "0/0", $i)
		}
		chimp = $4
		gorilla = $5
		alt = "."
	} else if (gts ~ /\./ && gts !~ toupper($3)) {
		for (i = 4; i <= 5; i++) {
			sub(/\./, "./.", $i)
			sub(/[a-zA-Z]/, "1/1", $i)
		}
		chimp = $4
		gorilla = $5
		alt = gensub(/\./, "", 1, gts)
	} else if (gts ~ toupper($3)) {
		for (i = 4; i <= 5; i++) {
			sub(toupper($3), "0/0", $i)
			alt = gensub(toupper($3), "", 1, gts)
			sub(alt, "1/1", $i)
		}
		chimp = $4
		gorilla = $5
		#alt = gensub(toupper($3), "", 1, gts)
	} else {
		chimp = "1/1"
		gorilla = "2/2"
		alt = $4","$5
	}
}

## PRINT VCF-like format
# CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
NR > 1 {
	print $1, $2, ".", $3, alt, ".", ".", "HG", "GT", chimp, gorilla
}
	#!/usr/bin/env Rscript

	# library(seqinr)

	## ARGS
	args <- commandArgs(trailingOnly=TRUE)
	if (length(args) != 2) stop("Two input files must be supplied.", call.=FALSE)

	## INPUT
	fastafile <- grep(".fa", args, val=T) # args[1] # "mez1_chr21-22.fa"
	snpfile <- grep(".tsv", args, val=T) # args[2] # "hgdp_chr21-22_pos.tsv"

	if (length(fastafile) != 1) stop("Check your fastafile! [.fa\|.fasta]", call.=FALSE)
	if (length(snpfile) != 1) stop("Check your snpfile! [.tsv]", call.=FALSE)

	fastaname <- tools::file_path_sans_ext(fastafile) # basename(fastafile)
	# snpname <- tools::file_path_sans_ext(snpfile)

	if(grepl(".gz", fastafile)) {
	fastaname <- tools::file_path_sans_ext(fastafile, compression = T) # basename(fastafile)
	fastafile <- gzfile(fastafile)
	}
	if(grepl(".gz", snpfile)) {
	snpfile <- gzfile(snpfile)
	# snpname <- tools::file_path_sans_ext(snpfile, compression = T)
	}

	## READ DATA
	fas <- seqinr::read.fasta(fastafile, forceDNAtolower = F)
	sites <- read.table(snpfile, sep="\t")

	sites[,3] <- NA
	colnames(sites) <- c("CHROM", "POS", basename(fastaname))

	## LOOP
	chroms <- unique(sites[,1])

	for(chr in chroms) {
	chrs <- as.character(chr)
	pos <- sites[sites[,1]==chrs, 2]

	sites[which(sites[,1]==chr),3] <- fas[[chrs]][pos]
	}

	## MASK lowercase BASES
	# sites[which(grepl("[acgtn-]", sites[,3])),] <- "N" # TEST THIS

	## alternatively mask later with awk
	## awk '$3~/[acgtn-]/ {$3="N"}1'

	## WRITE OUTPUT
	# write.csv(sites, paste0(fastaname, ".csv"))
	write.table(sites, paste0(fastaname, ".tsv"), quote=F, sep="\t", row.names=F)
	#!/usr/bin/awk -f

	## PROCESS LowCov ARCHAICS
	## reads file with 4 columns:
	## chrom pos hg19.REF archaic.GT

	BEGIN {
	OFS="\t"

	## VCF HEADER
	print "##fileformat=VCFv4.2"
	print "##source=tsv2vcf.1sample.awk"
	print "##INFO=<ID=HG,Number=0,Type=Flag,Description=\"REF allele based on hg19 genome assembly. Repeats are lowercase.\">"
	print "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"
	# TODO "getline FILENAME" to extract contig names
	# actually bcftools merge will do it for you, so don't bother
	}

	## VCF COLUMNS
	NR==1 {print "#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT",$4}

	## MASK LQ bases with "."
	NR>1 {sub(/[acgtn-]/,".",$4)}

	## CONVERT GTs to VCF style format
	NR>1 {
	if($4==".")
	gt="./."
	else if($4==toupper($3))
	{gt="0/0"; $4="."}
	else
	gt="1/1"
	}

	## PRINT VCF-like format
	# CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
	NR>1 {print $1,$2, ".", $3,$4,".",".","HG","GT",gt}
	#!/usr/bin/awk -f

	## PROCESS apes
	## reads file with 5 columns:
	## chrom pos hg19.REF chimp.GT gorilla.GT
	BEGIN {
	OFS = "\t"

	## VCF HEADER
	print "##fileformat=VCFv4.2"
	print "##source=tsv2vcf.2apes.awk"
	print "##INFO=<ID=HG,Number=0,Type=Flag,Description=\"REF allele based on hg19 genome assembly. Repeats are lowercase.\">"
	print "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"
	# TODO "getline FILENAME" to extract contig names
	# actually bcftools merge will do it for you, so don't bother
	}

	## VCF COLUMNS
	NR == 1 {
	print "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", $4, $5
	next
	}

	## MASK MISSING bases with "."
	NR > 1 {
	for (i = 4; i <= NF; i++) {
	sub(/[n-]/, ".", $i)
	}
	}

	## MASK LQ bases with "."
	## but only at mismatch sites
	toupper($4) != toupper($5) {
	for (i = 4; i <= NF; i++) {
	sub(/[acgt]/, ".", $i)
	}
	}

	## CONVERT GTs to VCF style format
	## matching GTs
	toupper($4) == toupper($5) {
	gts = toupper($4$5)
	if (gts ~ /\./) {
	chimp = "./."
	gorilla = "./."
	alt = "."
	} else if (gts ~ toupper($3)) {
	chimp = "0/0"
	gorilla = "0/0"
	alt = "."
	} else {
	chimp = "1/1"
	gorilla = "1/1"
	alt = toupper($4)
	}
	}

	## non-matching GTs
	toupper($4) != toupper($5) {
	gts = toupper($4$5)
	if (gts ~ /\./ && gts ~ toupper($3)) {
	for (i = 4; i <= 5; i++) {
	sub(/\./, "./.", $i)
	sub(/[a-zA-Z]/, "0/0", $i)
	}
	chimp = $4
	gorilla = $5
	alt = "."
	} else if (gts ~ /\./ && gts !~ toupper($3)) {
	for (i = 4; i <= 5; i++) {
	sub(/\./, "./.", $i)
	sub(/[a-zA-Z]/, "1/1", $i)
	}
	chimp = $4
	gorilla = $5
	alt = gensub(/\./, "", 1, gts)
	} else if (gts ~ toupper($3)) {
	for (i = 4; i <= 5; i++) {
	sub(toupper($3), "0/0", $i)
	alt = gensub(toupper($3), "", 1, gts)
	sub(alt, "1/1", $i)
	}
	chimp = $4
	gorilla = $5
	#alt = gensub(toupper($3), "", 1, gts)
	} else {
	chimp = "1/1"
	gorilla = "2/2"
	alt = $4","$5
	}
	}

	## PRINT VCF-like format
	# CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
	NR > 1 {
	print $1, $2, ".", $3, alt, ".", ".", "HG", "GT", chimp, gorilla
	}