Skip to content

Instantly share code, notes, and snippets.

@janxkoci
Last active August 9, 2022 11:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save janxkoci/eba4583d5739eb5426e14401d0026b04 to your computer and use it in GitHub Desktop.
Save janxkoci/eba4583d5739eb5426e14401d0026b04 to your computer and use it in GitHub Desktop.
Hetfa tools - convert hetfa (fasta) to tsv and to vcf.
#!/usr/bin/env Rscript
# library(seqinr)
## ARGS
args <- commandArgs(trailingOnly=TRUE)
if (length(args) != 2) stop("Two input files must be supplied.", call.=FALSE)
## INPUT
fastafile <- grep(".fa", args, val=T) # args[1] # "mez1_chr21-22.fa"
snpfile <- grep(".tsv", args, val=T) # args[2] # "hgdp_chr21-22_pos.tsv"
if (length(fastafile) != 1) stop("Check your fastafile! [.fa|.fasta]", call.=FALSE)
if (length(snpfile) != 1) stop("Check your snpfile! [.tsv]", call.=FALSE)
fastaname <- tools::file_path_sans_ext(fastafile) # basename(fastafile)
# snpname <- tools::file_path_sans_ext(snpfile)
if(grepl(".gz", fastafile)) {
fastaname <- tools::file_path_sans_ext(fastafile, compression = T) # basename(fastafile)
fastafile <- gzfile(fastafile)
}
if(grepl(".gz", snpfile)) {
snpfile <- gzfile(snpfile)
# snpname <- tools::file_path_sans_ext(snpfile, compression = T)
}
## READ DATA
fas <- seqinr::read.fasta(fastafile, forceDNAtolower = F)
sites <- read.table(snpfile, sep="\t")
sites[,3] <- NA
colnames(sites) <- c("CHROM", "POS", basename(fastaname))
## LOOP
chroms <- unique(sites[,1])
for(chr in chroms) {
chrs <- as.character(chr)
pos <- sites[sites[,1]==chrs, 2]
sites[which(sites[,1]==chr),3] <- fas[[chrs]][pos]
}
## MASK lowercase BASES
# sites[which(grepl("[acgtn-]", sites[,3])),] <- "N" # TEST THIS
## alternatively mask later with awk
## awk '$3~/[acgtn-]/ {$3="N"}1'
## WRITE OUTPUT
# write.csv(sites, paste0(fastaname, ".csv"))
write.table(sites, paste0(fastaname, ".tsv"), quote=F, sep="\t", row.names=F)
#!/usr/bin/awk -f
## PROCESS LowCov ARCHAICS
## reads file with 4 columns:
## chrom pos hg19.REF archaic.GT
BEGIN {
OFS="\t"
## VCF HEADER
print "##fileformat=VCFv4.2"
print "##source=tsv2vcf.1sample.awk"
print "##INFO=<ID=HG,Number=0,Type=Flag,Description=\"REF allele based on hg19 genome assembly. Repeats are lowercase.\">"
print "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"
# TODO "getline FILENAME" to extract contig names
# actually bcftools merge will do it for you, so don't bother
}
## VCF COLUMNS
NR==1 {print "#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT",$4}
## MASK LQ bases with "."
NR>1 {sub(/[acgtn-]/,".",$4)}
## CONVERT GTs to VCF style format
NR>1 {
if($4==".")
gt="./."
else if($4==toupper($3))
{gt="0/0"; $4="."}
else
gt="1/1"
}
## PRINT VCF-like format
# CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
NR>1 {print $1,$2, ".", $3,$4,".",".","HG","GT",gt}
#!/usr/bin/awk -f
## PROCESS apes
## reads file with 5 columns:
## chrom pos hg19.REF chimp.GT gorilla.GT
BEGIN {
OFS = "\t"
## VCF HEADER
print "##fileformat=VCFv4.2"
print "##source=tsv2vcf.2apes.awk"
print "##INFO=<ID=HG,Number=0,Type=Flag,Description=\"REF allele based on hg19 genome assembly. Repeats are lowercase.\">"
print "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"
# TODO "getline FILENAME" to extract contig names
# actually bcftools merge will do it for you, so don't bother
}
## VCF COLUMNS
NR == 1 {
print "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", $4, $5
next
}
## MASK MISSING bases with "."
NR > 1 {
for (i = 4; i <= NF; i++) {
sub(/[n-]/, ".", $i)
}
}
## MASK LQ bases with "."
## but only at mismatch sites
toupper($4) != toupper($5) {
for (i = 4; i <= NF; i++) {
sub(/[acgt]/, ".", $i)
}
}
## CONVERT GTs to VCF style format
## matching GTs
toupper($4) == toupper($5) {
gts = toupper($4$5)
if (gts ~ /\./) {
chimp = "./."
gorilla = "./."
alt = "."
} else if (gts ~ toupper($3)) {
chimp = "0/0"
gorilla = "0/0"
alt = "."
} else {
chimp = "1/1"
gorilla = "1/1"
alt = toupper($4)
}
}
## non-matching GTs
toupper($4) != toupper($5) {
gts = toupper($4$5)
if (gts ~ /\./ && gts ~ toupper($3)) {
for (i = 4; i <= 5; i++) {
sub(/\./, "./.", $i)
sub(/[a-zA-Z]/, "0/0", $i)
}
chimp = $4
gorilla = $5
alt = "."
} else if (gts ~ /\./ && gts !~ toupper($3)) {
for (i = 4; i <= 5; i++) {
sub(/\./, "./.", $i)
sub(/[a-zA-Z]/, "1/1", $i)
}
chimp = $4
gorilla = $5
alt = gensub(/\./, "", 1, gts)
} else if (gts ~ toupper($3)) {
for (i = 4; i <= 5; i++) {
sub(toupper($3), "0/0", $i)
alt = gensub(toupper($3), "", 1, gts)
sub(alt, "1/1", $i)
}
chimp = $4
gorilla = $5
#alt = gensub(toupper($3), "", 1, gts)
} else {
chimp = "1/1"
gorilla = "2/2"
alt = $4","$5
}
}
## PRINT VCF-like format
# CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
NR > 1 {
print $1, $2, ".", $3, alt, ".", ".", "HG", "GT", chimp, gorilla
}
@janxkoci
Copy link
Author

janxkoci commented Apr 28, 2022

Apparently, I could have used code like this in a few places:

chimp=gorilla="./."

Neat. Well, maybe next time 😄

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment