Created
January 24, 2019 19:14
-
-
Save cecileane/d8a7e9e4d4b4fadeb5c62e89506cf3e9 to your computer and use it in GitHub Desktop.
Julia script to concatenate a bunch of alignments in phylip format
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# concatenate several phylip alignments | |
# asssumptions: | |
# - the current directory contains all the alignments | |
# (uncomment and modify the first line of code otherwise) | |
# - alignments are those files ending with ".phy" | |
# (beware: after running the script, the concatenated file will also end | |
# in .phy and will be in same folder: do don't re-run the script) | |
# - in each alignment, first line = header (no blank line before) | |
# - each taxon is sampled across all alignments: | |
# taxa in the first alignment but missing from another alignment | |
# will trigger an error at the very end | |
# - the resulting alignment is saved in file "full.phy". | |
# (modify the value of "outfile" for a different name) | |
# cd("path/to/directory/with/phylip/files") | |
files = filter!(x -> occursin(r"\.phy$",x), readdir()); | |
outfile = "full.phy" | |
ngenes = length(files) | |
ngenes>0 || error("I didn't find any file in the current folder") | |
println("found $ngenes files") | |
phyDict = Dict{String,Vector{String}}() # will map each taxon to its list of sequences | |
taxa1 = String[] # taxa in the order in which they come in first file | |
filecounter = 0 | |
rx = r"\s*(\w+)\s+(\S+)\s*$"; # regular expression to extract taxon name & sequence | |
for file in files | |
# global taxa1 # for testing, when run in the REPL | |
global filecounter += 1 | |
open(file) do f # open for reading by default | |
readline(f) # read & discard header: "numTaxa numSites" | |
counter=0 | |
for line in eachline(f) | |
counter += 1 | |
m = match(rx, line) # strips away leading & trailing spaces | |
m != nothing || continue # to next line. accommodates blank or weird lines | |
taxon = m.captures[1] | |
if !haskey(phyDict, taxon) | |
if filecounter==1 | |
push!(taxa1, taxon) | |
phyDict[taxon] = String[] # starting an empty list of sequences | |
else | |
@warn "taxon $taxon in alignment $filecounter but not in alignment 1: will be ignored" | |
end | |
end | |
push!(phyDict[taxon], m.captures[2]) # DNA sequence from that file | |
# not handled here: taxon in file 1 but not in file i>1 | |
# counter<2 || break # for testing | |
end | |
end | |
end | |
totallength = unique(sum(length.(v)) for v in values(phyDict)); | |
length(totallength) == 1 || | |
error("concatenated sequences have variable lengths: $totallength") | |
totallength = totallength[1] | |
open(outfile, "w") do f | |
println(f, length(phyDict), " ", totallength) | |
for tax in taxa1 # same order as in file 1 | |
write(f, tax, " ", string(phyDict[tax]...), "\n") | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is to share with @nkarimi.
@coraallencoleman: defining the regular expression as a variable and re-using it within the loop sped things up a great great deal.
For 1183 loci and 27 sequences, (3.7 million sites total), the main
for
loop takes about 2 seconds (including compilation time).Note that it takes a bit longer if the regular expression was used like this on line 35:
After saving the regular expression object and using it repeatedly:
the
for
loop is a bit faster.Concatenating the sequences one after another would slow down this script significantly. Instead, they are all saved them in a vector, and they are concatenated only once, all together, at the end via