|
#!/usr/bin/env python |
|
|
|
import os, csv, argparse, re |
|
|
|
from Bio.Seq import Seq |
|
from Bio.SeqRecord import SeqRecord |
|
from Bio import SeqIO |
|
|
|
def get_acc(identifier): |
|
""""Given this identifier string, return the unique part lookup for this. |
|
|
|
e.g. "QRE.30L_78578856_seqs1;size=1" -> "QRE.30L_78578856" |
|
""" |
|
parts = identifier.split("_") |
|
return "%s_%s"%(parts[0],parts[1]) |
|
|
|
parser = argparse.ArgumentParser(description='Convert clustered OTUs into Fasta linking seq IDs.') |
|
parser.add_argument('-l',"--lsu", default="ITSx_LSU-all.fasta", |
|
help='ITSx LSU INPUT file in Fasta') |
|
parser.add_argument('-s',"--ssu", default="ITSx_SSU-all.fasta", |
|
help='ITSx SSU INPUT file in Fasta') |
|
parser.add_argument('--otumap', default="all_its_concatenated.good_derep_min1rad5.otumap.txt", |
|
help="USEARCH OTU map table - two column, tab-separated file seqID and OTU-number") |
|
|
|
parser.add_argument('--outlsu', default="LSU_OTU.fasta", |
|
help="Out LSU sequences with OTU names in Fasta") |
|
parser.add_argument('--outssu', default="SSU_OTU.fasta", |
|
help="Out SSU sequences with OTU names in Fasta") |
|
|
|
args = parser.parse_args() |
|
|
|
LSUidx = SeqIO.index_db('LSU.idx',args.lsu, "fasta",key_function=get_acc) |
|
SSUidx = SeqIO.index_db('SSU.idx',args.ssu, "fasta",key_function=get_acc) |
|
|
|
OTUs = { } |
|
with open(args.otumap,"r") as otumap: |
|
otuname = csv.reader(otumap, delimiter="\t") |
|
for row in otuname: |
|
longid = row[0] |
|
parts = longid.split("_") |
|
# get rid of the _seqs=470;size=470 part |
|
seqid = "%s_%s"%(parts[0], parts[1]) |
|
OTU = row[1] |
|
|
|
if OTU in OTUs: |
|
# print("taking first OTU for %s"%(OTU)) |
|
continue |
|
SSUseq = "" |
|
LSUseq = "" |
|
if seqid in SSUidx: |
|
SSUseq = SSUidx[seqid] |
|
else: |
|
print("cannot find seqid %s in SSUdb"%(seqid)) |
|
continue |
|
|
|
if seqid in LSUidx: |
|
LSUseq = LSUidx[seqid] |
|
else: |
|
print("cannot find seqid %s in LSUdb"%(seqid)) |
|
continue |
|
ssuid = re.sub(r'_seqs\S+','',SSUseq.id) # strip _seqs=470;size=470 from ID |
|
lsuid = re.sub(r'_seqs\S+','',LSUseq.id) |
|
OTUs[OTU] = { 'SSU': SeqRecord(SSUseq.seq,id=OTU,name=OTU,description=ssuid), |
|
'LSU': SeqRecord(LSUseq.seq,id=OTU,name=OTU,description=lsuid), |
|
} |
|
|
|
SSU_seqs = [] |
|
LSU_seqs = [] |
|
for otu in sorted(OTUs.keys(), key = lambda kv: int(kv.split("_")[1])): |
|
if 'LSU' in OTUs[otu] and 'SSU' in OTUs[otu]: |
|
SSU_seqs.append(OTUs[otu]['SSU']) |
|
LSU_seqs.append(OTUs[otu]['LSU']) |
|
else: |
|
print(OTUs[otu]) |
|
# expect this never to be true given the steps in the loop above which create |
|
# an entry in the dictionary with both SSU and LSU at same time |
|
print("cannot find an LSU or SSU for %s"%(otu)) |
|
continue |
|
|
|
SeqIO.write(SSU_seqs, args.outssu, "fasta") |
|
SeqIO.write(LSU_seqs, args.outlsu, "fasta") |