Skip to content

Instantly share code, notes, and snippets.

@hyphaltip
Last active July 26, 2020 22:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hyphaltip/ab38c08a3b2cfa590285029b31b63428 to your computer and use it in GitHub Desktop.
Save hyphaltip/ab38c08a3b2cfa590285029b31b63428 to your computer and use it in GitHub Desktop.
ITSx_cross-ref

Support generating a Fasta file where the IDs are changed to OTUXX instead of the original sequence IDs to match up with what was genereted for a Usearch run - to support linking different molecules together after ITSx splitting of SSU, LSU, and ITS.

Author - Jason.stajich[AT]ucr.edu and Jana U'Ren juren[AT]arizona.edu

usage: ITSx_cross-ref.py [-h] [-l LSU] [-s SSU] [--otumap OTUMAP]
                         [--outlsu OUTLSU] [--outssu OUTSSU]

Convert clustered OTUs into Fasta linking seq IDs.

optional arguments:
  -h, --help         show this help message and exit
  -l LSU, --lsu LSU  ITSx LSU INPUT file in Fasta
  -s SSU, --ssu SSU  ITSx SSU INPUT file in Fasta
  --otumap OTUMAP    USEARCH OTU map table - two column, tab-separated file
                     seqID and OTU-number
  --outlsu OUTLSU    Out LSU sequences with OTU names in Fasta
  --outssu OUTSSU    Out SSU sequences with OTU names in Fasta
#!/usr/bin/env python
import os, csv, argparse, re
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
def get_acc(identifier):
""""Given this identifier string, return the unique part lookup for this.
e.g. "QRE.30L_78578856_seqs1;size=1" -> "QRE.30L_78578856"
"""
parts = identifier.split("_")
return "%s_%s"%(parts[0],parts[1])
parser = argparse.ArgumentParser(description='Convert clustered OTUs into Fasta linking seq IDs.')
parser.add_argument('-l',"--lsu", default="ITSx_LSU-all.fasta",
help='ITSx LSU INPUT file in Fasta')
parser.add_argument('-s',"--ssu", default="ITSx_SSU-all.fasta",
help='ITSx SSU INPUT file in Fasta')
parser.add_argument('--otumap', default="all_its_concatenated.good_derep_min1rad5.otumap.txt",
help="USEARCH OTU map table - two column, tab-separated file seqID and OTU-number")
parser.add_argument('--outlsu', default="LSU_OTU.fasta",
help="Out LSU sequences with OTU names in Fasta")
parser.add_argument('--outssu', default="SSU_OTU.fasta",
help="Out SSU sequences with OTU names in Fasta")
args = parser.parse_args()
LSUidx = SeqIO.index_db('LSU.idx',args.lsu, "fasta",key_function=get_acc)
SSUidx = SeqIO.index_db('SSU.idx',args.ssu, "fasta",key_function=get_acc)
OTUs = { }
with open(args.otumap,"r") as otumap:
otuname = csv.reader(otumap, delimiter="\t")
for row in otuname:
longid = row[0]
parts = longid.split("_")
# get rid of the _seqs=470;size=470 part
seqid = "%s_%s"%(parts[0], parts[1])
OTU = row[1]
if OTU in OTUs:
# print("taking first OTU for %s"%(OTU))
continue
SSUseq = ""
LSUseq = ""
if seqid in SSUidx:
SSUseq = SSUidx[seqid]
else:
print("cannot find seqid %s in SSUdb"%(seqid))
continue
if seqid in LSUidx:
LSUseq = LSUidx[seqid]
else:
print("cannot find seqid %s in LSUdb"%(seqid))
continue
ssuid = re.sub(r'_seqs\S+','',SSUseq.id) # strip _seqs=470;size=470 from ID
lsuid = re.sub(r'_seqs\S+','',LSUseq.id)
OTUs[OTU] = { 'SSU': SeqRecord(SSUseq.seq,id=OTU,name=OTU,description=ssuid),
'LSU': SeqRecord(LSUseq.seq,id=OTU,name=OTU,description=lsuid),
}
SSU_seqs = []
LSU_seqs = []
for otu in sorted(OTUs.keys(), key = lambda kv: int(kv.split("_")[1])):
if 'LSU' in OTUs[otu] and 'SSU' in OTUs[otu]:
SSU_seqs.append(OTUs[otu]['SSU'])
LSU_seqs.append(OTUs[otu]['LSU'])
else:
print(OTUs[otu])
# expect this never to be true given the steps in the loop above which create
# an entry in the dictionary with both SSU and LSU at same time
print("cannot find an LSU or SSU for %s"%(otu))
continue
SeqIO.write(SSU_seqs, args.outssu, "fasta")
SeqIO.write(LSU_seqs, args.outlsu, "fasta")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment