Skip to content

Instantly share code, notes, and snippets.

@zachcp
Last active January 2, 2023 23:50
Show Gist options
  • Save zachcp/40a58b426c60fc212155a456d39a9fa2 to your computer and use it in GitHub Desktop.
Save zachcp/40a58b426c60fc212155a456d39a9fa2 to your computer and use it in GitHub Desktop.
Experiment with Cyber for Fasta
run:
./cyber readfq.cy < test.fasta
# 190 Mb
downloadref:
wget https://ftp.ncbi.nlm.nih.gov/genomes/genbank/metagenomes/human_skin_metagenome/latest_assembly_versions/GCA_013297495.1_ASM1329749v1/GCA_013297495.1_ASM1329749v1_genomic.fna.gz
gunzip GCA_013297495.1_ASM1329749v1_genomic.fna.gz
test_cy:
time ./cyber readfq.cy < GCA_013297495.1_ASM1329749v1_genomic.fna
test_cy2:
time ./cyber readfq2.cy < GCA_013297495.1_ASM1329749v1_genomic.fna
test_py:
time python3 readfq.py < GCA_013297495.1_ASM1329749v1_genomic.fna
test: test_py test_cy test_cy2
--- attempt to use cyber as a fast scripting language for bio.
--- tried to implement the readfq script.
--- Notes:
--- io can only do single line at a time.
--- as a workaround I created an object to handle the parsing logic.
--- match is not yet available
--- can you type hint fields in the objects?
--- type hints didn't work on in the is_fastx function
--- '@+>' is 64 / 43 / 62
func is_fastx(chr) bool:
if chr == 64:
return true
if chr == 62:
return true
return false
func is_plus(chr) bool:
if chr == 43:
return true
return false
object Seq:
name
sequences
qualities
in_qual
func create():
return Seq{
name: "",
sequences: [],
qualities: [],
in_qual: false
}
func get_seq_len(self):
local_n = 0
for self.sequences as i, it:
local_len = it.len()
local_n += local_len
return local_n
func add_seq(self, seqtoadd):
self.sequences.add(seqtoadd)
func add_quals(self, qualtoadd):
self.qualities.add(qualtoadd)
func dump(self):
print "\n----------\nDumping Seq...."
print self.name
for self.sequences as s:
print s
for self.qualities as s:
print s
print '---------\n\n'
n = 0
slen = 0
qlen = 0
line_count = 0
seq = Seq.create()
for:
line_count += 1
--- print 'line count is {line_count}'
line = readLine()
if line == error(#EndOfStream):
--- print 'end of stream'
n += 1
slen += seq.get_seq_len()
break
-- if its a new fasta line then register the global counts
-- and then start a new record.
if is_fastx(line.charAt(0)):
--- print 'Matched a new sequence!'
if seq.name != "":
n += 1
slen += seq.get_seq_len()
seq = Seq.create()
seq.name = line
continue
else is_plus(line.charAt(0)):
seq.in_qual = true
else:
if seq.in_qual == false:
seq.add_seq(line)
--- print seq.dump()
print 'There are {slen} bases from {n} records in this file.'
def readfq(fp): # this is a generator function
last = None # this is a buffer keeping the last unprocessed line
while True: # mimic closure; is it a bad idea?
if not last: # the first record or a record following a fastq
for l in fp: # search for the start of the next record
if l[0] in '>@': # fasta/q header line
last = l[:-1] # save this line
break
if not last:
break
name, seqs, last = last[1:].partition(" ")[0], [], None
for l in fp: # read the sequence
if l[0] in '@+>':
last = l[:-1]
break
seqs.append(l[:-1])
if not last or last[0] != '+': # this is a fasta record
yield name, ''.join(seqs), None # yield a fasta record
if not last: break
else: # this is a fastq record
seq, leng, seqs = ''.join(seqs), 0, []
for l in fp: # read the quality
seqs.append(l[:-1])
leng += len(l) - 1
if leng >= len(seq): # have read enough quality
last = None
yield name, seq, ''.join(seqs); # yield a fastq record
break
if last: # reach EOF before reading enough quality
yield name, seq, None # yield a fasta record instead
break
if __name__ == "__main__":
import sys
n, slen, qlen = 0, 0, 0
for name, seq, qual in readfq(sys.stdin):
n += 1
slen += len(seq)
qlen += qual and len(qual) or 0
print(f" There are {n} records and {slen} bases")
--- minimal parse. don't use object or fastq
--- '@+>' is 64 / 43 / 62
func is_fastx(chr) bool:
if chr == 64:
return true
if chr == 62:
return true
return false
n = 0
slen = 0
qlen = 0
for:
line = readLine()
if line == error(#EndOfStream):
break
if is_fastx(line.charAt(0)):
n += 1
else:
slen += line.len()
print 'There are {slen} bases from {n} records in this file.'
(base) zcharlop-powers@ZL-12760 cyber-fasta % make test
time python3 readfq.py < GCA_013297495.1_ASM1329749v1_genomic.fna
There are 341540 records and 161512289 bases
real 0m1.065s
user 0m0.981s
sys 0m0.060s
time ./cyber readfq.cy < GCA_013297495.1_ASM1329749v1_genomic.fna
There are 27323200 bases from 341540 records in this file.
real 2m24.335s
user 0m53.681s
sys 1m28.221s
time ./cyber readfq2.cy < GCA_013297495.1_ASM1329749v1_genomic.fna
There are 161512289 bases from 341540 records in this file.
real 2m30.641s
user 0m52.714s
sys 1m28.372s
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment