Skip to content

Instantly share code, notes, and snippets.

@def-
Last active August 29, 2015 14:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save def-/c6f3ae893b935464dab0 to your computer and use it in GitHub Desktop.
Save def-/c6f3ae893b935464dab0 to your computer and use it in GitHub Desktop.
Fasta_Pairs
# 0.19 seconds, 79 MB/s
proc fgets(c: cstring, n: int, f: TFile): cstring {.
importc: "fgets", header: "<stdio.h>", tags: [FReadIO].}
proc myReadLine(f: TFile, line: var TaintedString): bool =
var buf {.noinit.}: array[8192, char]
setLen(line.string, 0)
result = true
while true:
if fgets(buf, 8192, f) == nil:
result = false
break
if buf[cstring(buf).len-1] == '\l':
buf[cstring(buf).len-1] = '\0'
add(line, cstring(buf))
break
add(line, cstring(buf))
iterator fasta_pairs*(f: TFile): tuple[header: string, sequence: string] =
#var sequence = newStringOfCap(15_000_000) # If you know the approximate size beforehand
var sequence = ""
var header = ""
var seq_index = -1
var line = ""
while f.myReadLine(line):
if line[0] == '>':
if seq_index >= 0:
yield (header, sequence)
header = line
sequence.setLen(0)
seq_index += 1
else:
sequence.add(line)
yield (header, sequence)
var f = open("sequence.fasta")
for i in fasta_pairs(f):
echo i[0].len, " ", i[1].len
# 0.09 seconds, 167 MB/s
proc handleFasta(header, sequence) =
echo header.len, " ", sequence.len
var f = system.open("sequence.fasta")
const bufSize = 8192
var
size = bufSize
buf {.noinit.}: array[bufSize, char]
sequence = ""
header = ""
afterNewLine = true
afterNewSeq = false
while size == bufSize:
size = f.readBuffer(addr buf, bufSize)
var lastPos = 0
for i in 0 .. <size:
if buf[i] == '\l':
afterNewLine = true
if afterNewSeq:
afterNewSeq = false
header.setLen(i - lastPos)
copyMem(addr header[0], addr buf[lastPos], i - lastPos)
else:
let oldLen = sequence.len
sequence.setLen(oldLen + i - lastPos)
copyMem(addr sequence[oldLen], addr buf[lastPos], i - lastPos)
lastPos = i+1
elif afterNewLine and buf[i] == '>':
afterNewLine = false
afterNewSeq = true
lastPos = i
if header != "":
handleFasta(header, sequence)
header.setLen(0)
sequence.setLen(0)
let oldLen = sequence.len
sequence.setLen(oldLen + size - lastPos)
copyMem(addr sequence[oldLen], addr buf[lastPos], size - lastPos)
handleFasta(header, sequence)
# Ok, here's the really fast one, doesn't even create the sequence string,
# just splits a file into multiple sequences, which you can access over f.mem
# with the accompanying start and length. Also contains newlines, which makes
# this unfit for most usecases I guess.
# 0.03 seconds, 500 MB/s
import memfiles
var
f = memfiles.open("sequence.fasta")
fastas: seq[tuple[header: string; seqStart, seqLen: int]] = @[]
afterNewLine = true
afterNewSeq = false
header = ""
lastPos = 0
i = 0
seqStart = 0
for c in cast[cstring](f.mem):
if c == '\l':
if afterNewSeq:
header.setLen(i - lastPos - 1)
copyMem(addr header[0], cast[pointer](cast[int](f.mem) + lastPos + 1), i - lastPos - 1)
afterNewSeq = false
seqStart = i + 1
afterNewLine = true
elif c == '>':
if seqStart > 0:
fastas.add((header, seqStart, i - seqStart - 1))
afterNewLine = false
afterNewSeq = true
else:
afterNewLine = false
inc(i)
fastas.add((header, seqStart, i - seqStart - 1))
echo fastas
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment