esote/gotranseq.go

## gotranseq.go
package transeq

import (
	"bufio"
	"bytes"
	"context"
	"encoding/binary"
	"fmt"
	"io"
	"sync"

	"github.com/feliixx/gotranseq/ncbicode"
)

// Options struct to store command line args
type Options struct {
	Required `group:"required"`
	Optional `group:"optional"`
	General  `group:"general"`
}

// Required struct to store required command line args
type Required struct {
	Sequence string `short:"s" long:"sequence" value-name:"<filename>" description:"Nucleotide sequence(s) filename"`
	Outseq   string `short:"o" long:"outseq" value-name:"<filename>" description:"Protein sequence filename"`
}

// Optional struct to store required command line args
type Optional struct {
	Frame       string `short:"f" long:"frame" value-name:"<code>" description:"Frame to translate. Possible values:\n  [1, 2, 3, F, -1, -2, -3, R, 6]\n F: forward three frames\n R: reverse three frames\n 6: all 6 frames\n" default:"1"`
	Table       int    `short:"t" long:"table" value-name:"<code>" description:"NCBI code to use, see https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?chapter=tgencodes#SG1 for details. Available codes: \n 0: Standard code\n 2: The Vertebrate Mitochondrial Code\n 3: The Yeast Mitochondrial Code\n 4: The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code\n 5: The Invertebrate Mitochondrial Code\n 6: The Ciliate, Dasycladacean and Hexamita Nuclear Code\n 9: The Echinoderm and Flatworm Mitochondrial Code\n 10: The Euplotid Nuclear Code\n 11: The Bacterial, Archaeal and Plant Plastid Code\n 12: The Alternative Yeast Nuclear Code\n 13: The Ascidian Mitochondrial Code\n 14: The Alternative Flatworm Mitochondrial Code\n16: Chlorophycean Mitochondrial Code\n 21: Trematode Mitochondrial Code\n22: Scenedesmus obliquus Mitochondrial Code\n 23: Thraustochytrium Mitochondrial Code\n 24: Pterobranchia Mitochondrial Code\n 25: Candidate Division SR1 and Gracilibacteria Code\n 26: Pachysolen tannophilus Nuclear Code\n 29: Mesodinium Nuclear\n 30: Peritrich Nuclear\n" default:"0"`
	Clean       bool   `short:"c" long:"clean" description:"Replace stop codon '*' by 'X'"`
	Alternative bool   `short:"a" long:"alternative" description:"Define frame '-1' as using the set of codons starting with the last codon of the sequence"`
	Trim        bool   `short:"T" long:"trim" description:"Removes all 'X' and '*' characters from the right end of the translation. The trimming process starts at the end and continues until the next character is not a 'X' or a '*'"`
	NumWorker   int    `short:"n" long:"numcpu" value-name:"<n>" description:"Number of threads to use, default is number of CPU"`
}

// General struct to store required command line args
type General struct {
	Help    bool `short:"h" long:"help" description:"Show this help message"`
	Version bool `short:"v" long:"version" description:"Print the tool version and exit"`
}

const (
	nCode = iota
	aCode
	cCode
	tCode
	uCode
	gCode

	// Length of the array to store code/bytes
	// uses gCode because it's the biggest of all codes
	codesSize = (gCode | gCode<<8 | gCode<<16) + 1

	// size of the buffer for writing to file
	maxBufferSize = 1024 * 1024 * 30
	// max line size for sequence
	maxLineSize = 60
	// suffixes ta add to sequence id for each frame
	suffixes = "123456"
)

var letterCode = map[byte]uint8{
	'A': aCode,
	'C': cCode,
	'T': tCode,
	'G': gCode,
	'N': nCode,
	'U': uCode,
}

// create the code map according to the selected table code
func createCodes(code int, clean bool) ([]byte, error) {
	resultMap := map[uint32]byte{}
	twoLetterMap := map[string][]byte{}

	tmpCode := make([]uint8, 4)

	codeMap, err := ncbicode.LoadTableCode(code)
	if err != nil {
		return nil, err
	}

	for codon, aaCode := range codeMap {
		// generate 3 letter code
		tmpCode[0] = letterCode[codon[0]]
		tmpCode[1] = letterCode[codon[1]]
		tmpCode[2] = letterCode[codon[2]]

		// each codon is represented by an unique uint32:
		// each possible nucleotide is represented by an uint8 (255 possibility)
		// the three first bytes are the the code for each nucleotide
		// last byte is unused ( eq to uint8(0) )
		// example:
		// codon 'ACG' ==> uint32(aCode) | uint32(cCode)<<8 | uint32(gCode)<<16
		uint32Code := uint32(tmpCode[0]) | uint32(tmpCode[1])<<8 | uint32(tmpCode[2])<<16
		resultMap[uint32Code] = aaCode

		// generate 2 letter code
		if codes, ok := twoLetterMap[codon[0:2]]; ok {
			twoLetterMap[codon[0:2]] = append(codes, aaCode)
		} else {
			twoLetterMap[codon[0:2]] = []byte{aaCode}
		}
	}

	for twoLetterCodon, codes := range twoLetterMap {
		uniqueAA := true

		for _, c := range codes {
			if c != codes[0] {
				uniqueAA = false
				break
			}
		}

		if uniqueAA {
			first := letterCode[twoLetterCodon[0]]
			second := letterCode[twoLetterCodon[1]]

			uint32Code := uint32(first) | uint32(second)<<8
			resultMap[uint32Code] = codes[0]
		}
	}

	// if clean is specified, we want to replace all '*' by 'X' in the output
	// sequence, so replace all occurrences of '*' directly in the ref map
	if clean {
		for k, v := range resultMap {
			if v == stopByte {
				resultMap[k] = unknown
			}
		}
	}

	r := make([]byte, codesSize)

	for k, v := range resultMap {
		r[k] = v
	}

	return r, nil
}

func computeFrames(frameName string) ([]int, bool, error) {
	frames := make([]int, 6)
	reverse := false

	switch frameName {
	case "1":
		frames[0] = 1
	case "2":
		frames[1] = 1
	case "3":
		frames[2] = 1
	case "F":
		frames[0] = 1
		frames[1] = 1
		frames[2] = 1
	case "-1":
		frames[3] = 1
		reverse = true
	case "-2":
		frames[4] = 1
		reverse = true
	case "-3":
		frames[5] = 1
		reverse = true
	case "R":
		frames[3] = 1
		frames[4] = 1
		frames[5] = 1
		reverse = true
	case "6":
		for i := range frames {
			frames[i] = 1
		}
		reverse = true
	default:
		return frames, reverse, fmt.Errorf("wrong value for -f | --frame parameter: %s", frameName)
	}

	return frames, reverse, nil
}

func writeOrCancel(w writer, out io.Writer, errs chan error,
	cancel context.CancelFunc) {
	if _, err := out.Write(w.buf.Bytes()); err != nil {
		select {
		case errs <- fmt.Errorf("fail to write to output file: %v", err):
		default:
		}
		cancel()
		return
	}
}

func getComplexityAndAlternate(starts []int, frames []int,
	frameIndex int, sequence encodedSequence, idSize int, w writer,
	codes []byte, nuclSeqLength int, options Options, reverse bool) {
	for _, startPos := range starts {
		if frames[frameIndex] == 0 {
			frameIndex++
			continue
		}

		// sequence id should look like
		// >sequenceID_<frame> comment
		idEnd := bytes.IndexByte(sequence[4:idSize], ' ')
		if idEnd != -1 {
			w.buf.Write(sequence[4 : 4+idEnd])
			w.buf.WriteByte('_')
			w.buf.WriteByte(suffixes[frameIndex])
			w.buf.Write(sequence[4+idEnd : idSize])
		} else {
			w.buf.Write(sequence[4:idSize])
			w.buf.WriteByte('_')
			w.buf.WriteByte(suffixes[frameIndex])
		}

		w.newLine()

		// if in trim mode, nb of bytes to trim (nb of successive 'X', '*' and '\n'
		// from right end of the sequence)
		w.toTrim = 0
		w.clen = 0

		// read the sequence 3 letters at a time, starting at a specific position
		// corresponding to the frame
		for pos := startPos + 2 + idSize; pos < len(sequence); pos += 3 {
			if w.clen == maxLineSize {
				w.newLine()
			}

			// create an uint32 from the codon, to retrieve the corresponding
			// AA from the map
			codonCode := uint32(sequence[pos-2]) | uint32(sequence[pos-1])<<8 | uint32(sequence[pos])<<16

			if b := codes[codonCode]; b != byte(0) {
				w.addByte(b)
			} else {
				w.addUnknown()
			}
		}

		// the last codon is only 2 nucleotid long, try to guess
		// the corresponding AA
		if (nuclSeqLength-startPos)%3 == 2 {
			if w.clen == maxLineSize {
				w.newLine()
			}

			codonCode := uint32(sequence[len(sequence)-2]) | uint32(sequence[len(sequence)-1])<<8

			if b := codes[codonCode]; b == byte(0) {

				w.addUnknown()

			} else {
				w.addByte(b)
			}
		}

		// the last codon is only 1 nucleotid long, no way to guess
		// the corresponding AA
		if (nuclSeqLength-startPos)%3 == 1 {
			if w.clen == maxLineSize {
				w.newLine()
			}
			w.addUnknown()
		}

		if options.Trim && w.toTrim > 0 {
			// remove the last bytesToTrim bytes of the buffer
			// as they are 'X', '*' or '\n'
			w.buf.Truncate(w.buf.Len() - w.toTrim)
			w.clen -= w.toTrim
		}

		if w.clen != 0 {
			w.newLine()
		}

		frameIndex++
	}

	if reverse && frameIndex < 6 {
		// get the complementary sequence.
		// Basically, switch
		//   A <-> T
		//   C <-> G
		// N is not modified
		for i, n := range sequence[idSize:] {
			switch n {
			case aCode:
				sequence[i+idSize] = tCode
			case tCode:
				// handle both tCode and uCode
				sequence[i+idSize] = aCode
			case cCode:
				sequence[i+idSize] = gCode
			case gCode:
				sequence[i+idSize] = cCode
			}
		}

		// reverse the sequence
		for i, j := idSize, len(sequence)-1; i < j; i, j = i+1, j-1 {
			sequence[i], sequence[j] = sequence[j], sequence[i]
		}

		if !options.Alternative {
			// Staden convention: Frame -1 is the reverse-complement of the sequence
			// having the same codon phase as frame 1. Frame -2 is the same phase as
			// frame 2. Frame -3 is the same phase as frame 3
			//
			// use the matrix to keep track of the forward frame as it depends on the
			// length of the sequence
			switch nuclSeqLength % 3 {
			case 0:
				starts = []int{0, 2, 1}
			case 1:
				starts = []int{1, 0, 2}
			case 2:
				starts = []int{2, 1, 0}
			}
		}
		// run the same loop, but with the reverse-complemented sequence
		getComplexityAndAlternate(starts, frames, frameIndex, sequence, idSize,
			w, codes, nuclSeqLength, options, reverse)
	}
}

func worker(wg *sync.WaitGroup, fnaSeqs chan encodedSequence,
	ctx context.Context, frames []int, codes []byte,
	options Options, reverse bool, out io.Writer, errs chan error,
	cancel context.CancelFunc) {
	defer wg.Done()

	w := &writer{
		buf: bytes.NewBuffer(nil),
	}

	for s := range fnaSeqs {
		select {
		case <-ctx.Done():
			return
		default:
		}

		starts := []int{0, 1, 2}

		idSize := int(binary.LittleEndian.Uint32(s[0:4]))
		nuclSeqLength := len(s) - idSize

		getComplexityAndAlternate(starts, frames, 0,
			s, idSize, *w, codes, nuclSeqLength, options, reverse)

		if w.buf.Len() > maxBufferSize {
			writeOrCancel(*w, out, errs, cancel)
			w.buf.Reset()
		}

		pool.Put(s)
	}

	if w.buf.Len() > 0 {
		writeOrCancel(*w, out, errs, cancel)
	}
}

// Translate read a fata file, translate each sequence to the corresponding prot sequence in the specified frame
func Translate(inputSequence io.Reader, out io.Writer, options Options) error {
	codes, err := createCodes(options.Table, options.Clean)

	if err != nil {
		return err
	}

	frames, reverse, err := computeFrames(options.Frame)

	if err != nil {
		return err
	}

	fnaSeqs := make(chan encodedSequence, 10)
	errs := make(chan error, 1)

	ctx, cancel := context.WithCancel(context.Background())

	defer cancel()

	var wg sync.WaitGroup
	wg.Add(options.NumWorker)

	for nWorker := 0; nWorker < options.NumWorker; nWorker++ {
		go func() {
			worker(&wg, fnaSeqs, ctx, frames, codes, options,
				reverse, out, errs, cancel)
		}()
	}

	readSequenceFromFasta(ctx, inputSequence, fnaSeqs)

	wg.Wait()

	select {
	case err, ok := <-errs:
		if ok {
			return err
		}
	default:
	}

	return nil
}

func readSequenceFromFasta(ctx context.Context, inputSequence io.Reader,
	fnaSeqs chan encodedSequence) {

	feeder := &fastaChannelFeeder{
		idBuffer:       bytes.NewBuffer(nil),
		commentBuffer:  bytes.NewBuffer(nil),
		sequenceBuffer: bytes.NewBuffer(nil),
		fastaChan:      fnaSeqs,
	}
	// fasta format is:
	//
	// >sequenceID some comments on sequence
	// ACAGGCAGAGACACGACAGACGACGACACAGGAGCAGACAGCAGCAGACGACCACATATT
	// TTTGCGGTCACATGACGACTTCGGCAGCGA
	//
	// see https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=BlastHelp
	// section 1 for details
	scanner := bufio.NewScanner(inputSequence)
Loop:
	for scanner.Scan() {

		line := scanner.Bytes()

		if len(line) == 0 {
			continue
		}

		if line[0] == '>' {
			if feeder.idBuffer.Len() > 0 {
				select {
				case <-ctx.Done():
					break Loop
				default:
				}

				feeder.sendFasta()
			}

			feeder.reset()

			// parse the ID of the sequence. ID is formatted like this:
			// >sequenceID comments
			seqID := bytes.SplitN(line, []byte{' '}, 2)
			feeder.idBuffer.Write(seqID[0])

			if len(seqID) > 1 {
				feeder.commentBuffer.WriteByte(' ')
				feeder.commentBuffer.Write(seqID[1])
			}
		} else {
			// if the line doesn't start with '>', then it's a part of the
			// nucleotide sequence, so write it to the buffer
			feeder.sequenceBuffer.Write(line)
		}
	}
	// don't forget to push last sequence
	select {
	case <-ctx.Done():
	default:
		feeder.sendFasta()
	}

	close(fnaSeqs)
}

// a type to hold an encoded fasta sequence
//
//	s[0:4] stores the size of the sequence id + the size of the comment as an uint32 (little endian)
//  s[4:idSize] stores the sequence id, and the comment id there is one
//  s[idSize:] stores the nucl sequence
type encodedSequence []byte

var pool = sync.Pool{
	New: func() interface{} {
		return make(encodedSequence, 512)
	},
}

func getSizedSlice(idSize, requiredSize int) encodedSequence {
	s := pool.Get().(encodedSequence)
	binary.LittleEndian.PutUint32(s[0:4], uint32(idSize))

	for len(s) < requiredSize {
		s = append(s, byte(0))
	}
	return s[0:requiredSize]
}

func (f *fastaChannelFeeder) sendFasta() {
	idSize := 4 + f.idBuffer.Len() + f.commentBuffer.Len()
	requiredSize := idSize + f.sequenceBuffer.Len()

	s := getSizedSlice(idSize, requiredSize)

	if f.commentBuffer.Len() > 0 {
		copy(s[idSize-f.commentBuffer.Len():idSize], f.commentBuffer.Bytes())
	}

	copy(s[4:4+f.idBuffer.Len()], f.idBuffer.Bytes())

	// convert the sequence of bytes to an array of uint8 codes,
	// so a codon (3 nucleotides | 3 bytes ) can be represented
	// as an uint32
	for i, b := range f.sequenceBuffer.Bytes() {
		switch b {
		case 'A':
			s[i+idSize] = aCode
		case 'C':
			s[i+idSize] = cCode
		case 'G':
			s[i+idSize] = gCode
		case 'T', 'U':
			s[i+idSize] = tCode
		case 'N':
			s[i+idSize] = nCode
		default:
			fmt.Printf("WARNING: invalid char in sequence %s: %s, ignoring", s[4:4+idSize], string(b))
		}
	}

	f.fastaChan <- s
}

type fastaChannelFeeder struct {
	idBuffer       *bytes.Buffer
	commentBuffer  *bytes.Buffer
	sequenceBuffer *bytes.Buffer
	fastaChan      chan encodedSequence
}

func (f *fastaChannelFeeder) reset() {
	f.idBuffer.Reset()
	f.sequenceBuffer.Reset()
	f.commentBuffer.Reset()
}

## writer.go
package transeq

import "bytes"

const (
	stopByte = '*'
	unknown  = 'X'
)

type writer struct {
	buf    *bytes.Buffer
	clen   int
	toTrim int
}

func newWriter(buf []byte) *writer {
	return &writer{buf: bytes.NewBuffer(buf)}
}

func (w *writer) addByte(b byte) {
	w.buf.WriteByte(b)
	w.clen++

	if b == stopByte || b == unknown {
		w.toTrim++
	} else {
		w.toTrim = 0
	}
}

func (w *writer) addUnknown() {
	w.buf.WriteByte(unknown)
	w.clen++
	w.toTrim++
}

func (w *writer) newLine() {
	w.buf.WriteByte('\n')
	w.clen = 0
	w.toTrim++
}
	package transeq

	import (
	"bufio"
	"bytes"
	"context"
	"encoding/binary"
	"fmt"
	"io"
	"sync"

	"github.com/feliixx/gotranseq/ncbicode"
	)

	// Options struct to store command line args
	type Options struct {
	Required `group:"required"`
	Optional `group:"optional"`
	General `group:"general"`
	}

	// Required struct to store required command line args
	type Required struct {
	Sequence string `short:"s" long:"sequence" value-name:"<filename>" description:"Nucleotide sequence(s) filename"`
	Outseq string `short:"o" long:"outseq" value-name:"<filename>" description:"Protein sequence filename"`
	}

	// Optional struct to store required command line args
	type Optional struct {
	Frame string `short:"f" long:"frame" value-name:"<code>" description:"Frame to translate. Possible values:\n [1, 2, 3, F, -1, -2, -3, R, 6]\n F: forward three frames\n R: reverse three frames\n 6: all 6 frames\n" default:"1"`
	Table int `short:"t" long:"table" value-name:"<code>" description:"NCBI code to use, see https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?chapter=tgencodes#SG1 for details. Available codes: \n 0: Standard code\n 2: The Vertebrate Mitochondrial Code\n 3: The Yeast Mitochondrial Code\n 4: The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code\n 5: The Invertebrate Mitochondrial Code\n 6: The Ciliate, Dasycladacean and Hexamita Nuclear Code\n 9: The Echinoderm and Flatworm Mitochondrial Code\n 10: The Euplotid Nuclear Code\n 11: The Bacterial, Archaeal and Plant Plastid Code\n 12: The Alternative Yeast Nuclear Code\n 13: The Ascidian Mitochondrial Code\n 14: The Alternative Flatworm Mitochondrial Code\n16: Chlorophycean Mitochondrial Code\n 21: Trematode Mitochondrial Code\n22: Scenedesmus obliquus Mitochondrial Code\n 23: Thraustochytrium Mitochondrial Code\n 24: Pterobranchia Mitochondrial Code\n 25: Candidate Division SR1 and Gracilibacteria Code\n 26: Pachysolen tannophilus Nuclear Code\n 29: Mesodinium Nuclear\n 30: Peritrich Nuclear\n" default:"0"`
	Clean bool `short:"c" long:"clean" description:"Replace stop codon '*' by 'X'"`
	Alternative bool `short:"a" long:"alternative" description:"Define frame '-1' as using the set of codons starting with the last codon of the sequence"`
	Trim bool `short:"T" long:"trim" description:"Removes all 'X' and '' characters from the right end of the translation. The trimming process starts at the end and continues until the next character is not a 'X' or a ''"`
	NumWorker int `short:"n" long:"numcpu" value-name:"<n>" description:"Number of threads to use, default is number of CPU"`
	}

	// General struct to store required command line args
	type General struct {
	Help bool `short:"h" long:"help" description:"Show this help message"`
	Version bool `short:"v" long:"version" description:"Print the tool version and exit"`
	}

	const (
	nCode = iota
	aCode
	cCode
	tCode
	uCode
	gCode

	// Length of the array to store code/bytes
	// uses gCode because it's the biggest of all codes
	codesSize = (gCode \| gCode<<8 \| gCode<<16) + 1

	// size of the buffer for writing to file
	maxBufferSize = 1024 * 1024 * 30
	// max line size for sequence
	maxLineSize = 60
	// suffixes ta add to sequence id for each frame
	suffixes = "123456"
	)

	var letterCode = map[byte]uint8{
	'A': aCode,
	'C': cCode,
	'T': tCode,
	'G': gCode,
	'N': nCode,
	'U': uCode,
	}

	// create the code map according to the selected table code
	func createCodes(code int, clean bool) ([]byte, error) {
	resultMap := map[uint32]byte{}
	twoLetterMap := map[string][]byte{}

	tmpCode := make([]uint8, 4)

	codeMap, err := ncbicode.LoadTableCode(code)
	if err != nil {
	return nil, err
	}

	for codon, aaCode := range codeMap {
	// generate 3 letter code
	tmpCode[0] = letterCode[codon[0]]
	tmpCode[1] = letterCode[codon[1]]
	tmpCode[2] = letterCode[codon[2]]

	// each codon is represented by an unique uint32:
	// each possible nucleotide is represented by an uint8 (255 possibility)
	// the three first bytes are the the code for each nucleotide
	// last byte is unused ( eq to uint8(0) )
	// example:
	// codon 'ACG' ==> uint32(aCode) \| uint32(cCode)<<8 \| uint32(gCode)<<16
	uint32Code := uint32(tmpCode[0]) \| uint32(tmpCode[1])<<8 \| uint32(tmpCode[2])<<16
	resultMap[uint32Code] = aaCode

	// generate 2 letter code
	if codes, ok := twoLetterMap[codon[0:2]]; ok {
	twoLetterMap[codon[0:2]] = append(codes, aaCode)
	} else {
	twoLetterMap[codon[0:2]] = []byte{aaCode}
	}
	}

	for twoLetterCodon, codes := range twoLetterMap {
	uniqueAA := true

	for _, c := range codes {
	if c != codes[0] {
	uniqueAA = false
	break
	}
	}

	if uniqueAA {
	first := letterCode[twoLetterCodon[0]]
	second := letterCode[twoLetterCodon[1]]

	uint32Code := uint32(first) \| uint32(second)<<8
	resultMap[uint32Code] = codes[0]
	}
	}

	// if clean is specified, we want to replace all '*' by 'X' in the output
	// sequence, so replace all occurrences of '*' directly in the ref map
	if clean {
	for k, v := range resultMap {
	if v == stopByte {
	resultMap[k] = unknown
	}
	}
	}

	r := make([]byte, codesSize)

	for k, v := range resultMap {
	r[k] = v
	}

	return r, nil
	}

	func computeFrames(frameName string) ([]int, bool, error) {
	frames := make([]int, 6)
	reverse := false

	switch frameName {
	case "1":
	frames[0] = 1
	case "2":
	frames[1] = 1
	case "3":
	frames[2] = 1
	case "F":
	frames[0] = 1
	frames[1] = 1
	frames[2] = 1
	case "-1":
	frames[3] = 1
	reverse = true
	case "-2":
	frames[4] = 1
	reverse = true
	case "-3":
	frames[5] = 1
	reverse = true
	case "R":
	frames[3] = 1
	frames[4] = 1
	frames[5] = 1
	reverse = true
	case "6":
	for i := range frames {
	frames[i] = 1
	}
	reverse = true
	default:
	return frames, reverse, fmt.Errorf("wrong value for -f \| --frame parameter: %s", frameName)
	}

	return frames, reverse, nil
	}

	func writeOrCancel(w writer, out io.Writer, errs chan error,
	cancel context.CancelFunc) {
	if _, err := out.Write(w.buf.Bytes()); err != nil {
	select {
	case errs <- fmt.Errorf("fail to write to output file: %v", err):
	default:
	}
	cancel()
	return
	}
	}

	func getComplexityAndAlternate(starts []int, frames []int,
	frameIndex int, sequence encodedSequence, idSize int, w writer,
	codes []byte, nuclSeqLength int, options Options, reverse bool) {
	for _, startPos := range starts {
	if frames[frameIndex] == 0 {
	frameIndex++
	continue
	}

	// sequence id should look like
	// >sequenceID_<frame> comment
	idEnd := bytes.IndexByte(sequence[4:idSize], ' ')
	if idEnd != -1 {
	w.buf.Write(sequence[4 : 4+idEnd])
	w.buf.WriteByte('_')
	w.buf.WriteByte(suffixes[frameIndex])
	w.buf.Write(sequence[4+idEnd : idSize])
	} else {
	w.buf.Write(sequence[4:idSize])
	w.buf.WriteByte('_')
	w.buf.WriteByte(suffixes[frameIndex])
	}

	w.newLine()

	// if in trim mode, nb of bytes to trim (nb of successive 'X', '*' and '\n'
	// from right end of the sequence)
	w.toTrim = 0
	w.clen = 0

	// read the sequence 3 letters at a time, starting at a specific position
	// corresponding to the frame
	for pos := startPos + 2 + idSize; pos < len(sequence); pos += 3 {
	if w.clen == maxLineSize {
	w.newLine()
	}

	// create an uint32 from the codon, to retrieve the corresponding
	// AA from the map
	codonCode := uint32(sequence[pos-2]) \| uint32(sequence[pos-1])<<8 \| uint32(sequence[pos])<<16

	if b := codes[codonCode]; b != byte(0) {
	w.addByte(b)
	} else {
	w.addUnknown()
	}
	}

	// the last codon is only 2 nucleotid long, try to guess
	// the corresponding AA
	if (nuclSeqLength-startPos)%3 == 2 {
	if w.clen == maxLineSize {
	w.newLine()
	}

	codonCode := uint32(sequence[len(sequence)-2]) \| uint32(sequence[len(sequence)-1])<<8

	if b := codes[codonCode]; b == byte(0) {

	w.addUnknown()

	} else {
	w.addByte(b)
	}
	}

	// the last codon is only 1 nucleotid long, no way to guess
	// the corresponding AA
	if (nuclSeqLength-startPos)%3 == 1 {
	if w.clen == maxLineSize {
	w.newLine()
	}
	w.addUnknown()
	}

	if options.Trim && w.toTrim > 0 {
	// remove the last bytesToTrim bytes of the buffer
	// as they are 'X', '*' or '\n'
	w.buf.Truncate(w.buf.Len() - w.toTrim)
	w.clen -= w.toTrim
	}

	if w.clen != 0 {
	w.newLine()
	}

	frameIndex++
	}

	if reverse && frameIndex < 6 {
	// get the complementary sequence.
	// Basically, switch
	// A <-> T
	// C <-> G
	// N is not modified
	for i, n := range sequence[idSize:] {
	switch n {
	case aCode:
	sequence[i+idSize] = tCode
	case tCode:
	// handle both tCode and uCode
	sequence[i+idSize] = aCode
	case cCode:
	sequence[i+idSize] = gCode
	case gCode:
	sequence[i+idSize] = cCode
	}
	}

	// reverse the sequence
	for i, j := idSize, len(sequence)-1; i < j; i, j = i+1, j-1 {
	sequence[i], sequence[j] = sequence[j], sequence[i]
	}

	if !options.Alternative {
	// Staden convention: Frame -1 is the reverse-complement of the sequence
	// having the same codon phase as frame 1. Frame -2 is the same phase as
	// frame 2. Frame -3 is the same phase as frame 3
	//
	// use the matrix to keep track of the forward frame as it depends on the
	// length of the sequence
	switch nuclSeqLength % 3 {
	case 0:
	starts = []int{0, 2, 1}
	case 1:
	starts = []int{1, 0, 2}
	case 2:
	starts = []int{2, 1, 0}
	}
	}
	// run the same loop, but with the reverse-complemented sequence
	getComplexityAndAlternate(starts, frames, frameIndex, sequence, idSize,
	w, codes, nuclSeqLength, options, reverse)
	}
	}

	func worker(wg *sync.WaitGroup, fnaSeqs chan encodedSequence,
	ctx context.Context, frames []int, codes []byte,
	options Options, reverse bool, out io.Writer, errs chan error,
	cancel context.CancelFunc) {
	defer wg.Done()

	w := &writer{
	buf: bytes.NewBuffer(nil),
	}

	for s := range fnaSeqs {
	select {
	case <-ctx.Done():
	return
	default:
	}

	starts := []int{0, 1, 2}

	idSize := int(binary.LittleEndian.Uint32(s[0:4]))
	nuclSeqLength := len(s) - idSize

	getComplexityAndAlternate(starts, frames, 0,
	s, idSize, *w, codes, nuclSeqLength, options, reverse)

	if w.buf.Len() > maxBufferSize {
	writeOrCancel(*w, out, errs, cancel)
	w.buf.Reset()
	}

	pool.Put(s)
	}

	if w.buf.Len() > 0 {
	writeOrCancel(*w, out, errs, cancel)
	}
	}

	// Translate read a fata file, translate each sequence to the corresponding prot sequence in the specified frame
	func Translate(inputSequence io.Reader, out io.Writer, options Options) error {
	codes, err := createCodes(options.Table, options.Clean)

	if err != nil {
	return err
	}

	frames, reverse, err := computeFrames(options.Frame)

	if err != nil {
	return err
	}

	fnaSeqs := make(chan encodedSequence, 10)
	errs := make(chan error, 1)

	ctx, cancel := context.WithCancel(context.Background())

	defer cancel()

	var wg sync.WaitGroup
	wg.Add(options.NumWorker)

	for nWorker := 0; nWorker < options.NumWorker; nWorker++ {
	go func() {
	worker(&wg, fnaSeqs, ctx, frames, codes, options,
	reverse, out, errs, cancel)
	}()
	}

	readSequenceFromFasta(ctx, inputSequence, fnaSeqs)

	wg.Wait()

	select {
	case err, ok := <-errs:
	if ok {
	return err
	}
	default:
	}

	return nil
	}

	func readSequenceFromFasta(ctx context.Context, inputSequence io.Reader,
	fnaSeqs chan encodedSequence) {

	feeder := &fastaChannelFeeder{
	idBuffer: bytes.NewBuffer(nil),
	commentBuffer: bytes.NewBuffer(nil),
	sequenceBuffer: bytes.NewBuffer(nil),
	fastaChan: fnaSeqs,
	}
	// fasta format is:
	//
	// >sequenceID some comments on sequence
	// ACAGGCAGAGACACGACAGACGACGACACAGGAGCAGACAGCAGCAGACGACCACATATT
	// TTTGCGGTCACATGACGACTTCGGCAGCGA
	//
	// see https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=BlastHelp
	// section 1 for details
	scanner := bufio.NewScanner(inputSequence)
	Loop:
	for scanner.Scan() {

	line := scanner.Bytes()

	if len(line) == 0 {
	continue
	}

	if line[0] == '>' {
	if feeder.idBuffer.Len() > 0 {
	select {
	case <-ctx.Done():
	break Loop
	default:
	}

	feeder.sendFasta()
	}

	feeder.reset()

	// parse the ID of the sequence. ID is formatted like this:
	// >sequenceID comments
	seqID := bytes.SplitN(line, []byte{' '}, 2)
	feeder.idBuffer.Write(seqID[0])

	if len(seqID) > 1 {
	feeder.commentBuffer.WriteByte(' ')
	feeder.commentBuffer.Write(seqID[1])
	}
	} else {
	// if the line doesn't start with '>', then it's a part of the
	// nucleotide sequence, so write it to the buffer
	feeder.sequenceBuffer.Write(line)
	}
	}
	// don't forget to push last sequence
	select {
	case <-ctx.Done():
	default:
	feeder.sendFasta()
	}

	close(fnaSeqs)
	}

	// a type to hold an encoded fasta sequence
	//
	// s[0:4] stores the size of the sequence id + the size of the comment as an uint32 (little endian)
	// s[4:idSize] stores the sequence id, and the comment id there is one
	// s[idSize:] stores the nucl sequence
	type encodedSequence []byte

	var pool = sync.Pool{
	New: func() interface{} {
	return make(encodedSequence, 512)
	},
	}

	func getSizedSlice(idSize, requiredSize int) encodedSequence {
	s := pool.Get().(encodedSequence)
	binary.LittleEndian.PutUint32(s[0:4], uint32(idSize))

	for len(s) < requiredSize {
	s = append(s, byte(0))
	}
	return s[0:requiredSize]
	}

	func (f *fastaChannelFeeder) sendFasta() {
	idSize := 4 + f.idBuffer.Len() + f.commentBuffer.Len()
	requiredSize := idSize + f.sequenceBuffer.Len()

	s := getSizedSlice(idSize, requiredSize)

	if f.commentBuffer.Len() > 0 {
	copy(s[idSize-f.commentBuffer.Len():idSize], f.commentBuffer.Bytes())
	}

	copy(s[4:4+f.idBuffer.Len()], f.idBuffer.Bytes())

	// convert the sequence of bytes to an array of uint8 codes,
	// so a codon (3 nucleotides \| 3 bytes ) can be represented
	// as an uint32
	for i, b := range f.sequenceBuffer.Bytes() {
	switch b {
	case 'A':
	s[i+idSize] = aCode
	case 'C':
	s[i+idSize] = cCode
	case 'G':
	s[i+idSize] = gCode
	case 'T', 'U':
	s[i+idSize] = tCode
	case 'N':
	s[i+idSize] = nCode
	default:
	fmt.Printf("WARNING: invalid char in sequence %s: %s, ignoring", s[4:4+idSize], string(b))
	}
	}

	f.fastaChan <- s
	}

	type fastaChannelFeeder struct {
	idBuffer *bytes.Buffer
	commentBuffer *bytes.Buffer
	sequenceBuffer *bytes.Buffer
	fastaChan chan encodedSequence
	}

	func (f *fastaChannelFeeder) reset() {
	f.idBuffer.Reset()
	f.sequenceBuffer.Reset()
	f.commentBuffer.Reset()
	}
	package transeq

	import "bytes"

	const (
	stopByte = '*'
	unknown = 'X'
	)

	type writer struct {
	buf *bytes.Buffer
	clen int
	toTrim int
	}

	func newWriter(buf []byte) *writer {
	return &writer{buf: bytes.NewBuffer(buf)}
	}

	func (w *writer) addByte(b byte) {
	w.buf.WriteByte(b)
	w.clen++

	if b == stopByte \|\| b == unknown {
	w.toTrim++
	} else {
	w.toTrim = 0
	}
	}

	func (w *writer) addUnknown() {
	w.buf.WriteByte(unknown)
	w.clen++
	w.toTrim++
	}

	func (w *writer) newLine() {
	w.buf.WriteByte('\n')
	w.clen = 0
	w.toTrim++
	}