Created
January 8, 2019 07:51
-
-
Save esote/24e6c0e641e1af9502f1c3a0caf03c11 to your computer and use it in GitHub Desktop.
Code Review: https://codereview.stackexchange.com/a/211081/118395
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package transeq | |
import ( | |
"bufio" | |
"bytes" | |
"context" | |
"encoding/binary" | |
"fmt" | |
"io" | |
"sync" | |
"github.com/feliixx/gotranseq/ncbicode" | |
) | |
// Options struct to store command line args | |
type Options struct { | |
Required `group:"required"` | |
Optional `group:"optional"` | |
General `group:"general"` | |
} | |
// Required struct to store required command line args | |
type Required struct { | |
Sequence string `short:"s" long:"sequence" value-name:"<filename>" description:"Nucleotide sequence(s) filename"` | |
Outseq string `short:"o" long:"outseq" value-name:"<filename>" description:"Protein sequence filename"` | |
} | |
// Optional struct to store required command line args | |
type Optional struct { | |
Frame string `short:"f" long:"frame" value-name:"<code>" description:"Frame to translate. Possible values:\n [1, 2, 3, F, -1, -2, -3, R, 6]\n F: forward three frames\n R: reverse three frames\n 6: all 6 frames\n" default:"1"` | |
Table int `short:"t" long:"table" value-name:"<code>" description:"NCBI code to use, see https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?chapter=tgencodes#SG1 for details. Available codes: \n 0: Standard code\n 2: The Vertebrate Mitochondrial Code\n 3: The Yeast Mitochondrial Code\n 4: The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code\n 5: The Invertebrate Mitochondrial Code\n 6: The Ciliate, Dasycladacean and Hexamita Nuclear Code\n 9: The Echinoderm and Flatworm Mitochondrial Code\n 10: The Euplotid Nuclear Code\n 11: The Bacterial, Archaeal and Plant Plastid Code\n 12: The Alternative Yeast Nuclear Code\n 13: The Ascidian Mitochondrial Code\n 14: The Alternative Flatworm Mitochondrial Code\n16: Chlorophycean Mitochondrial Code\n 21: Trematode Mitochondrial Code\n22: Scenedesmus obliquus Mitochondrial Code\n 23: Thraustochytrium Mitochondrial Code\n 24: Pterobranchia Mitochondrial Code\n 25: Candidate Division SR1 and Gracilibacteria Code\n 26: Pachysolen tannophilus Nuclear Code\n 29: Mesodinium Nuclear\n 30: Peritrich Nuclear\n" default:"0"` | |
Clean bool `short:"c" long:"clean" description:"Replace stop codon '*' by 'X'"` | |
Alternative bool `short:"a" long:"alternative" description:"Define frame '-1' as using the set of codons starting with the last codon of the sequence"` | |
Trim bool `short:"T" long:"trim" description:"Removes all 'X' and '*' characters from the right end of the translation. The trimming process starts at the end and continues until the next character is not a 'X' or a '*'"` | |
NumWorker int `short:"n" long:"numcpu" value-name:"<n>" description:"Number of threads to use, default is number of CPU"` | |
} | |
// General struct to store required command line args | |
type General struct { | |
Help bool `short:"h" long:"help" description:"Show this help message"` | |
Version bool `short:"v" long:"version" description:"Print the tool version and exit"` | |
} | |
const ( | |
nCode = iota | |
aCode | |
cCode | |
tCode | |
uCode | |
gCode | |
// Length of the array to store code/bytes | |
// uses gCode because it's the biggest of all codes | |
codesSize = (gCode | gCode<<8 | gCode<<16) + 1 | |
// size of the buffer for writing to file | |
maxBufferSize = 1024 * 1024 * 30 | |
// max line size for sequence | |
maxLineSize = 60 | |
// suffixes ta add to sequence id for each frame | |
suffixes = "123456" | |
) | |
var letterCode = map[byte]uint8{ | |
'A': aCode, | |
'C': cCode, | |
'T': tCode, | |
'G': gCode, | |
'N': nCode, | |
'U': uCode, | |
} | |
// create the code map according to the selected table code | |
func createCodes(code int, clean bool) ([]byte, error) { | |
resultMap := map[uint32]byte{} | |
twoLetterMap := map[string][]byte{} | |
tmpCode := make([]uint8, 4) | |
codeMap, err := ncbicode.LoadTableCode(code) | |
if err != nil { | |
return nil, err | |
} | |
for codon, aaCode := range codeMap { | |
// generate 3 letter code | |
tmpCode[0] = letterCode[codon[0]] | |
tmpCode[1] = letterCode[codon[1]] | |
tmpCode[2] = letterCode[codon[2]] | |
// each codon is represented by an unique uint32: | |
// each possible nucleotide is represented by an uint8 (255 possibility) | |
// the three first bytes are the the code for each nucleotide | |
// last byte is unused ( eq to uint8(0) ) | |
// example: | |
// codon 'ACG' ==> uint32(aCode) | uint32(cCode)<<8 | uint32(gCode)<<16 | |
uint32Code := uint32(tmpCode[0]) | uint32(tmpCode[1])<<8 | uint32(tmpCode[2])<<16 | |
resultMap[uint32Code] = aaCode | |
// generate 2 letter code | |
if codes, ok := twoLetterMap[codon[0:2]]; ok { | |
twoLetterMap[codon[0:2]] = append(codes, aaCode) | |
} else { | |
twoLetterMap[codon[0:2]] = []byte{aaCode} | |
} | |
} | |
for twoLetterCodon, codes := range twoLetterMap { | |
uniqueAA := true | |
for _, c := range codes { | |
if c != codes[0] { | |
uniqueAA = false | |
break | |
} | |
} | |
if uniqueAA { | |
first := letterCode[twoLetterCodon[0]] | |
second := letterCode[twoLetterCodon[1]] | |
uint32Code := uint32(first) | uint32(second)<<8 | |
resultMap[uint32Code] = codes[0] | |
} | |
} | |
// if clean is specified, we want to replace all '*' by 'X' in the output | |
// sequence, so replace all occurrences of '*' directly in the ref map | |
if clean { | |
for k, v := range resultMap { | |
if v == stopByte { | |
resultMap[k] = unknown | |
} | |
} | |
} | |
r := make([]byte, codesSize) | |
for k, v := range resultMap { | |
r[k] = v | |
} | |
return r, nil | |
} | |
func computeFrames(frameName string) ([]int, bool, error) { | |
frames := make([]int, 6) | |
reverse := false | |
switch frameName { | |
case "1": | |
frames[0] = 1 | |
case "2": | |
frames[1] = 1 | |
case "3": | |
frames[2] = 1 | |
case "F": | |
frames[0] = 1 | |
frames[1] = 1 | |
frames[2] = 1 | |
case "-1": | |
frames[3] = 1 | |
reverse = true | |
case "-2": | |
frames[4] = 1 | |
reverse = true | |
case "-3": | |
frames[5] = 1 | |
reverse = true | |
case "R": | |
frames[3] = 1 | |
frames[4] = 1 | |
frames[5] = 1 | |
reverse = true | |
case "6": | |
for i := range frames { | |
frames[i] = 1 | |
} | |
reverse = true | |
default: | |
return frames, reverse, fmt.Errorf("wrong value for -f | --frame parameter: %s", frameName) | |
} | |
return frames, reverse, nil | |
} | |
func writeOrCancel(w writer, out io.Writer, errs chan error, | |
cancel context.CancelFunc) { | |
if _, err := out.Write(w.buf.Bytes()); err != nil { | |
select { | |
case errs <- fmt.Errorf("fail to write to output file: %v", err): | |
default: | |
} | |
cancel() | |
return | |
} | |
} | |
func getComplexityAndAlternate(starts []int, frames []int, | |
frameIndex int, sequence encodedSequence, idSize int, w writer, | |
codes []byte, nuclSeqLength int, options Options, reverse bool) { | |
for _, startPos := range starts { | |
if frames[frameIndex] == 0 { | |
frameIndex++ | |
continue | |
} | |
// sequence id should look like | |
// >sequenceID_<frame> comment | |
idEnd := bytes.IndexByte(sequence[4:idSize], ' ') | |
if idEnd != -1 { | |
w.buf.Write(sequence[4 : 4+idEnd]) | |
w.buf.WriteByte('_') | |
w.buf.WriteByte(suffixes[frameIndex]) | |
w.buf.Write(sequence[4+idEnd : idSize]) | |
} else { | |
w.buf.Write(sequence[4:idSize]) | |
w.buf.WriteByte('_') | |
w.buf.WriteByte(suffixes[frameIndex]) | |
} | |
w.newLine() | |
// if in trim mode, nb of bytes to trim (nb of successive 'X', '*' and '\n' | |
// from right end of the sequence) | |
w.toTrim = 0 | |
w.clen = 0 | |
// read the sequence 3 letters at a time, starting at a specific position | |
// corresponding to the frame | |
for pos := startPos + 2 + idSize; pos < len(sequence); pos += 3 { | |
if w.clen == maxLineSize { | |
w.newLine() | |
} | |
// create an uint32 from the codon, to retrieve the corresponding | |
// AA from the map | |
codonCode := uint32(sequence[pos-2]) | uint32(sequence[pos-1])<<8 | uint32(sequence[pos])<<16 | |
if b := codes[codonCode]; b != byte(0) { | |
w.addByte(b) | |
} else { | |
w.addUnknown() | |
} | |
} | |
// the last codon is only 2 nucleotid long, try to guess | |
// the corresponding AA | |
if (nuclSeqLength-startPos)%3 == 2 { | |
if w.clen == maxLineSize { | |
w.newLine() | |
} | |
codonCode := uint32(sequence[len(sequence)-2]) | uint32(sequence[len(sequence)-1])<<8 | |
if b := codes[codonCode]; b == byte(0) { | |
w.addUnknown() | |
} else { | |
w.addByte(b) | |
} | |
} | |
// the last codon is only 1 nucleotid long, no way to guess | |
// the corresponding AA | |
if (nuclSeqLength-startPos)%3 == 1 { | |
if w.clen == maxLineSize { | |
w.newLine() | |
} | |
w.addUnknown() | |
} | |
if options.Trim && w.toTrim > 0 { | |
// remove the last bytesToTrim bytes of the buffer | |
// as they are 'X', '*' or '\n' | |
w.buf.Truncate(w.buf.Len() - w.toTrim) | |
w.clen -= w.toTrim | |
} | |
if w.clen != 0 { | |
w.newLine() | |
} | |
frameIndex++ | |
} | |
if reverse && frameIndex < 6 { | |
// get the complementary sequence. | |
// Basically, switch | |
// A <-> T | |
// C <-> G | |
// N is not modified | |
for i, n := range sequence[idSize:] { | |
switch n { | |
case aCode: | |
sequence[i+idSize] = tCode | |
case tCode: | |
// handle both tCode and uCode | |
sequence[i+idSize] = aCode | |
case cCode: | |
sequence[i+idSize] = gCode | |
case gCode: | |
sequence[i+idSize] = cCode | |
} | |
} | |
// reverse the sequence | |
for i, j := idSize, len(sequence)-1; i < j; i, j = i+1, j-1 { | |
sequence[i], sequence[j] = sequence[j], sequence[i] | |
} | |
if !options.Alternative { | |
// Staden convention: Frame -1 is the reverse-complement of the sequence | |
// having the same codon phase as frame 1. Frame -2 is the same phase as | |
// frame 2. Frame -3 is the same phase as frame 3 | |
// | |
// use the matrix to keep track of the forward frame as it depends on the | |
// length of the sequence | |
switch nuclSeqLength % 3 { | |
case 0: | |
starts = []int{0, 2, 1} | |
case 1: | |
starts = []int{1, 0, 2} | |
case 2: | |
starts = []int{2, 1, 0} | |
} | |
} | |
// run the same loop, but with the reverse-complemented sequence | |
getComplexityAndAlternate(starts, frames, frameIndex, sequence, idSize, | |
w, codes, nuclSeqLength, options, reverse) | |
} | |
} | |
func worker(wg *sync.WaitGroup, fnaSeqs chan encodedSequence, | |
ctx context.Context, frames []int, codes []byte, | |
options Options, reverse bool, out io.Writer, errs chan error, | |
cancel context.CancelFunc) { | |
defer wg.Done() | |
w := &writer{ | |
buf: bytes.NewBuffer(nil), | |
} | |
for s := range fnaSeqs { | |
select { | |
case <-ctx.Done(): | |
return | |
default: | |
} | |
starts := []int{0, 1, 2} | |
idSize := int(binary.LittleEndian.Uint32(s[0:4])) | |
nuclSeqLength := len(s) - idSize | |
getComplexityAndAlternate(starts, frames, 0, | |
s, idSize, *w, codes, nuclSeqLength, options, reverse) | |
if w.buf.Len() > maxBufferSize { | |
writeOrCancel(*w, out, errs, cancel) | |
w.buf.Reset() | |
} | |
pool.Put(s) | |
} | |
if w.buf.Len() > 0 { | |
writeOrCancel(*w, out, errs, cancel) | |
} | |
} | |
// Translate read a fata file, translate each sequence to the corresponding prot sequence in the specified frame | |
func Translate(inputSequence io.Reader, out io.Writer, options Options) error { | |
codes, err := createCodes(options.Table, options.Clean) | |
if err != nil { | |
return err | |
} | |
frames, reverse, err := computeFrames(options.Frame) | |
if err != nil { | |
return err | |
} | |
fnaSeqs := make(chan encodedSequence, 10) | |
errs := make(chan error, 1) | |
ctx, cancel := context.WithCancel(context.Background()) | |
defer cancel() | |
var wg sync.WaitGroup | |
wg.Add(options.NumWorker) | |
for nWorker := 0; nWorker < options.NumWorker; nWorker++ { | |
go func() { | |
worker(&wg, fnaSeqs, ctx, frames, codes, options, | |
reverse, out, errs, cancel) | |
}() | |
} | |
readSequenceFromFasta(ctx, inputSequence, fnaSeqs) | |
wg.Wait() | |
select { | |
case err, ok := <-errs: | |
if ok { | |
return err | |
} | |
default: | |
} | |
return nil | |
} | |
func readSequenceFromFasta(ctx context.Context, inputSequence io.Reader, | |
fnaSeqs chan encodedSequence) { | |
feeder := &fastaChannelFeeder{ | |
idBuffer: bytes.NewBuffer(nil), | |
commentBuffer: bytes.NewBuffer(nil), | |
sequenceBuffer: bytes.NewBuffer(nil), | |
fastaChan: fnaSeqs, | |
} | |
// fasta format is: | |
// | |
// >sequenceID some comments on sequence | |
// ACAGGCAGAGACACGACAGACGACGACACAGGAGCAGACAGCAGCAGACGACCACATATT | |
// TTTGCGGTCACATGACGACTTCGGCAGCGA | |
// | |
// see https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=BlastHelp | |
// section 1 for details | |
scanner := bufio.NewScanner(inputSequence) | |
Loop: | |
for scanner.Scan() { | |
line := scanner.Bytes() | |
if len(line) == 0 { | |
continue | |
} | |
if line[0] == '>' { | |
if feeder.idBuffer.Len() > 0 { | |
select { | |
case <-ctx.Done(): | |
break Loop | |
default: | |
} | |
feeder.sendFasta() | |
} | |
feeder.reset() | |
// parse the ID of the sequence. ID is formatted like this: | |
// >sequenceID comments | |
seqID := bytes.SplitN(line, []byte{' '}, 2) | |
feeder.idBuffer.Write(seqID[0]) | |
if len(seqID) > 1 { | |
feeder.commentBuffer.WriteByte(' ') | |
feeder.commentBuffer.Write(seqID[1]) | |
} | |
} else { | |
// if the line doesn't start with '>', then it's a part of the | |
// nucleotide sequence, so write it to the buffer | |
feeder.sequenceBuffer.Write(line) | |
} | |
} | |
// don't forget to push last sequence | |
select { | |
case <-ctx.Done(): | |
default: | |
feeder.sendFasta() | |
} | |
close(fnaSeqs) | |
} | |
// a type to hold an encoded fasta sequence | |
// | |
// s[0:4] stores the size of the sequence id + the size of the comment as an uint32 (little endian) | |
// s[4:idSize] stores the sequence id, and the comment id there is one | |
// s[idSize:] stores the nucl sequence | |
type encodedSequence []byte | |
var pool = sync.Pool{ | |
New: func() interface{} { | |
return make(encodedSequence, 512) | |
}, | |
} | |
func getSizedSlice(idSize, requiredSize int) encodedSequence { | |
s := pool.Get().(encodedSequence) | |
binary.LittleEndian.PutUint32(s[0:4], uint32(idSize)) | |
for len(s) < requiredSize { | |
s = append(s, byte(0)) | |
} | |
return s[0:requiredSize] | |
} | |
func (f *fastaChannelFeeder) sendFasta() { | |
idSize := 4 + f.idBuffer.Len() + f.commentBuffer.Len() | |
requiredSize := idSize + f.sequenceBuffer.Len() | |
s := getSizedSlice(idSize, requiredSize) | |
if f.commentBuffer.Len() > 0 { | |
copy(s[idSize-f.commentBuffer.Len():idSize], f.commentBuffer.Bytes()) | |
} | |
copy(s[4:4+f.idBuffer.Len()], f.idBuffer.Bytes()) | |
// convert the sequence of bytes to an array of uint8 codes, | |
// so a codon (3 nucleotides | 3 bytes ) can be represented | |
// as an uint32 | |
for i, b := range f.sequenceBuffer.Bytes() { | |
switch b { | |
case 'A': | |
s[i+idSize] = aCode | |
case 'C': | |
s[i+idSize] = cCode | |
case 'G': | |
s[i+idSize] = gCode | |
case 'T', 'U': | |
s[i+idSize] = tCode | |
case 'N': | |
s[i+idSize] = nCode | |
default: | |
fmt.Printf("WARNING: invalid char in sequence %s: %s, ignoring", s[4:4+idSize], string(b)) | |
} | |
} | |
f.fastaChan <- s | |
} | |
type fastaChannelFeeder struct { | |
idBuffer *bytes.Buffer | |
commentBuffer *bytes.Buffer | |
sequenceBuffer *bytes.Buffer | |
fastaChan chan encodedSequence | |
} | |
func (f *fastaChannelFeeder) reset() { | |
f.idBuffer.Reset() | |
f.sequenceBuffer.Reset() | |
f.commentBuffer.Reset() | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package transeq | |
import "bytes" | |
const ( | |
stopByte = '*' | |
unknown = 'X' | |
) | |
type writer struct { | |
buf *bytes.Buffer | |
clen int | |
toTrim int | |
} | |
func newWriter(buf []byte) *writer { | |
return &writer{buf: bytes.NewBuffer(buf)} | |
} | |
func (w *writer) addByte(b byte) { | |
w.buf.WriteByte(b) | |
w.clen++ | |
if b == stopByte || b == unknown { | |
w.toTrim++ | |
} else { | |
w.toTrim = 0 | |
} | |
} | |
func (w *writer) addUnknown() { | |
w.buf.WriteByte(unknown) | |
w.clen++ | |
w.toTrim++ | |
} | |
func (w *writer) newLine() { | |
w.buf.WriteByte('\n') | |
w.clen = 0 | |
w.toTrim++ | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment