Skip to content

Instantly share code, notes, and snippets.

@esote
Created January 8, 2019 07:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save esote/24e6c0e641e1af9502f1c3a0caf03c11 to your computer and use it in GitHub Desktop.
Save esote/24e6c0e641e1af9502f1c3a0caf03c11 to your computer and use it in GitHub Desktop.
package transeq
import (
"bufio"
"bytes"
"context"
"encoding/binary"
"fmt"
"io"
"sync"
"github.com/feliixx/gotranseq/ncbicode"
)
// Options struct to store command line args
type Options struct {
Required `group:"required"`
Optional `group:"optional"`
General `group:"general"`
}
// Required struct to store required command line args
type Required struct {
Sequence string `short:"s" long:"sequence" value-name:"<filename>" description:"Nucleotide sequence(s) filename"`
Outseq string `short:"o" long:"outseq" value-name:"<filename>" description:"Protein sequence filename"`
}
// Optional struct to store required command line args
type Optional struct {
Frame string `short:"f" long:"frame" value-name:"<code>" description:"Frame to translate. Possible values:\n [1, 2, 3, F, -1, -2, -3, R, 6]\n F: forward three frames\n R: reverse three frames\n 6: all 6 frames\n" default:"1"`
Table int `short:"t" long:"table" value-name:"<code>" description:"NCBI code to use, see https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?chapter=tgencodes#SG1 for details. Available codes: \n 0: Standard code\n 2: The Vertebrate Mitochondrial Code\n 3: The Yeast Mitochondrial Code\n 4: The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code\n 5: The Invertebrate Mitochondrial Code\n 6: The Ciliate, Dasycladacean and Hexamita Nuclear Code\n 9: The Echinoderm and Flatworm Mitochondrial Code\n 10: The Euplotid Nuclear Code\n 11: The Bacterial, Archaeal and Plant Plastid Code\n 12: The Alternative Yeast Nuclear Code\n 13: The Ascidian Mitochondrial Code\n 14: The Alternative Flatworm Mitochondrial Code\n16: Chlorophycean Mitochondrial Code\n 21: Trematode Mitochondrial Code\n22: Scenedesmus obliquus Mitochondrial Code\n 23: Thraustochytrium Mitochondrial Code\n 24: Pterobranchia Mitochondrial Code\n 25: Candidate Division SR1 and Gracilibacteria Code\n 26: Pachysolen tannophilus Nuclear Code\n 29: Mesodinium Nuclear\n 30: Peritrich Nuclear\n" default:"0"`
Clean bool `short:"c" long:"clean" description:"Replace stop codon '*' by 'X'"`
Alternative bool `short:"a" long:"alternative" description:"Define frame '-1' as using the set of codons starting with the last codon of the sequence"`
Trim bool `short:"T" long:"trim" description:"Removes all 'X' and '*' characters from the right end of the translation. The trimming process starts at the end and continues until the next character is not a 'X' or a '*'"`
NumWorker int `short:"n" long:"numcpu" value-name:"<n>" description:"Number of threads to use, default is number of CPU"`
}
// General struct to store required command line args
type General struct {
Help bool `short:"h" long:"help" description:"Show this help message"`
Version bool `short:"v" long:"version" description:"Print the tool version and exit"`
}
const (
nCode = iota
aCode
cCode
tCode
uCode
gCode
// Length of the array to store code/bytes
// uses gCode because it's the biggest of all codes
codesSize = (gCode | gCode<<8 | gCode<<16) + 1
// size of the buffer for writing to file
maxBufferSize = 1024 * 1024 * 30
// max line size for sequence
maxLineSize = 60
// suffixes ta add to sequence id for each frame
suffixes = "123456"
)
var letterCode = map[byte]uint8{
'A': aCode,
'C': cCode,
'T': tCode,
'G': gCode,
'N': nCode,
'U': uCode,
}
// create the code map according to the selected table code
func createCodes(code int, clean bool) ([]byte, error) {
resultMap := map[uint32]byte{}
twoLetterMap := map[string][]byte{}
tmpCode := make([]uint8, 4)
codeMap, err := ncbicode.LoadTableCode(code)
if err != nil {
return nil, err
}
for codon, aaCode := range codeMap {
// generate 3 letter code
tmpCode[0] = letterCode[codon[0]]
tmpCode[1] = letterCode[codon[1]]
tmpCode[2] = letterCode[codon[2]]
// each codon is represented by an unique uint32:
// each possible nucleotide is represented by an uint8 (255 possibility)
// the three first bytes are the the code for each nucleotide
// last byte is unused ( eq to uint8(0) )
// example:
// codon 'ACG' ==> uint32(aCode) | uint32(cCode)<<8 | uint32(gCode)<<16
uint32Code := uint32(tmpCode[0]) | uint32(tmpCode[1])<<8 | uint32(tmpCode[2])<<16
resultMap[uint32Code] = aaCode
// generate 2 letter code
if codes, ok := twoLetterMap[codon[0:2]]; ok {
twoLetterMap[codon[0:2]] = append(codes, aaCode)
} else {
twoLetterMap[codon[0:2]] = []byte{aaCode}
}
}
for twoLetterCodon, codes := range twoLetterMap {
uniqueAA := true
for _, c := range codes {
if c != codes[0] {
uniqueAA = false
break
}
}
if uniqueAA {
first := letterCode[twoLetterCodon[0]]
second := letterCode[twoLetterCodon[1]]
uint32Code := uint32(first) | uint32(second)<<8
resultMap[uint32Code] = codes[0]
}
}
// if clean is specified, we want to replace all '*' by 'X' in the output
// sequence, so replace all occurrences of '*' directly in the ref map
if clean {
for k, v := range resultMap {
if v == stopByte {
resultMap[k] = unknown
}
}
}
r := make([]byte, codesSize)
for k, v := range resultMap {
r[k] = v
}
return r, nil
}
func computeFrames(frameName string) ([]int, bool, error) {
frames := make([]int, 6)
reverse := false
switch frameName {
case "1":
frames[0] = 1
case "2":
frames[1] = 1
case "3":
frames[2] = 1
case "F":
frames[0] = 1
frames[1] = 1
frames[2] = 1
case "-1":
frames[3] = 1
reverse = true
case "-2":
frames[4] = 1
reverse = true
case "-3":
frames[5] = 1
reverse = true
case "R":
frames[3] = 1
frames[4] = 1
frames[5] = 1
reverse = true
case "6":
for i := range frames {
frames[i] = 1
}
reverse = true
default:
return frames, reverse, fmt.Errorf("wrong value for -f | --frame parameter: %s", frameName)
}
return frames, reverse, nil
}
func writeOrCancel(w writer, out io.Writer, errs chan error,
cancel context.CancelFunc) {
if _, err := out.Write(w.buf.Bytes()); err != nil {
select {
case errs <- fmt.Errorf("fail to write to output file: %v", err):
default:
}
cancel()
return
}
}
func getComplexityAndAlternate(starts []int, frames []int,
frameIndex int, sequence encodedSequence, idSize int, w writer,
codes []byte, nuclSeqLength int, options Options, reverse bool) {
for _, startPos := range starts {
if frames[frameIndex] == 0 {
frameIndex++
continue
}
// sequence id should look like
// >sequenceID_<frame> comment
idEnd := bytes.IndexByte(sequence[4:idSize], ' ')
if idEnd != -1 {
w.buf.Write(sequence[4 : 4+idEnd])
w.buf.WriteByte('_')
w.buf.WriteByte(suffixes[frameIndex])
w.buf.Write(sequence[4+idEnd : idSize])
} else {
w.buf.Write(sequence[4:idSize])
w.buf.WriteByte('_')
w.buf.WriteByte(suffixes[frameIndex])
}
w.newLine()
// if in trim mode, nb of bytes to trim (nb of successive 'X', '*' and '\n'
// from right end of the sequence)
w.toTrim = 0
w.clen = 0
// read the sequence 3 letters at a time, starting at a specific position
// corresponding to the frame
for pos := startPos + 2 + idSize; pos < len(sequence); pos += 3 {
if w.clen == maxLineSize {
w.newLine()
}
// create an uint32 from the codon, to retrieve the corresponding
// AA from the map
codonCode := uint32(sequence[pos-2]) | uint32(sequence[pos-1])<<8 | uint32(sequence[pos])<<16
if b := codes[codonCode]; b != byte(0) {
w.addByte(b)
} else {
w.addUnknown()
}
}
// the last codon is only 2 nucleotid long, try to guess
// the corresponding AA
if (nuclSeqLength-startPos)%3 == 2 {
if w.clen == maxLineSize {
w.newLine()
}
codonCode := uint32(sequence[len(sequence)-2]) | uint32(sequence[len(sequence)-1])<<8
if b := codes[codonCode]; b == byte(0) {
w.addUnknown()
} else {
w.addByte(b)
}
}
// the last codon is only 1 nucleotid long, no way to guess
// the corresponding AA
if (nuclSeqLength-startPos)%3 == 1 {
if w.clen == maxLineSize {
w.newLine()
}
w.addUnknown()
}
if options.Trim && w.toTrim > 0 {
// remove the last bytesToTrim bytes of the buffer
// as they are 'X', '*' or '\n'
w.buf.Truncate(w.buf.Len() - w.toTrim)
w.clen -= w.toTrim
}
if w.clen != 0 {
w.newLine()
}
frameIndex++
}
if reverse && frameIndex < 6 {
// get the complementary sequence.
// Basically, switch
// A <-> T
// C <-> G
// N is not modified
for i, n := range sequence[idSize:] {
switch n {
case aCode:
sequence[i+idSize] = tCode
case tCode:
// handle both tCode and uCode
sequence[i+idSize] = aCode
case cCode:
sequence[i+idSize] = gCode
case gCode:
sequence[i+idSize] = cCode
}
}
// reverse the sequence
for i, j := idSize, len(sequence)-1; i < j; i, j = i+1, j-1 {
sequence[i], sequence[j] = sequence[j], sequence[i]
}
if !options.Alternative {
// Staden convention: Frame -1 is the reverse-complement of the sequence
// having the same codon phase as frame 1. Frame -2 is the same phase as
// frame 2. Frame -3 is the same phase as frame 3
//
// use the matrix to keep track of the forward frame as it depends on the
// length of the sequence
switch nuclSeqLength % 3 {
case 0:
starts = []int{0, 2, 1}
case 1:
starts = []int{1, 0, 2}
case 2:
starts = []int{2, 1, 0}
}
}
// run the same loop, but with the reverse-complemented sequence
getComplexityAndAlternate(starts, frames, frameIndex, sequence, idSize,
w, codes, nuclSeqLength, options, reverse)
}
}
func worker(wg *sync.WaitGroup, fnaSeqs chan encodedSequence,
ctx context.Context, frames []int, codes []byte,
options Options, reverse bool, out io.Writer, errs chan error,
cancel context.CancelFunc) {
defer wg.Done()
w := &writer{
buf: bytes.NewBuffer(nil),
}
for s := range fnaSeqs {
select {
case <-ctx.Done():
return
default:
}
starts := []int{0, 1, 2}
idSize := int(binary.LittleEndian.Uint32(s[0:4]))
nuclSeqLength := len(s) - idSize
getComplexityAndAlternate(starts, frames, 0,
s, idSize, *w, codes, nuclSeqLength, options, reverse)
if w.buf.Len() > maxBufferSize {
writeOrCancel(*w, out, errs, cancel)
w.buf.Reset()
}
pool.Put(s)
}
if w.buf.Len() > 0 {
writeOrCancel(*w, out, errs, cancel)
}
}
// Translate read a fata file, translate each sequence to the corresponding prot sequence in the specified frame
func Translate(inputSequence io.Reader, out io.Writer, options Options) error {
codes, err := createCodes(options.Table, options.Clean)
if err != nil {
return err
}
frames, reverse, err := computeFrames(options.Frame)
if err != nil {
return err
}
fnaSeqs := make(chan encodedSequence, 10)
errs := make(chan error, 1)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
var wg sync.WaitGroup
wg.Add(options.NumWorker)
for nWorker := 0; nWorker < options.NumWorker; nWorker++ {
go func() {
worker(&wg, fnaSeqs, ctx, frames, codes, options,
reverse, out, errs, cancel)
}()
}
readSequenceFromFasta(ctx, inputSequence, fnaSeqs)
wg.Wait()
select {
case err, ok := <-errs:
if ok {
return err
}
default:
}
return nil
}
func readSequenceFromFasta(ctx context.Context, inputSequence io.Reader,
fnaSeqs chan encodedSequence) {
feeder := &fastaChannelFeeder{
idBuffer: bytes.NewBuffer(nil),
commentBuffer: bytes.NewBuffer(nil),
sequenceBuffer: bytes.NewBuffer(nil),
fastaChan: fnaSeqs,
}
// fasta format is:
//
// >sequenceID some comments on sequence
// ACAGGCAGAGACACGACAGACGACGACACAGGAGCAGACAGCAGCAGACGACCACATATT
// TTTGCGGTCACATGACGACTTCGGCAGCGA
//
// see https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=BlastHelp
// section 1 for details
scanner := bufio.NewScanner(inputSequence)
Loop:
for scanner.Scan() {
line := scanner.Bytes()
if len(line) == 0 {
continue
}
if line[0] == '>' {
if feeder.idBuffer.Len() > 0 {
select {
case <-ctx.Done():
break Loop
default:
}
feeder.sendFasta()
}
feeder.reset()
// parse the ID of the sequence. ID is formatted like this:
// >sequenceID comments
seqID := bytes.SplitN(line, []byte{' '}, 2)
feeder.idBuffer.Write(seqID[0])
if len(seqID) > 1 {
feeder.commentBuffer.WriteByte(' ')
feeder.commentBuffer.Write(seqID[1])
}
} else {
// if the line doesn't start with '>', then it's a part of the
// nucleotide sequence, so write it to the buffer
feeder.sequenceBuffer.Write(line)
}
}
// don't forget to push last sequence
select {
case <-ctx.Done():
default:
feeder.sendFasta()
}
close(fnaSeqs)
}
// a type to hold an encoded fasta sequence
//
// s[0:4] stores the size of the sequence id + the size of the comment as an uint32 (little endian)
// s[4:idSize] stores the sequence id, and the comment id there is one
// s[idSize:] stores the nucl sequence
type encodedSequence []byte
var pool = sync.Pool{
New: func() interface{} {
return make(encodedSequence, 512)
},
}
func getSizedSlice(idSize, requiredSize int) encodedSequence {
s := pool.Get().(encodedSequence)
binary.LittleEndian.PutUint32(s[0:4], uint32(idSize))
for len(s) < requiredSize {
s = append(s, byte(0))
}
return s[0:requiredSize]
}
func (f *fastaChannelFeeder) sendFasta() {
idSize := 4 + f.idBuffer.Len() + f.commentBuffer.Len()
requiredSize := idSize + f.sequenceBuffer.Len()
s := getSizedSlice(idSize, requiredSize)
if f.commentBuffer.Len() > 0 {
copy(s[idSize-f.commentBuffer.Len():idSize], f.commentBuffer.Bytes())
}
copy(s[4:4+f.idBuffer.Len()], f.idBuffer.Bytes())
// convert the sequence of bytes to an array of uint8 codes,
// so a codon (3 nucleotides | 3 bytes ) can be represented
// as an uint32
for i, b := range f.sequenceBuffer.Bytes() {
switch b {
case 'A':
s[i+idSize] = aCode
case 'C':
s[i+idSize] = cCode
case 'G':
s[i+idSize] = gCode
case 'T', 'U':
s[i+idSize] = tCode
case 'N':
s[i+idSize] = nCode
default:
fmt.Printf("WARNING: invalid char in sequence %s: %s, ignoring", s[4:4+idSize], string(b))
}
}
f.fastaChan <- s
}
type fastaChannelFeeder struct {
idBuffer *bytes.Buffer
commentBuffer *bytes.Buffer
sequenceBuffer *bytes.Buffer
fastaChan chan encodedSequence
}
func (f *fastaChannelFeeder) reset() {
f.idBuffer.Reset()
f.sequenceBuffer.Reset()
f.commentBuffer.Reset()
}
package transeq
import "bytes"
const (
stopByte = '*'
unknown = 'X'
)
type writer struct {
buf *bytes.Buffer
clen int
toTrim int
}
func newWriter(buf []byte) *writer {
return &writer{buf: bytes.NewBuffer(buf)}
}
func (w *writer) addByte(b byte) {
w.buf.WriteByte(b)
w.clen++
if b == stopByte || b == unknown {
w.toTrim++
} else {
w.toTrim = 0
}
}
func (w *writer) addUnknown() {
w.buf.WriteByte(unknown)
w.clen++
w.toTrim++
}
func (w *writer) newLine() {
w.buf.WriteByte('\n')
w.clen = 0
w.toTrim++
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment