Skip to content

Instantly share code, notes, and snippets.

@quwubin
Created September 18, 2014 02:28
Show Gist options
  • Save quwubin/fdf9a9b40f4c4fbbeb02 to your computer and use it in GitHub Desktop.
Save quwubin/fdf9a9b40f4c4fbbeb02 to your computer and use it in GitHub Desktop.
A FASTA parser in Go
package main
import (
"bytes"
"fmt"
"log"
"os"
"strings"
"bufio"
"io"
)
// fasta is one record produced by the parser: the two halves of a header
// line plus the sequence text that followed it.
type fasta struct {
	id, desc string // header text before / after the first space
	seq      string // sequence lines joined together
}
// build_fasta assembles a fasta record from a header line (already stripped
// of its leading '>') and the accumulated sequence buffer.
//
// The header is cut at its first space: the part before it becomes the id
// and the remainder becomes the description. A header without a space is
// used whole as the id and the description is left empty.
func build_fasta(header string, seq bytes.Buffer) (record fasta) {
	id, desc := header, ""
	if sp := strings.Index(header, " "); sp >= 0 {
		id, desc = header[:sp], header[sp+1:]
	}
	return fasta{id: id, desc: desc, seq: seq.String()}
}
// parse reads FASTA-formatted text from fastaFh in a background goroutine and
// streams each record over the returned channel. The channel is closed once
// the input is exhausted, so callers can simply range over it.
//
// Lines are trimmed of surrounding whitespace and blank lines are skipped.
// A line starting with '>' begins a new record; every other line is appended
// to the current record's sequence. Sequence lines that appear before the
// first header are emitted as a record with an empty id instead of being
// glued onto the next record. Empty input produces no records.
//
// The channel is unbuffered: the goroutine blocks on each send until the
// caller receives, so a caller that stops early leaves it blocked.
// Read errors other than io.EOF cannot be returned through the channel; they
// are logged, and whatever was parsed up to that point is still delivered.
func parse(fastaFh io.Reader) chan fasta {
	outputChannel := make(chan fasta)
	go func() {
		// Closing tells anyone ranging over the channel that parsing is done.
		defer close(outputChannel)

		var (
			header   string
			seq      bytes.Buffer
			inRecord bool // set once a '>' line has opened the pending record
		)

		// flush emits the pending record, if any, and resets the state.
		// inRecord (rather than header != "") decides whether a record is
		// open, so a bare ">" header is still treated as its own record.
		flush := func() {
			if inRecord || seq.Len() > 0 {
				outputChannel <- build_fasta(header, seq)
			}
			header, inRecord = "", false
			seq.Reset()
		}

		// ReadString has no per-line size cap. bufio.Scanner's default
		// 64 KiB token limit would stop on long single-line sequences.
		reader := bufio.NewReader(fastaFh)
		for {
			raw, err := reader.ReadString('\n')
			// Data may accompany the error (last line without a newline),
			// so the line is processed before err is examined.
			if line := strings.TrimSpace(raw); line != "" {
				if line[0] == '>' {
					flush()
					// Standard FASTA identifiers look like ">id desc".
					header, inRecord = line[1:], true
				} else {
					// Multi-line sequences are joined into one string.
					seq.WriteString(line)
				}
			}
			if err != nil {
				if err != io.EOF {
					log.Printf("fasta: read error, output is truncated: %v", err)
				}
				break
			}
		}
		flush()
	}()
	return outputChannel
}
// main parses the FASTA file named by the first command-line argument and
// prints, for every record, its id, its sequence length and a preview of at
// most the first 100 bytes of the sequence.
func main() {
	// Without this guard os.Args[1] panics when no file is given.
	if len(os.Args) < 2 {
		log.Fatal("usage: pass the path of a FASTA file as the first argument")
	}
	fastaFh, err := os.Open(os.Args[1])
	if err != nil {
		log.Fatal(err)
	}
	defer fastaFh.Close()

	const previewLen = 100
	for record := range parse(fastaFh) {
		// Slicing [:100] unconditionally panics on shorter sequences,
		// so the preview is clamped to the sequence length.
		preview := record.seq
		if len(preview) > previewLen {
			preview = preview[:previewLen]
		}
		fmt.Println(record.id, len(record.seq), preview)
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment