Skip to content

Instantly share code, notes, and snippets.

@lh3
Last active February 9, 2016 18:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lh3/d11331bdf4d625fdffac to your computer and use it in GitHub Desktop.
Save lh3/d11331bdf4d625fdffac to your computer and use it in GitHub Desktop.

This gist implements a proof-of-concept web server to provide remote access to multiple BAMs. You can launch the server with:

go run hts-server.go -e /path/to/samtools bam-dir1 bam-dir2

and test it with:

curl -s http://127.0.0.1:8000/

The server regards bam-dir1 and bam-dir2 as study accessions. For a BAM file bam-dir1/myfile1.bam, the file accession is myfile1. Every file must have a unique base name across all input directories — that is, two files may not share the same name even when they live in different directories.

#!/usr/bin/perl
use strict;
use warnings;

# Minimal client for the bam-server demo: discover the file accession for a
# (study, sample) pair via /search, then stream SAM for a region via /get.
my $srv = 'http://bamsvr.herokuapp.com';
my ($study, $sample, $chr, $start, $end) = ('SGDP', 'S_French-2', '11', 10890000, 10900000);
# file discovery
my @ac = split /[\t\n]+/, `curl -sd "study=$study&sample=$sample" "$srv/search"|cut -f1|sort|uniq`;
# retrieve alignment data
if (@ac == 1) { # one and only one accession
    # Use the low-precedence "or die": the original "open FH, EXPR || die"
    # applied || to the (always-true) command string, so a failed open was
    # never reported. The three-argument '-|' open also avoids parsing the
    # trailing '|' out of the command string.
    open(my $fh, '-|', qq(curl -sd "ac=$ac[0]&chr=$chr&start=$start&end=$end&fmt=sam" "$srv/get"))
        or die "cannot start curl: $!";
    print while <$fh>;
    close($fh);
}
package main
import (
"os"
"os/exec"
"io"
"fmt"
"strings"
"time"
"path/filepath"
"bufio"
"regexp"
"net/http"
"strconv"
)
/****************
* BSD getopt() *
****************/
// optind is the index in args of the next element to be scanned; after
// getopt returns -1 it points at the first non-option argument.
var optind int = 1
// getopt_place is the position inside the current bundled option word
// (e.g. "-ab"); -1 means "no word in progress, start a fresh one".
var getopt_place int = -1
// getopt is a minimal port of BSD getopt(3). It scans args (argv-style,
// args[0] being the program name) for single-character options listed in
// ostr, where a character followed by ':' takes an argument. It returns the
// option character plus its argument ("" if none), '?' for an unknown
// option (or ':' for a missing argument when ostr begins with ':'), and -1
// once the option list is exhausted. Scanning state lives in the
// package-level optind/getopt_place variables, so this is not reentrant.
func getopt(args []string, ostr string) (int, string) {
if getopt_place == -1 { // update scanning pointer
if optind >= len(args) || args[optind][0] != '-' {
getopt_place = -1
return -1, ""
}
if optind < len(args) {
getopt_place = 0
}
if getopt_place + 1 < len(args[optind]) {
getopt_place += 1
if args[optind][getopt_place] == '-' { // found "--"
optind += 1
getopt_place = -1
return -1, ""
}
}
}
optopt := args[optind][getopt_place]; // current option character
getopt_place += 1
oli, arg := strings.IndexByte(ostr, optopt), ""; // position of optopt in ostr, or -1
if optopt == ':' || oli < 0 { // option character not described by ostr
if optopt == '-' {
return -1, ""
}
if getopt_place < 0 {
optind += 1
}
return '?', ""
}
if oli + 1 >= len(ostr) || ostr[oli+1] != ':' { // option takes no argument
if getopt_place < 0 || getopt_place >= len(args[optind]) {
optind += 1 // word fully consumed; advance to the next argv element
getopt_place = -1
}
} else { // option requires an argument
if getopt_place >= 0 && getopt_place < len(args[optind]) {
arg = args[optind][getopt_place:] // argument glued to the option, e.g. "-p8000"
} else if optind += 1; len(args) <= optind { // no arguments
getopt_place = -1
if len(ostr) > 0 && ostr[0] == ':' {
return ':', ""
}
return '?', ""
} else {
arg = args[optind] // argument is the following argv element
}
getopt_place = -1
optind += 1
}
return int(optopt), arg
}
/***************
* BAM Crawler *
***************/
// Path of the samtools executable; overridden by the -e command-line option.
var hs_samtools = "./samtools";
// Metadata describes one (file accession, sample) pair discovered by hs_crawl.
type Metadata struct {
ac string // file accession: BAM file name without directory or extension
study string // study accession: base name of the directory holding the BAM
sample string // sample name taken from an @RG SM: tag in the BAM header
format string // file extension, e.g. "bam"
asm string // reference assembly (hard-coded to "GRCh37" by hs_crawl)
path string // path of the BAM file on disk
}
// Global metadata table, filled once by hs_crawl() at startup and then only
// read by the HTTP handlers.
var hs_meta []Metadata;
// hs_crawl scans every directory in dirs for *.bam files and builds the
// metadata table served by /search and /get. The directory base name acts
// as the study accession and the file name (minus extension) as the file
// accession; sample names are read from the @RG SM: tags printed by
// "samtools view -H". One Metadata entry is produced per (accession,
// sample) pair. An accession seen in more than one directory is reported
// on stderr and the later file is skipped.
func hs_crawl(dirs []string) []Metadata {
	var meta []Metadata
	re := regexp.MustCompile("^@RG.*\tSM:(\\S+)")                 // captures the sample name
	re_fn := regexp.MustCompile("([^\\s\\./]+)\\.([^\\s\\./]+)$") // captures (accession, extension)
	owner := make(map[string]string)                              // accession -> study, for duplicate detection
	for _, dir := range dirs {                                    // traverse each directory
		study := filepath.Base(dir)
		bams, _ := filepath.Glob(dir + "/*.bam") // all BAM files in the directory
		for _, bam := range bams {
			var ac, format string
			if m := re_fn.FindStringSubmatch(bam); len(m) > 1 {
				ac, format = m[1], m[2]
			}
			if prev, dup := owner[ac]; dup {
				fmt.Fprintf(os.Stderr, "WARNING: duplicated accessions: %s/%s <=> %s/%s\n", prev, ac, study, ac)
				continue
			}
			owner[ac] = study
			cmd := exec.Command(hs_samtools, "view", "-H", bam) // extract the BAM header
			pipe, err := cmd.StdoutPipe()
			if err != nil { // previously ignored; skip the file instead of scanning a nil pipe
				fmt.Fprintf(os.Stderr, "WARNING: %s: cannot open pipe: %v\n", bam, err)
				continue
			}
			if err := cmd.Start(); err != nil { // previously ignored
				fmt.Fprintf(os.Stderr, "WARNING: cannot run %s: %v\n", hs_samtools, err)
				continue
			}
			// Collect distinct sample names. The set fixes the former FIXME
			// about the O(n^2) linear membership scan; the slice preserves
			// first-seen order for deterministic output.
			seen := make(map[string]bool)
			var samples []string
			sc := bufio.NewScanner(pipe)
			for sc.Scan() { // read by line
				if m := re.FindStringSubmatch(sc.Text()); len(m) > 1 && !seen[m[1]] {
					seen[m[1]] = true
					samples = append(samples, m[1])
				}
			}
			cmd.Wait()
			for _, s := range samples {
				meta = append(meta, Metadata{ac, study, s, format, "GRCh37", bam})
			}
		}
	}
	return meta
}
/*******************
* Query Callbacks *
*******************/
func hs_help(w http.ResponseWriter, r *http.Request) {
fmt.Fprintf(w, "This is a bam-server for demonstration purposes only. It is hosting four BAMs\n");
fmt.Fprintf(w, "in a 1Mb region 11:10,000,001-11,000,000 from GRCh37. The server has two entry\n");
fmt.Fprintf(w, "points: /search to discover file accessions and /get to retrieve data (optionally\n");
fmt.Fprintf(w, "in a region) when we know the file accession(s). The following gives a few examples\n");
fmt.Fprintf(w, "about how to retrieve data from this web server.\n\n");
fmt.Fprintf(w, "Examples in a web browser:\n\n");
fmt.Fprintf(w, " * List of studies and samples:\n");
fmt.Fprintf(w, " http://%s/search\n\n", r.Host);
fmt.Fprintf(w, " * Retrieve alignments for SGDP sample S_French-2 and S_Mbuti-1 in a small region:\n");
fmt.Fprintf(w, " http://%s/search?study=SGDP&sample=S_French-2,S_Mbuti-1\n", r.Host);
fmt.Fprintf(w, " http://%s/get?fmt=sam&ac=EXA00001,EXA00002&chr=11&start=10899000&end=10900000\n\n", r.Host);
fmt.Fprintf(w, "Command-line examples:\n\n");
fmt.Fprintf(w, " * List of studies and samples:\n");
fmt.Fprintf(w, " curl -s 'http://%s/search' > study-sample.txt\n\n", r.Host);
fmt.Fprintf(w, " * Merged BAM:\n");
fmt.Fprintf(w, " curl -s 'http://%s/get?ac=EXA00001,EXA00002&chr=11&start=10300000&end=10500000' > merged.bam\n\n", r.Host);
fmt.Fprintf(w, " * BAM download without server-side re-coding (one accession; no region conditions):\n");
fmt.Fprintf(w, " wget 'http://%s/get?ac=EXA00001'\n\n", r.Host);
fmt.Fprintf(w, " * BAM retrieval without server-side re-coding:\n");
fmt.Fprintf(w, " samtools view -b 'http://%s/get?ac=EXA00001' 11:10300000-10500000 > S_French-2_part.bam\n\n", r.Host);
fmt.Fprintf(w, " * View alignment with samtools tview (one sample; no region conditions):\n");
fmt.Fprintf(w, " samtools tview 'http://%s/get?ac=EXA00001'\n\n", r.Host);
}
// hs_collect_strings splits every element of a on commas and returns the
// union of all pieces as a set (each present key maps to 1). It is used to
// normalize repeated and comma-separated form parameters.
func hs_collect_strings(a []string) map[string]int {
	set := make(map[string]int)
	for _, csv := range a {
		for _, item := range strings.Split(csv, ",") {
			set[item] = 1
		}
	}
	return set
}
// hs_test_hash reports whether key is a member of h. An empty (or nil) set
// matches everything, i.e. "no filter supplied".
func hs_test_hash(h map[string]int, key string) bool {
	if len(h) == 0 {
		return true
	}
	_, ok := h[key]
	return ok
}
// hs_search implements /search: it writes one tab-separated line
// (ac, study, sample, format, assembly) for every metadata record that
// passes the optional ac/study/sample filters; an omitted filter matches
// every record.
func hs_search(w http.ResponseWriter, r *http.Request) {
	r.ParseForm()
	wantAC := hs_collect_strings(r.Form["ac"])
	wantStudy := hs_collect_strings(r.Form["study"])
	wantSample := hs_collect_strings(r.Form["sample"])
	for _, m := range hs_meta {
		if !hs_test_hash(wantAC, m.ac) || !hs_test_hash(wantStudy, m.study) || !hs_test_hash(wantSample, m.sample) {
			continue
		}
		fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\n", m.ac, m.study, m.sample, m.format, m.asm)
	}
}
// hs_get implements /get: it streams alignment data for one or more file
// accessions ('ac', comma-separated), optionally restricted to a region
// (chr/start/end) and re-coded to the format named by 'fmt' ("bam" by
// default; "sam" for text). A trailing ".bai" on the raw query string
// requests the index file instead of the data. A single-accession,
// whole-file BAM request is streamed straight from disk, honoring the
// start offset of a byte-range header; anything else is piped through
// "samtools merge".
func hs_get(w http.ResponseWriter, r *http.Request) {
	// Test if the user is requesting the .bai index of a BAM.
	isBAI := false
	if q := r.URL.RawQuery; len(q) > 4 && strings.HasSuffix(q, ".bai") {
		isBAI = true
		r.URL.RawQuery = q[:len(q)-4]
	}
	r.ParseForm()
	if len(r.Form["ac"]) == 0 {
		http.Error(w, "400 Bad Request: 'ac' MUST BE present", 400)
		return
	}
	ac := hs_collect_strings(r.Form["ac"])
	// Build the samtools region string "chr:start-end"; start defaults to 1.
	region := ""
	if len(r.Form["chr"]) > 0 {
		region = r.Form["chr"][0]
		if len(r.Form["start"]) > 0 {
			region += ":" + r.Form["start"][0]
		} else {
			region += ":1"
		}
		if len(r.Form["end"]) > 0 {
			region += "-" + r.Form["end"][0]
		}
	}
	// Resolve the requested accessions to file paths. hs_meta holds one row
	// per (accession, sample), so deduplicate on the accession — otherwise a
	// multi-sample BAM would be handed to "samtools merge" several times and
	// its reads duplicated in the output.
	var files, acs []string
	taken := make(map[string]bool)
	for _, m := range hs_meta {
		if _, wanted := ac[m.ac]; !wanted || taken[m.ac] {
			continue
		}
		taken[m.ac] = true
		files = append(files, m.path)
		acs = append(acs, m.ac)
	}
	if len(files) == 0 {
		http.Error(w, "400 Bad Request: non-existing 'ac'", 400)
		return
	}
	// Set content-type, plus a download file name for BAM output.
	outfmt := "bam"
	if len(r.Form["fmt"]) > 0 {
		outfmt = r.Form["fmt"][0]
	}
	if outfmt == "sam" {
		w.Header().Set("Content-Type", "text/plain")
	} else if outfmt == "bam" {
		w.Header().Set("Content-Type", "application/octet-stream")
		ext := "." + outfmt
		if isBAI {
			ext = ".bai"
		}
		w.Header().Set("Content-Disposition", "attachment; filename=\""+strings.Join(acs, ",")+ext+"\"")
	}
	// Write results.
	if len(files) == 1 && region == "" && outfmt == "bam" { // copy through
		var offset int64
		hasRange := false
		fn := files[0]
		if isBAI {
			fn += ".bai"
		} else if rng := r.Header.Get("Range"); rng != "" { // FIXME: the end of the range is still ignored
			hasRange = true
			re := regexp.MustCompile(`bytes=(\d+)-`)
			if m := re.FindStringSubmatch(rng); len(m) >= 2 {
				offset, _ = strconv.ParseInt(m[1], 10, 64)
			}
		}
		// The open error used to be discarded, so a missing file (e.g. a BAM
		// without a .bai) crashed the handler with a nil dereference. Report
		// 404 instead — before any status/headers have been written.
		f, err := os.Open(fn)
		if err != nil {
			http.Error(w, "404 Not Found: cannot open data file", 404)
			return
		}
		defer f.Close()
		if hasRange {
			w.WriteHeader(206) // Partial Content
		}
		if offset > 0 {
			f.Seek(offset, io.SeekStart) // io.SeekStart replaces deprecated os.SEEK_SET
		}
		io.Copy(w, f) // replaces the manual 64 KiB read/write loop
	} else { // use "samtools merge"; this requires relatively recent samtools
		args := []string{"merge"}
		if region != "" {
			args = append(args, "-R"+region)
		}
		args = append(args, "-1O"+outfmt, "-")
		args = append(args, files...)
		cmd := exec.Command(hs_samtools, args...)
		cmd.Stdout = w
		cmd.Run()
	}
}
// main parses the command line, indexes the BAM directories given as
// arguments, and serves /, /search and /get on the configured port
// ($PORT or -p, default 8000).
func main() {
	port := "8000"
	if env := os.Getenv("PORT"); env != "" {
		port = env
	}
	// Parse command-line options.
	for {
		opt, arg := getopt(os.Args, "e:p:")
		if opt < 0 {
			break
		}
		switch opt {
		case 'p':
			port = arg
		case 'e':
			hs_samtools = arg
		}
	}
	if optind == len(os.Args) { // no directories given: print usage and quit
		fmt.Fprintln(os.Stderr, "Usage: hts-server [options] <dir1> [...]")
		fmt.Fprintln(os.Stderr, "Options:")
		fmt.Fprintf(os.Stderr, " -p INT port number [%s or from $PORT env]\n", port)
		fmt.Fprintf(os.Stderr, " -e FILE samtools executable [%s]\n", hs_samtools)
		os.Exit(1)
	}
	hs_meta = hs_crawl(os.Args[optind:])
	fmt.Fprintf(os.Stderr, "[%d] launched at port %s\n", time.Now().UnixNano(), port)
	http.HandleFunc("/", hs_help)
	http.HandleFunc("/search", hs_search)
	http.HandleFunc("/get", hs_get)
	http.ListenAndServe(fmt.Sprintf(":%s", port), nil)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment