|
package main |
|
|
|
import ( |
|
"os" |
|
"os/exec" |
|
"io" |
|
"fmt" |
|
"strings" |
|
"time" |
|
"path/filepath" |
|
"bufio" |
|
"regexp" |
|
"net/http" |
|
"strconv" |
|
) |
|
|
|
/**************** |
|
* BSD getopt() * |
|
****************/ |
|
|
|
// getopt scanner state (BSD-style): optind is the index of the next argv
// element to examine; getopt_place is the index of the next option character
// inside args[optind], or -1 when a fresh argument must be inspected.
var optind int = 1
var getopt_place int = -1

// getopt scans args for single-character options described by ostr, in the
// manner of BSD getopt(3).  A letter followed by ':' in ostr takes an
// argument, which may be attached ("-p8000") or separate ("-p 8000").
// It returns (option character, argument).  Special returns: (-1, "") when
// the options are exhausted (or at "--" or a bare "-"); ('?', "") for an
// unknown option or a missing argument; (':', "") for a missing argument
// when ostr begins with ':'.  On exit optind indexes the first non-option
// argument.  Not safe for concurrent use (package-level state).
func getopt(args []string, ostr string) (int, string) {
	if getopt_place == -1 { // update scanning pointer
		// an empty argv element is treated as a non-option (the original
		// indexed args[optind][0] unconditionally and could panic)
		if optind >= len(args) || len(args[optind]) == 0 || args[optind][0] != '-' {
			getopt_place = -1
			return -1, ""
		}
		getopt_place = 0
		if getopt_place+1 < len(args[optind]) {
			getopt_place += 1
			if args[optind][getopt_place] == '-' { // found "--": end of options
				optind += 1
				getopt_place = -1
				return -1, ""
			}
		}
	}
	optopt := args[optind][getopt_place]
	getopt_place += 1
	oli, arg := strings.IndexByte(ostr, optopt), ""
	if optopt == ':' || oli < 0 { // unknown option character
		if optopt == '-' {
			return -1, ""
		}
		// BUG FIX: when the bad option is the last character of the argument
		// we must advance to the next argument and reset the scanner.  The
		// original tested "getopt_place < 0", which is never true here, so the
		// next call indexed past the end of args[optind] and panicked.
		if getopt_place >= len(args[optind]) {
			optind += 1
			getopt_place = -1
		}
		return '?', ""
	}
	if oli+1 >= len(ostr) || ostr[oli+1] != ':' { // option takes no argument
		if getopt_place >= len(args[optind]) { // end of this argument group
			optind += 1
			getopt_place = -1
		}
	} else { // option requires an argument
		if getopt_place < len(args[optind]) {
			arg = args[optind][getopt_place:] // attached form: -p8000
		} else if optind += 1; len(args) <= optind { // no argument supplied
			getopt_place = -1
			if len(ostr) > 0 && ostr[0] == ':' {
				return ':', ""
			}
			return '?', ""
		} else {
			arg = args[optind] // separate form: -p 8000
		}
		getopt_place = -1
		optind += 1
	}
	return int(optopt), arg
}
|
|
|
/*************** |
|
* BAM Crawler * |
|
***************/ |
|
|
|
// hs_samtools is the path of the samtools executable used both to read BAM
// headers (hs_crawl) and to slice/merge BAMs server-side (hs_get).
// Overridden by the -e command-line option.
var hs_samtools = "./samtools";

// Metadata describes one sample found in one BAM file; hs_crawl emits one
// record per (file, sample) pair.
type Metadata struct {
	ac     string // accession: the BAM file name without directory or extension
	study  string // study name: the base name of the directory holding the file
	sample string // sample name, from an SM tag on an @RG header line
	format string // file-name extension, e.g. "bam"
	asm    string // genome assembly label (hs_crawl hard-codes "GRCh37")
	path   string // filesystem path of the BAM file
}

// hs_meta is the metadata table served by /search and /get; populated once
// in main() via hs_crawl and only read afterwards.
var hs_meta []Metadata;
|
|
|
func hs_crawl(dirs []string) ([]Metadata) { |
|
var meta []Metadata; |
|
re, _ := regexp.Compile("^@RG.*\tSM:(\\S+)"); |
|
re_fn, _ := regexp.Compile("([^\\s\\./]+)\\.([^\\s\\./]+)$"); |
|
hash := make(map[string]string); |
|
for i := 0; i < len(dirs); i += 1 { // traverse each directory |
|
study := filepath.Base(dirs[i]); |
|
bams, _ := filepath.Glob(dirs[i] + "/*.bam"); // all BAM files in the directory |
|
for j := 0; j < len(bams); j += 1 { |
|
var ac, format string; |
|
match := re_fn.FindStringSubmatch(bams[j]); |
|
if len(match) > 1 { |
|
ac, format = match[1], match[2]; |
|
} |
|
_, ok := hash[ac]; |
|
if ok == true { |
|
fmt.Fprintf(os.Stderr, "WARNING: duplicated accessions: %s/%s <=> %s/%s\n", hash[ac], ac, study, ac); |
|
} else { |
|
hash[ac] = study; |
|
cmd := exec.Command(hs_samtools, "view", "-H", bams[j]); // extract the BAM header |
|
pipe, _ := cmd.StdoutPipe(); |
|
sc := bufio.NewScanner(pipe); |
|
cmd.Start(); |
|
var tmp []string; |
|
for sc.Scan() { // read by line |
|
match := re.FindStringSubmatch(sc.Text()); |
|
if len(match) > 1 { |
|
present := false; |
|
for k := 0 ; k < len(tmp); k += 1 { // FIXME: slooow given many samples in a BAM file; use a hash table instead! |
|
if tmp[k] == match[1] { |
|
present = true; |
|
break; |
|
} |
|
} |
|
if present == false { |
|
tmp = append(tmp, match[1]); |
|
} |
|
} |
|
} |
|
cmd.Wait(); |
|
for k := 0; k < len(tmp); k += 1 { |
|
meta = append(meta, Metadata { ac, study, tmp[k], format, "GRCh37", bams[j] }); |
|
} |
|
} |
|
} |
|
} |
|
return meta; |
|
} |
|
|
|
/******************* |
|
* Query Callbacks * |
|
*******************/ |
|
|
|
// hs_help serves "/" with a plain-text usage page describing the /search and
// /get entry points; example URLs are built from the request's Host header.
func hs_help(w http.ResponseWriter, r *http.Request) {
	// One format string for the whole page; each %s is the server host.
	const help = "This is a bam-server for demonstration purposes only. It is hosting four BAMs\n" +
		"in a 1Mb region 11:10,000,001-11,000,000 from GRCh37. The server has two entry\n" +
		"points: /search to discover file accessions and /get to retrieve data (optionally\n" +
		"in a region) when we know the file accession(s). The following gives a few examples\n" +
		"about how to retrieve data from this web server.\n\n" +
		"Examples in a web browser:\n\n" +
		" * List of studies and samples:\n" +
		" http://%s/search\n\n" +
		" * Retrieve alignments for SGDP sample S_French-2 and S_Mbuti-1 in a small region:\n" +
		" http://%s/search?study=SGDP&sample=S_French-2,S_Mbuti-1\n" +
		" http://%s/get?fmt=sam&ac=EXA00001,EXA00002&chr=11&start=10899000&end=10900000\n\n" +
		"Command-line examples:\n\n" +
		" * List of studies and samples:\n" +
		" curl -s 'http://%s/search' > study-sample.txt\n\n" +
		" * Merged BAM:\n" +
		" curl -s 'http://%s/get?ac=EXA00001,EXA00002&chr=11&start=10300000&end=10500000' > merged.bam\n\n" +
		" * BAM download without server-side re-coding (one accession; no region conditions):\n" +
		" wget 'http://%s/get?ac=EXA00001'\n\n" +
		" * BAM retrieval without server-side re-coding:\n" +
		" samtools view -b 'http://%s/get?ac=EXA00001' 11:10300000-10500000 > S_French-2_part.bam\n\n" +
		" * View alignment with samtools tview (one sample; no region conditions):\n" +
		" samtools tview 'http://%s/get?ac=EXA00001'\n\n"
	h := r.Host
	fmt.Fprintf(w, help, h, h, h, h, h, h, h, h)
}
|
|
|
// hs_collect_strings flattens a list of comma-separated strings into a set.
// Each element of a may hold several comma-separated tokens; the returned
// map has one key per token, each mapped to 1.  An empty input produces an
// empty map (which hs_test_hash interprets as "match anything").
func hs_collect_strings(a []string) map[string]int {
	set := make(map[string]int)
	for _, field := range a {
		for _, token := range strings.Split(field, ",") {
			set[token] = 1
		}
	}
	return set
}
|
|
|
// hs_test_hash reports whether key passes the filter h: an empty (or nil)
// filter accepts every key, otherwise key must be present in h.
func hs_test_hash(h map[string]int, key string) bool {
	if len(h) == 0 {
		return true // no filter given: everything matches
	}
	_, found := h[key]
	return found
}
|
|
|
func hs_search(w http.ResponseWriter, r *http.Request) { |
|
r.ParseForm(); |
|
ac := hs_collect_strings(r.Form["ac"]); |
|
study := hs_collect_strings(r.Form["study"]); |
|
sample := hs_collect_strings(r.Form["sample"]); |
|
for i := 0; i < len(hs_meta); i += 1 { |
|
m := hs_meta[i]; |
|
if hs_test_hash(ac, m.ac) && hs_test_hash(study, m.study) && hs_test_hash(sample, m.sample) { |
|
fmt.Fprintf(w, "%s\t%s\t%s\t%s\t%s\n", m.ac, m.study, m.sample, m.format, m.asm); |
|
} |
|
} |
|
} |
|
|
|
func hs_get(w http.ResponseWriter, r *http.Request) { |
|
// test if users are requesting to download BAI |
|
isBAI := false; |
|
if len(r.URL.RawQuery) > 4 && r.URL.RawQuery[len(r.URL.RawQuery)-4:] == ".bai" { |
|
isBAI = true; |
|
r.URL.RawQuery = r.URL.RawQuery[:len(r.URL.RawQuery)-4]; |
|
} |
|
|
|
r.ParseForm(); |
|
if len(r.Form["ac"]) == 0 { |
|
http.Error(w, "400 Bad Request: 'ac' MUST BE present", 400); |
|
return; |
|
} |
|
ac := hs_collect_strings(r.Form["ac"]); |
|
|
|
region := ""; |
|
if len(r.Form["chr"]) > 0 { |
|
region = r.Form["chr"][0]; |
|
if len(r.Form["start"]) > 0 { |
|
region += ":" + r.Form["start"][0]; |
|
} else { |
|
region += ":1"; |
|
} |
|
if len(r.Form["end"]) > 0 { |
|
region += "-" + r.Form["end"][0]; |
|
} |
|
} |
|
|
|
// get the list of requested files |
|
var files, acs []string; |
|
for i := 0; i < len(hs_meta); i += 1 { |
|
_, ok := ac[hs_meta[i].ac]; |
|
if ok { |
|
files = append(files, hs_meta[i].path); |
|
acs = append(acs, hs_meta[i].ac); |
|
} |
|
} |
|
if len(files) == 0 { |
|
http.Error(w, "400 Bad Request: non-existing 'ac'", 400); |
|
return; |
|
} |
|
|
|
// set content-type |
|
outfmt := "bam"; |
|
if len(r.Form["fmt"]) > 0 { |
|
outfmt = r.Form["fmt"][0]; |
|
} |
|
if outfmt == "sam" { |
|
w.Header().Set("Content-Type", "text/plain"); |
|
} else if outfmt == "bam" { |
|
w.Header().Set("Content-Type", "application/octet-stream"); |
|
if isBAI { |
|
w.Header().Set("Content-Disposition", "attachment; filename=\"" + strings.Join(acs, ",") + ".bai" + "\""); |
|
} else { |
|
w.Header().Set("Content-Disposition", "attachment; filename=\"" + strings.Join(acs, ",") + "." + outfmt + "\""); |
|
} |
|
} |
|
|
|
// write results |
|
if len(files) == 1 && region == "" && outfmt == "bam" { // copy through |
|
var offset int64 = 0; |
|
fn := files[0]; |
|
if isBAI == true { |
|
fn += ".bai"; |
|
} else if r.Header.Get("Range") != "" { // FIXME: to parse the end of the range |
|
re, _ := regexp.Compile("bytes=(\\d+)-"); |
|
match := re.FindStringSubmatch(r.Header.Get("Range")); |
|
if len(match) >= 2 { |
|
offset, _ = strconv.ParseInt(match[1], 10, 64); |
|
} |
|
w.WriteHeader(206); |
|
} |
|
f, _ := os.Open(fn); |
|
if offset > 0 { |
|
f.Seek(offset, os.SEEK_SET); |
|
} |
|
defer f.Close(); |
|
buf := make([]byte, 65536); |
|
for { |
|
buf = buf[:cap(buf)]; |
|
n, err := f.Read(buf); |
|
if n == 0 && err == io.EOF { |
|
break; |
|
} |
|
buf = buf[:n]; |
|
w.Write(buf); |
|
} |
|
} else { // use "samtools merge"; this requires relative recent samtools |
|
var args []string; |
|
args = append(args, "merge"); |
|
if region != "" { |
|
args = append(args, "-R"+region); |
|
} |
|
args = append(args, "-1O"+outfmt, "-"); |
|
args = append(args, files...); |
|
cmd := exec.Command(hs_samtools, args...); |
|
cmd.Stdout = w; |
|
cmd.Run(); |
|
} |
|
} |
|
|
|
func main() { |
|
var port string = "8000"; |
|
|
|
if os.Getenv("PORT") != "" { |
|
port = os.Getenv("PORT"); |
|
} |
|
// parse command line options |
|
for { |
|
opt, arg := getopt(os.Args, "e:p:"); |
|
if opt == 'p' { |
|
port = arg; |
|
} else if opt == 'e' { |
|
hs_samtools = arg; |
|
} else if opt < 0 { |
|
break; |
|
} |
|
} |
|
if optind == len(os.Args) { |
|
fmt.Fprintln(os.Stderr, "Usage: hts-server [options] <dir1> [...]"); |
|
fmt.Fprintln(os.Stderr, "Options:"); |
|
fmt.Fprintf(os.Stderr, " -p INT port number [%s or from $PORT env]\n", port); |
|
fmt.Fprintf(os.Stderr, " -e FILE samtools executable [%s]\n", hs_samtools); |
|
os.Exit(1); |
|
} |
|
|
|
hs_meta = hs_crawl(os.Args[optind:]); |
|
|
|
fmt.Fprintf(os.Stderr, "[%d] launched at port %s\n", time.Now().UnixNano(), port); |
|
|
|
http.HandleFunc("/", hs_help); |
|
http.HandleFunc("/search", hs_search); |
|
http.HandleFunc("/get", hs_get); |
|
http.ListenAndServe(fmt.Sprintf(":%s", port), nil); |
|
} |