(ns findorf.core
  (:use [clojure.java.io])
  (:require [clojure.contrib.str-utils2 :as str2])
  (:import [org.biojava3.core.sequence.io FastaReaderHelper]
           [org.biojava3.core.sequence.transcription Frame
            TranscriptionEngine$Builder]))

(defn longest-region [sqn-coll]
  "Retrieve start and end of the longest sequence region in a collection."
  (->>
   (loop [sqn-coll sqn-coll cur-pos 0 coords []]
     (if (empty? sqn-coll) coords
         (let [cur-len (.length (first sqn-coll))
               cur-len-w-stop (+ cur-len (if (empty? (rest sqn-coll)) 0 1))
               next-pos (+ cur-pos cur-len)]
           (recur (rest sqn-coll) next-pos (cons [cur-pos cur-len-w-stop] coords)))))
   (sort-by second >)
   first))

(defn frame-coords [[start end] size frame]
  "Orient coordinates relative to the provided frame."
  (let [map-frame (fn [[start end] frame]
                    (cond
                     (.endsWith frame "TWO") [(+ 1 start) (+ 1 end)]
                     (.endsWith frame "THREE") [(+ 2 start) (+ 2 end)]
                     :else [start end]))
        map-dir (fn [[start end] frame size]
                  (if (.startsWith frame "REVERSED")
                    [(- size end) (- size start)]
                    [start end]))]
    (-> [start end]
        (map-frame frame)
        (map-dir frame size)
        (conj frame))))

(defn tx-sqn-frame [dna frame]
  "Longest transcribed region within a given translation frame."
  (let [tx-engine (-> (TranscriptionEngine$Builder.)
                      (.trimStop false)
                      (.build))
        dna-coords (fn [coords]
                     (let [dna-start (* 3 (first coords))]
                       [dna-start (+ dna-start (* 3 (second coords)))]))]
    (-> (.getRNASequence dna tx-engine frame)
        (.getProteinSequence tx-engine)
        .toString
        (str2/partition #"\*")
        longest-region
        dna-coords
        (frame-coords (.getLength dna) (str frame)))))

(defn longest-tx-sqn [sqn]
  "Transcribe sequence, returning the longest frame."
  (->> (map #(tx-sqn-frame sqn %) (Frame/getAllFrames))
       (sort-by #(- (second %) (first %)) >)
       first))

(defn fasta-seqs [in-file]
  "Lazy steam of names and DNA sequences from input FASTA file."
  (let [seq-map (FastaReaderHelper/readFastaDNASequence (file in-file) true)]
    (for [key (.keySet seq-map)]
      [key (.get seq-map key)])))

(defn -main [fasta-file]
  (doseq [[name seq] (fasta-seqs fasta-file)]
    (println name (longest-tx-sqn seq))))