-- find longest ORF in a Fasta file, inspired by a question on BioStar 
-- (http://biostar.stackexchange.com/questions/5902/code-golf-finding-orf-and-corresponding-strand-in-a-dna-sequence/)

import Bio.Sequence
import System.IO
import Data.List (maximumBy, tails)

main :: IO ()
main = hReadFasta stdin >>= mapM_ (putStrLn . doit . castToNuc)

doit s = format s . maximumBy comp_orflength . all_orfs $ s

format :: Sequence Nuc -> (Frame,Length,Offset) -> String
format s (frm, len, off)
  | frm > 0 = sq++"+"++show frm++" "++show (frm+off)++" "++show (frm+off+len'-1)
  | otherwise = sq++show frm++" "++show (sl+frm-off-len'+2)++" "++show (sl-off+frm+1)
    where len' = fromIntegral len
          sl = fromIntegral (seqlength s)
          sq = toStr (seqheader s) ++ ":\t"

comp_orflength (_,l1,_) (_,l2,_) = compare l1 l2

type Frame = Offset -- -3..+3
type Length = Int

-- | Put it all togehter, and generate all ORFs from a sequence.
all_orfs :: Sequence Nuc -> [(Frame,Length,Offset)]
all_orfs = map (findlength . truncateStp) . concat . map transtails . translations

-- | Convert the translated sequence into the equivalent nucleotide length
findlength :: (Frame,Offset,[Amino]) -> (Frame,Length,Offset)
findlength (f,o,as) = (f,3*length as,o)

-- | Truncate the translation if and when a stop codon is reached.
truncateStp :: (Frame,Offset,[Amino]) -> (Frame,Offset,[Amino])
truncateStp (f,o,as) = (f,o,takeUntil (==STP) as)

-- | Just like takeWhile, but keeps the final element (ie. STP)
takeUntil p (x:xs) | p x = [x]
                   | otherwise = x : takeUntil p xs
takeUntil _ [] = []

-- | Analogous to the 'tails' function, this generates all suffixes
--   of a translation.
transtails :: (Frame,[Amino]) -> [(Frame,Offset,[Amino])]
transtails (f,as) = [(f,i,ts) | (i,ts) <- zip [0,3..] (tails as)]

-- | Generate all six-frame translations of the sequence, recording 
--   the frame as well.  The order decides which one gets selected later        
--   on in the case of equally long ORFs.        
translations :: Sequence Nuc -> [(Frame,[Amino])]
translations s = [(negate i,translate (revcompl s) (i-1)) | i <- [3,2,1]] ++ [(i,translate s (i-1)) | i <- [3,2,1]] 
