{- | Sim454 - simulate 454 flowgrams from a reference genome 

   Flow values are modelled from a log-normal noise, and a sequence of
   normal distributions, one for each homopolymer length.  These distributions
   can be estimated from real data.

   Todo:
   Options: -m <model> [-n <number of reads> | -r <number of runs>]
   Model as a data structure (dyn load?)
   Paired ends models B-linker-A
   
-}
{-# options -XParallelListComp #-}

module Main where

import Bio.Sequence
import Bio.Sequence.SFF

import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString as BS2

import Data.Char (toUpper)
import Data.Int
import Data.List (unfoldr)

import Statistics

import Options
import Generations.GenBase
import Generations.GS20

-- ------------------------------------------------------------
-- picking random postions/directions for reads

data Dir   = Fwd | Rev deriving (Eq,Enum,Bounded)

instance Random Dir where 
    random g = let (b,g') = random g in (if b then Fwd else Rev,g')
    randomR = error "randomR is undefined for Dir"

main = do
  -- handle options!
  opts <- getOptions
  ss <- input opts
  sff <- evalRandIO (sim454 (gen opts) $ head ss) -- fixme - use all sequences
  -- writeFile "debug.txt" (show sff)
  output opts sff

-- testing
s = Seq (fromStr "foo") (fromStr "aacacattcgtggtnagctacggaacacattcgtggtnagctacggaacacattcgtggtnagctacggaacacattcgtggtnagctacgg") Nothing

-- | Heavy lifting.  Building an SFF file from the specified information
-- sim454 :: String -> Generation -> Model -> Sequence Nuc -> StdGen -> SFF
sim454 :: RandomGen g => Generation -> Sequence Nuc -> Rand g SFF
sim454 gen sq = do
  let ch = commonhd gen
  rbs <- sequence (replicate (fromIntegral $ num_reads ch) (makeReadBlock gen sq ch))
  return (SFF ch rbs)

-- --------------------------------------------------
-- Generate the flow values

-- generate the "absolute" flow values from the key and the origin sequence data
makeFlows :: [Char] -> SeqData -> [Int]
makeFlows c s | B.null s = []
              | otherwise = let (c1,s1) = makeCycle [] (take 4 c) s 
                                s2 = case B.uncons s1 of Just (x,_) -> if not (toUpper x `elem` "ACGT") then B.tail s1 else s1; _ -> s1
                            in c1 ++ makeFlows c s2

{-
  prop_makeFlows :: EST -> Bool
  prop_makeFlows (E s) = let sd = filter (not . (=='n')) $ toStr $ seqdata s 
                         in makeFlows "TACG" (seqdata s) == reverse (makeFlows "TACG" $ seqdata $ revcompl s)
-}

makeCycle :: [Int] -> [Char] -> SeqData -> ([Int],SeqData)
makeCycle acc []     s = (reverse acc,s)
makeCycle acc (c:cs) s = let (this,rest) = B.span ((==toUpper c).toUpper) s
                         in makeCycle (fromIntegral (B.length this):acc) cs rest


-- TODO:
-- 1. combine call and permute, in order to calculate exact quality values
-- 2. add real option handling
-- 3. allow random degradation of sequence quality (not fixed model)

-- --------------------------------------------------
-- pull a value from a model

permuteFromModel :: RandomGen g => Generation -> [Int] -> Rand g [Flow]
permuteFromModel gen = mapM permute . zip [0..]
    where permute (p,x) = do y <- sample (model gen x p)
                             return $ floor (y * 100)

-- --------------------------------------------------
-- Calling the sequence from the flows
call :: Generation -> [Char] -> [Flow] -> [(Char,Qual,Int)]
call _ _ [] = []
call g cs fs = let (low, high) = span (<50) fs
                   c:cs' = drop (length low) (cycle cs)
               in if length low >= 3 then ('N',0,length low) : call g (c:cs') high
                  else case high of (h:rest) -> call1 g c (length low+1) h ++ call g cs' rest
                                    [] -> []

call1 :: Generation -> Char -> Int -> Flow -> [(Char,Qual,Int)]
call1 g c p f = let q1:qs = getqual g f in (c,q1,p):[(c,q,0) | q <- qs]

-- testing
prop_flow_call e = undefined -- seqdata e == call GS20 "TCAG" $ makeFlows "TCAG" $ seqdata e
prop_flow_index1 e = error "check that called indices correspond to flows >=0.5"
prop_flow_index2 e = error "check that uncalled indices correspond to flows <0.5"

-- --------------------------------------------------
-- Putting it together
makeCommonHeader :: Generation -> CommonHeader
makeCommonHeader = commonhd

-- | Generate a ReadBlock
--   direction and position chosen at random, and encoded in the read name
makeReadBlock :: RandomGen g => Generation -> Sequence Nuc -> CommonHeader -> Rand g ReadBlock
makeReadBlock g sq ch = 
    do dir <- getRandom                       :: RandomGen g => Rand g Dir
       p <- getRandomR (0,(fromIntegral $ seqlength sq-1)) :: RandomGen g => Rand g Integer
       let pos = fromIntegral p
           rn = BS.pack ((toStr $ seqlabel sq)++"_"++show pos++(case dir of Fwd -> "+"; Rev -> "-"))
           sdata = let orig = B.splitAt (fromIntegral pos) (seqdata sq)
                   in case dir of Fwd -> snd orig
                                  Rev -> revcompl' $ fst orig
           fs = take (fromIntegral $ flow_length ch) $ makeFlows (BS.unpack $ flow ch) $ BL.concat [BL.fromChunks [key ch], sdata]
       pfs <- permuteFromModel g fs
       let (cs,qs,is) = unzip3 $ call g (BS.unpack $ flow ch) pfs
       return $ ReadBlock {
                           read_header = ReadHeader {
                                           name_length                 = fromIntegral $ BS.length rn -- :: Int16
                                         , num_bases                   = fromIntegral $ length cs    -- :: Int32
                                         , clip_qual_left              = 5                           -- :: Int16
                                         , clip_qual_right             = fromIntegral $ length cs    -- :: Int16
                                         , clip_adapter_left           = 0 -- :: Int16
                                         , clip_adapter_right          = 0 -- :: Int16
                                         , read_name                   = rn -- :: ByteString
                                         }
                         -- The data block
                          , flowgram         = pfs -- :: [Flow]
                          , flow_index       = BS2.pack $ map fromIntegral is -- :: ByteString
                          , bases            = fromStr cs -- :: SeqData
                          , quality          = BL.pack qs -- :: QualData
                          }
