{- | FlowSim - simulate 454 flowgrams from a reference genome 

   Flow values are modelled from a log-normal noise, and a sequence of
   normal distributions, one for each homopolymer length.  These distributions
   can be estimated from real data.

   Todo:
   * Paired ends models B-linker-A
   * Support multiple sequences:
      - scan seqs + lenghts
      - gen n numbers (0..total length), transform to deltas (cf. rselect)
      - sort and generate

-}

module Main where

import Bio.Core.Sequence
import Bio.Sequence.Fasta
import Bio.Sequence.SFF

import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.ByteString.Lazy as BL
import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString as BS2

import Data.Char (toUpper)
import Data.Int

import Statistics
import Generations.GenBase
import Config

import Control.Monad (when)

-- ------------------------------------------------------------
-- picking random postions/directions for reads

data Dir   = Fwd | Rev deriving (Eq,Enum,Bounded)

instance Random Dir where 
  random g = let (b,g') = random g in (if b then Fwd else Rev,g')
  randomR = error "randomR is undefined for Dir"

main :: IO ()
main = do
  (gen,is,hplc,o) <- mkconf
  
  ss <- is
  case ss of [] -> error "Input appears to be empty?"
             _ -> return ()
  sff <- evalRandIO $ sim454 gen hplc ss
  n <- writeSFF' o sff
  whenLoud $ inform ("Wrote "++show n++" reads to '"++o++"'.")

-- testing
test_s :: Sequence
test_s = Seq (SeqLabel $ B.pack "foo") (SeqData $ B.pack "aacacattcgtggtnagctacggaacacattcgtggtnagctacggaacacattcgtggtnagctacggaacacattcgtggtnagctacgg") Nothing

-- | Heavy lifting.  Building an SFF file from the specified information
sim454 :: RandomGen g => Generation -> HPLprob -> [Sequence] -> Rand g SFF
sim454 gen hplc ss = do
  let ch = makeCommonHeader gen
      tf r = foldr ($) r (trim_primer (adapter gen) : trim_filters gen)
  rbs <- map tf `fmap` mapM (makeReadBlock gen hplc ch) ss
  return (SFF ch rbs)

-- Generate a sequence of models
-- The generation specifies a series of initial models, this function
-- uses these with the degradation to generate a progression of models to use  
makeModels :: RandomGen g => Generation -> Rand g [Model]  
makeModels gen = case models gen of
  [m] -> iterativeWorsen (f_len gen) (degrade gen) m
  im@(i1:is) -> do
    let (ns,rs) = f_len gen `divMod` length im
    -- error (show (ns,rs,length (replicate (ns+rs) i1 ++ concatMap (replicate ns) is)))
    return (replicate (ns+rs) i1 ++ concatMap (replicate ns) is)

iterativeWorsen :: RandomGen g => Int -> Distribution -> Model -> Rand g [Model]
iterativeWorsen 0 _ _ = return []
iterativeWorsen count deg m = do 
  ps <- perturbs 1 count
  return $ zipWith (fmap . worsen) ps $ replicate count m
    where perturbs _   0 = return []
          perturbs cur n = do 
            w <- sample deg
            let new = cur+w
            rest <- perturbs new (n-1)
            return (new : rest)

-- --------------------------------------------------
-- Generate the flow values

-- generate the "absolute" flow values from the cycle and the origin sequence data
makeFlows :: [Char] -> SeqData -> [(Char,Int)]
makeFlows c (SeqData s) 
  | B.null s = []
  | otherwise = let (c1,SeqData s1) = makeCycle [] (take 4 c) (SeqData s)
                    s2 = case B.uncons s1 of Just (x,_) -> if notElem (toUpper x) "ACGT" then B.tail s1 else s1; _ -> s1
                in c1 ++ makeFlows c (SeqData s2)

makeCycle :: [(Char,Int)] -> [Char] -> SeqData -> ([(Char,Int)],SeqData)
makeCycle acc []     s = (reverse acc, s)
makeCycle acc (c:cs) (SeqData s) = 
  let (this,rest) = B.span ((==toUpper c).toUpper) s
  in makeCycle ((c,fromIntegral $ B.length this):acc) cs (SeqData rest)

-- | Takes a sequence of flows with exact homopolymer lengths, 
-- permutes and calls them.  Next step is prepareData
permuteAndCall :: RandomGen g => Generation -> HPLprob -> [(Char,Int)] -> 
                  Rand g [(Flow,[Char],[Qual])]
permuteAndCall gen hplc fs = do
  -- when (trace (show fs) False) (return ())
  let n  = fromIntegral $ f_len gen
      -- tack on the correct number of zero flows after end of clone
      flow_cont = drop (length fs `mod` length (f_cycle gen)) $ cycle (f_cycle gen)
      xs = fs ++ zip flow_cont (replicate (n-length fs) 0)
  ms <- makeModels gen
  -- if and $ zipWith (==) (map fst xs) (cycle $ f_cycle gen) then return () 
  -- else error (map fst xs ++ "\n don't cycle: "++f_cycle gen)
  sequence $ take n $ zipWith (my_call (qcall gen) hplc) ms xs

-- Titanium base/quality calling
my_call :: RandomGen g => QualMethod -> HPLprob -> Model -> (Char,Int) -> 
           Rand g (Flow,[Char],[Qual])
my_call qm ft md (c,hpl) = do
  f <- sample (md hpl)
  let fi = max 0 . round . (100*) $ f
      n = fromIntegral ((fi+50) `div` 100)
      cs = replicate n c
      qs = qm ft md (c,fi)
  when (length qs /= n) $ error ("Qual mismatch: "++show (f,fi,n,cs,qs))
  return (fi, cs, qs)

-- | Somewhat advanced unzip, calculating the index in the process.
--   Also replace three consequtive empty lists with Ns
convertCalls :: [(Flow,[Char],[Qual])] -> ([Flow],[Char],[Qual],[Int])
convertCalls = go 0 . addDots
  where go _ [] = ([],[],[],[])
        go p ((f,cs,qs):fs) 
          | null cs = let (f',c',q',i') = go (p+1) fs
                      in (f:f',c',q',i')
          | otherwise = let (f',c',q',i') = go 0 fs
                        in (f:f',cs++c',qs++q',(p+1):replicate (length cs-1) 0 ++ i')
        
        addDots ((f1,cs1,qs1):(f2,cs2,qs2):(f3,cs3,qs3):(f4,cs4,qs4):rest)
            | null cs1 && null cs2 && null cs3 && not (null cs4) = (f1,cs1,qs1):(f2,cs2,qs2):(f3,"N",[0]):addDots ((f4,cs4,qs4):rest)
            | otherwise = (f1,cs1,qs1) : addDots ((f2,cs2,qs2):(f3,cs3,qs3):(f4,cs4,qs4):rest)
        addDots fewerThanThree = fewerThanThree

-- | Generate a ReadBlock
--   direction and position chosen at random, and encoded in the read name
makeReadBlock :: RandomGen g => Generation -> HPLprob -> CommonHeader -> Sequence -> Rand g ReadBlock
makeReadBlock g hplc ch sq = do
       let sdata = seqdata sq
           rn = BS.concat $ BL.toChunks $ unSL $ seqlabel sq
           fs = makeFlows (BS.unpack $ flow ch) sdata
       (pfs,cs,qs,is) <- convertCalls `fmap` permuteAndCall g hplc fs
       return $ verifyRB (flow_length ch) ReadBlock {
                           read_header = ReadHeader {
                                           name_length                 = fromIntegral $ BS.length rn -- :: Int16
                                         , num_bases                   = fromIntegral $ length cs    -- :: Int32
                                         , clip_qual_left              = 5                           -- :: Int16
                                         , clip_qual_right             = fromIntegral $ length cs    -- :: Int16
                                         , clip_adapter_left           = 0 -- :: Int16
                                         , clip_adapter_right          = 0 -- :: Int16
                                         , read_name                   = rn -- :: ByteString
                                         }
                         -- The data block
                          , flow_data        = packFlows pfs    -- :: [Flow]
                          , flow_index       = BS2.pack $ map fromIntegral is -- :: ByteString
                          , bases            = SeqData $ B.pack cs
                          , quality          = QualData $ BL.pack $ map unQual qs
                          }

-- | Consistency check on generated ReadBlocks.
verifyRB :: Int16 -> ReadBlock -> ReadBlock
verifyRB fl rb 
            -- | name_length rh == 0  = err "name_length is zero"
            | num_bases rh == 0    = err "num_bases is zero"
--            | clip_qual_left rh > clip_qual_right rh = err "clipping gives negative sequence"
            | BS2.length (read_name rh) /= fromIntegral (name_length rh)  = err "read_name has incorrect length"
            | BS.length (flow_data rb) `div` 2 /= fromIntegral fl  = err ("Number of flows ("++show (BS.length (flow_data rb) `div` 2)++")do not match flow_length of "++show fl++" in CommonHeader")
            | BS.length (flow_data rb) `div` 2 < fromIntegral (sum $ BS2.unpack $ flow_index rb) = err "flow_index longer than flows"
            | (B.length $ unSD $ bases rb) /= fromIntegral (BS.length $ flow_index rb)
              = err ("bases ("++show (B.length $ unSD $ bases rb)++") and flow_index ("++show (BS.length $ flow_index rb)++") have different lengths")
            | (B.length $ unQD $ quality rb) /= fromIntegral (BS.length $ flow_index rb) 
              = err ("quality ("++show (B.length $ unQD $ quality rb)++") and flow_index ("++show (BS.length $ flow_index rb)++") have different lengths")
            | otherwise = rb
    where rh = read_header rb 
          err str = error (str ++ "\n" ++ show rb)
