{- | CloneSim - fragment a genome sequence to simulate cloning
     Factored out from flowsim in order to separate clone generation
     from flow simulation and base calling.
-}

{-# Language DeriveDataTypeable #-}

module Main where

import Bio.Core.Sequence
import Bio.Sequence.Fasta

import Statistics
import Version

import System.IO (stdin,stdout,stderr,hPutStrLn)
import Control.Monad (forM,when)
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Lazy.Char8 as BC
import Data.Int (Int64)
import System.Console.CmdArgs
import Data.Array

type Dist    = String
data Conf = Conf 
            { lengths :: Dist
            , count :: Int
            , input :: [FilePath]
            } deriving (Data,Typeable,Show)

opts :: Conf
opts = Conf {  
    lengths = "Uniform 400 800" &= help "model for clone lengths"     &= typ "DIST"
  , count = 10                  &= help "number of reads to generate" &= typ "INT"
  , input = def &= args &= typFile
  } &= program "clonesim"
    &= summary ("clonesim "++version)
    &= details ["Available distributions (DIST):",    
                "Uniform a b, Normal mu sigma, LogNormal mu sigma"]

data Dir   = Fwd | Rev deriving (Eq,Enum,Bounded)

instance Random Dir where 
  random g = let (b,g') = random g in (if b then Fwd else Rev,g')
  randomR = error "randomR is undefined for Dir"

main :: IO ()
main = do
  conf <- cmdArgs opts
  -- print conf
  let inf = case input conf of 
        []  -> hPutStrLn stderr "clonesim: reading from stdin" >> hReadFasta stdin
        [x] -> readFasta x
        _   -> error "Please specify only a single input file"
  seq inf $ return () -- force the above warning
  ss <- map defragSeq `fmap` inf
  when (null ss) $ error  "No sequences in input, exiting!"
  let sa = listArray (0,length ss-1) ss
      la = listArray (0,length ss) $ scanl (+) 0 $ map seqlength ss
  hWriteFasta stdout =<< evalRandIO (simulate conf sa la)

type SeqArray = Array Int (Sequence)
type LenArray = Array Int Offset

-- | the real 'main'
simulate :: RandomGen g => Conf -> SeqArray -> LenArray -> Rand g [Sequence]
simulate conf sa sl = do
  let ldist   = read $ lengths conf
      (_,alz) = bounds sl
      top     = fromIntegral (sl!alz)
  forM [1..count conf] $ const 
    (do n <- floor `fmap` sample (Uniform 0 top)
        let i = bsearch sl n
            p = n - sl!i
            s = sa!i
        l <- round `fmap` sample ldist
        dir <- getRandom
        return $ mkClone dir p (Offset l) s)
    
-- bsearch - returns largest array entry less than search value
bsearch :: LenArray -> Offset -> Int
bsearch a v = go (bounds a) 
  where go (x,y) | v >= a!y  = y
                 | y == x+1  = x
                 | a!m <= v  = go (m,y)
                 | a!m > v   = go (x,m)
               where m = (x+y) `div` 2

mkClone :: Dir -> Offset -> Offset -> Sequence -> Sequence
mkClone dir (Offset pos) (Offset len) (Seq (SeqLabel h) (SeqData s) mq) = 
      Seq (SeqLabel label) (SeqData sd) (qual mq)
    where label = BC.pack (BC.unpack h++":"++show pos++
                  (case dir of Fwd -> ":fwd"; Rev -> ":rev")++" clonesim")
          sd = B.take len
               . (case dir of Fwd -> snd; Rev -> revcompl' . fst) 
               . B.splitAt pos $ s
          qual Nothing = Nothing
          qual (Just (QualData qd)) = Just . QualData . B.take len 
               . case dir of Fwd -> fst; Rev -> snd 
               $ B.splitAt pos qd

revcompl' = BC.reverse . BC.map compl

compl :: Char -> Char
compl 'A' = 'T'
compl 'T' = 'A'
compl 'C' = 'G'
compl 'G' = 'C'
compl 'a' = 't'
compl 't' = 'a'
compl 'c' = 'g'
compl 'g' = 'c'
compl  x  =  x

defragSeq = id
