{- Transcribe from a base sequence using unfoldr

   Probably more complicated than necessary.

   'prime' selects starting points (in the form of initial state)
   'transcribe' takes a list of mutators, a terminator, and an
   initial state and generates the transcript.
-}

module UnfoldMut where

import Bio.Sequence hiding ((!))
import Data.Char (toUpper) -- for testing
import Data.Int
import Data.List (sort)
import Data.List (unfoldr)
import Data.Maybe (isJust)
import System.Random
import qualified Bio.Sequence as S

at :: Cursor -> Char
at (s,o,d) | o < 0 || o >= seqlength s = 'X'
            | d == Fwd                  = (S.!) s o
            | otherwise                 = compl $ (S.!) s o

type Prob = Double

-- | The cursor tracks position in the sequence (simulating polymerase)
data Direction = Fwd | Rev deriving (Eq,Show)
type Cursor = (Sequence,Offset,Direction)

-- | A primer determines when to initiate transcripts
type Primer = [Sequence] -> IO [MState]

------------------------------------------------------------
-- | A mutator inspects the state, returns the probability of an action,
--   and the action, in the form of a string and a modified state
type Mutator = MState -> (Prob,[Char], MState)
type MutFunc = MState -> ([Char],MState)

combine :: [Distribution] -> Distribution
combine = foldr (\f g -> \x -> f x + g x) (const 0)

mkmut :: Distribution -> [MutFunc] -> [Mutator]
mkmut dist fs =
    map (\f -> \m -> let (s,m') = f m in (t m, s, m')) fs
    where t = (/fromIntegral (length fs)) . dist . tlen
          tlen (MS _ _ l) = fromIntegral l

-- | Uniform probs for substitution and insertion
subst, ins :: String -> [MutFunc]
subst = map (\x -> \m -> ([x],fwd m))
ins   = map (\x -> \m -> ([x],m))

-- | Deletion of current base
del :: [MutFunc]
del     = [\m -> ([],fwd m)]

-- | Duplicate current base without forwarding
dup :: [MutFunc]
dup = [\m@(MS _ c _) -> ([toUpper $ at c],m)]

------------------------------------------------------------
-- | The terminator examines the state, and the probablity of
--   transcript termination
type Terminator = MState -> Prob

-- | A Model is shorthand for a complete parameter set
type Model = (Primer,[Mutator],Terminator)

-- | The state of the current transcription
data MState = MS StdGen Cursor Int
instance Show MState where show = const "xxx"

------------------------------------------------------------
-- | A distribution is a probability as a function of length
type Distribution = Offset -> Prob

-- | The sigmoid function provides a soft threshold
--   center on s, scale by w
sigma :: Double -> Double -> Distribution
sigma s w = \t -> 1/(1+exp ((s-fromIntegral t)/w))

uniform :: Distribution
uniform = const 1

gradient :: Double -> Double -> Distribution
gradient s e = \t -> max 0 . min 1 $ (fromIntegral t-s)/(e-s)

-- | For 454 (and other?) sequencing, we need a distribution
--   that depends on sequence content
homopolymer :: MState -> Prob
homopolymer (MS _ c@(s,o,_) _) =
    (/20) . fromIntegral . length . takeWhile (== at c) .
    map ((S.!) s) $ [o..min (o+19) (seqlength s-1)]

------------------------------------------------------------
-- | Increments the cursor in the MState
fwd :: MState -> MState
fwd (MS sg (s,o,d) l) = (MS sg (s,if d == Fwd then o+1 else o-1,d) l)

transcribe :: [Mutator] -> Terminator -> MState -> [Char]
transcribe ms t is = concat . unfoldr (sequencer ms t) $ is

-- | walk a list of mutator funcitions, pick one based on p
-- whoa! this is almost :: [Mutator] -> Mutator!
sequencer :: [Mutator] -> Terminator -> MState -> Maybe ([Char],MState)
sequencer ms t =
  \m@(MS g c _) ->
          let (p,g') = randomR (0,1) g
              tp = t m
              new = mutate m (p-tp) ms
          in if p < tp then Nothing
             else update_ms g' $ if isJust new then new
                                 else Just ([at c], fwd m)

-- | update rndgen and length
update_ms :: StdGen -> Maybe (String,MState) -> Maybe (String,MState)
update_ms g (Just (xs, MS _ c l)) = Just (xs, MS g c (l+length xs))
update_ms _ Nothing = error "this is impossible"

-- | try all mutators in a list
mutate :: MState -> Prob -> [Mutator] -> Maybe (String,MState)
mutate st p (m:ms) = let (t,ch,st') = m st
                         p' = p-t
                     in if p' < 0 then Just (ch,st')
                        else mutate st p' ms
mutate _st _p [] = Nothing

------------------------------------------------------------
-- | A primer with a uniform (poisson) distribution
p_uniform :: [String] -> Primer
p_uniform [n,d] = \ss -> do
    (g,g') <- return . split =<< newStdGen
    let l  = sum $ map seqlength ss
        os = sort . take (read n) . randomRs (0,l-1) $ g
    return (gen_ms (read d) g' 0 ss os)
p_uniform _ = error "primer 'uniform' needs two arguments"

-- | Generate MStates for transcript initiation
gen_ms :: Double -> StdGen -> Offset -> [Sequence] -> [Offset] -> [MState]
gen_ms d g c (s:ss) (p:ps)
  | p-c >= seqlength s = gen_ms d g (c+seqlength s) ss (p:ps)
  | otherwise          = let (g1,g') = split g
                             (z,g2) = randomR (0,1) g1
                         in MS g2 (s,p-c,if z<d then Fwd else Rev) 0
                                : gen_ms d g' c (s:ss) ps
gen_ms _ _ _ _ [] = []
gen_ms _ _ _ [] _ = error "Impossible: gen_ms ran out of sequences?!"

-- ripped from http://darcs.haskell.org/binary/tests/QuickCheckUtils.hs

instance Random Int64 where
  randomR = integralRandomR
  random = randomR (minBound,maxBound)

integralRandomR :: (Integral a, RandomGen g) => (a,a) -> g -> (a,g)
integralRandomR  (a,b) g = case randomR (fromIntegral a :: Integer,
                                         fromIntegral b :: Integer) g of
                            (x,g') -> (fromIntegral x, g')
