{- Modelling 454 sequencing
   (or Roche-something, as it is now called)

   From Margulies M. et al.:
    insertion error rate     1.67%
    deletion error rate      1.60%
    substitution error rate  0.68%

   No quantification of homopolymer length errors.

   Primer is uniform; this is even more so for 454 than for Sanger
   (but the paper has more elaborate statistics to be incorporated: TODO)
-}

module R454 where

import UnfoldMut
import Bio.Sequence.SeqData (Offset, Sequence)

-- | Position-dependent error probability for the basic mutators:
-- the value of 'uniform' at the given offset, scaled to a 2% base
-- rate of error.
r454_dist :: Offset -> Prob
r454_dist offset = uniform offset * 0.02

-- | State-dependent error probability for the homopolymer mutators:
-- the value of 'homopolymer' for the current state, scaled by 0.2.
--
-- The original note gives the result as a 0-40% chance for runs of
-- 0..20+ equal bases; that presumes 'homopolymer' saturates at 2.0
-- (TODO: confirm against UnfoldMut).
r454_hp :: MState -> Prob
r454_hp state = homopolymer state * 0.2

-- | 'basic' holds the mutators driven by 'r454_dist': substitutions and
-- insertions over the alphabet "ACGTN", followed by five copies of the
-- deletion mutators.
-- NOTE(review): the five-fold repetition of 'del' presumably balances
-- deletions against the five-letter substitution/insertion sets - confirm.
basic, hp :: [Mutator]
basic = mkmut r454_dist (concat [substitutions, insertions, deletions])
  where
    alphabet      = "ACGTN"
    substitutions = subst alphabet
    insertions    = ins alphabet
    deletions     = concat (replicate 5 del)

-- | Homopolymer-dependent mutators: the first duplication mutator from
-- 'dup' followed by the first deletion mutator from 'del', each paired with
-- the probability 'r454_hp' computes from the current state.
--
-- The entries are picked with @take 1@ rather than 'head', so an empty
-- 'dup' or 'del' contributes no mutator instead of producing a closure that
-- crashes once it is applied. With non-empty lists the result is the same
-- two mutators, in the same order, as before.
hp = [ \m -> let (s, m') = mutate m in (r454_hp m, s, m')
     | mutate <- take 1 dup ++ take 1 del
     ]

-- | Termination probability for a state: the third field of 'MS' is
-- converted with 'fromIntegral' and passed to @gradient 90 150@.
-- NOTE(review): presumably that field is the current read length and the
-- probability ramps between 90 and 150 - confirm against 'gradient'.
terminator :: MState -> Prob
terminator (MS _ _ len) = gradient 90 150 position
  where
    position = fromIntegral len

-- | Assemble the complete 454 model from a list of string parameters.
--
-- Exactly two parameters are expected; they are handed unchanged to
-- 'p_uniform', which defines their meaning. The result bundles:
--
--   * the function built by 'p_uniform' from the two parameters,
--   * the mutators ('basic' followed by 'hp'),
--   * the termination probability ('terminator').
--
-- Any other number of parameters is rejected with a descriptive 'error'
-- instead of an anonymous pattern-match failure in a lambda.
r454 :: [String] -> ([Sequence] -> IO [MState], [Mutator], MState -> Prob)
r454 [n, d] = (p_uniform [n, d], basic ++ hp, terminator)
r454 args   =
  error $ "R454.r454: expected exactly two parameters, but got "
       ++ show (length args) ++ ": " ++ show args