{- Modelling 454 sequencing
(or Roche-something, as it is now called)
From Margulies M. et al.:
insertion error rate 1.67%
deletion error rate 1.60%
substitution error rate 0.68%
No quantification of homopolymer length errors.
Primer is uniform, this is even more so for 454 than for Sanger
(but the paper has more elaborate statistics that remain to be incorporated: TODO)
-}
module R454 where
import UnfoldMut
import Bio.Sequence.SeqData (Offset, Sequence)
-- | Position-dependent error probability for the basic mutators: the
-- 'uniform' distribution scaled down to a 2% base rate of error.
r454_dist :: Offset -> Prob
r454_dist offset = uniform offset * 0.02
-- | Homopolymer-dependent error probability: the value of 'homopolymer'
-- for the current state, scaled by 0.2.
-- Author's note: this is meant to give a 0-40% chance for runs of 0..20+
-- equal bases (depends on the range of 'homopolymer' -- TODO confirm).
r454_hp :: MState -> Prob
r454_hp st = homopolymer st * 0.2
basic, hp :: [Mutator]
-- Position-independent error mutators, all weighted by 'r454_dist':
-- substitutions and insertions over the alphabet "ACGTN", plus deletions.
basic = mkmut r454_dist (substitutions ++ insertions ++ deletions)
  where
    alphabet = "ACGTN"
    substitutions = subst alphabet
    insertions = ins alphabet
    -- 'del' is repeated five times; presumably this balances deletions
    -- against the per-base substitution/insertion alternatives -- TODO confirm.
    deletions = concat (replicate 5 del)
-- | Homopolymer-dependent mutators: one duplicating, one deleting.
-- Each wraps the first entry of 'dup' (resp. 'del') and weights it with
-- 'r454_hp' evaluated on the state before the edit is applied.
hp = map weighted [firstOf "dup" dup, firstOf "del" del]
  where
    -- Run the raw edit and attach the homopolymer probability of the
    -- incoming state. The let-pattern is lazy, so the edit only runs when
    -- its result is actually demanded.
    weighted edit m = let (s, m') = edit m in (r454_hp m, s, m')
    -- Explicit replacement for 'head': an empty edit list is reported by
    -- name instead of as an anonymous "Prelude.head: empty list".
    firstOf _ (edit : _) = edit
    firstOf name [] = error ("R454.hp: no " ++ name ++ " edits available")
-- | Probability of terminating the read in the given state: the third
-- field of 'MS' fed through @gradient 90 150@.
-- NOTE(review): presumably that field is the current read length and the
-- probability ramps up between 90 and 150 bases -- confirm against
-- 'gradient' and the definition of 'MS'.
terminator :: MState -> Prob
terminator (MS _ _ len) = gradient 90 150 position
  where
    position = fromIntegral len
-- | Assemble the complete 454 model from its command-line style parameters.
--
-- Expects exactly two parameters, which are handed unchanged to 'p_uniform'
-- to build the read-start generator. The result bundles that generator with
-- the full mutator set ('basic' followed by 'hp') and the 'terminator'
-- probability.
--
-- Any other number of parameters is a usage error and is reported with an
-- explicit message rather than an anonymous lambda pattern-match failure.
r454 :: [String] -> ([Sequence] -> IO [MState], [Mutator], MState -> Prob)
r454 [n, d] = (p_uniform [n, d], basic ++ hp, terminator)
r454 args =
  error ("r454: expected exactly two parameters, got " ++ show (length args))