1 -- Tests for alignments
    2 
    3 module Bio.Alignment.Test where
    4 import Bio.Sequence
    5 import Bio.Util.TestBase
    6 import Bio.Alignment.AlignData (showalign)
    7 import Bio.Alignment.Matrices as M
    8 
    9 import Test.QuickCheck
   10 import Bio.Alignment.QAlign as Q
   11 import Bio.Alignment.AAlign as A
   12 
   13 import Bio.Alignment.AlignData (toStrings, extractGaps, insertGaps)
   14 import Bio.Alignment.Multiple
   15 
   16 import Data.List (intersperse,sort,nub)
   17 import qualified Data.ByteString.Lazy as B
   18 import qualified Data.ByteString.Lazy.Char8 as BC
   19 
   20 -- default blastn parameters
   21 mx = Q.qualMx
   22 amx = M.blastn_default
   23 g  = (-5,-2)
   24 
   25 tests :: [Test]
   26 --           .........o.........o.........o
   27 tests = [ T "gapped assembly invariant" prop_asm_gaps
   28 
   29         , T "global reverse"        (prop_reverse_g mx g)
   30         , T "global reverse w/qual" (prop_reverse_g_qual mx g)
   31         , T "local reverse"         (prop_reverse_l mx g)
   32         , T "local reverse w/qual"  (prop_reverse_l_qual mx g)
   33         , T "global recovery"       (prop_recover_g mx g)
   34 
   35         , T "global score <= local" (prop_global_local mx g)
   36         , T "global score <= overlap" (prop_global_overlap mx g)
   37         , T "overlap score <= local" (prop_overlap_local mx g)
   38 
   39         , T "overlap score = align" (prop_overlap_score_align mx g)
   40 
   41         , T "s == s => equal scores" (prop_equal_scores mx g)
   42 
   43         -- should give same result without n's(?)
   44         , T "global aalign = qualign" prop_AAlign_QAlign_g
   45         , T "local aalign = qualign"  prop_AAlign_QAlign_l
   46 
   47         -- are these really correct?
   48         , T "decr qual => decr score (l)" (prop_quality_dec_l mx (-100,-100))
   49         , T "decr qual => decr score (g)" (prop_quality_dec_g mx (-100,-100))
   50 
   51         , T "indirect recover (g)" prop_indirect_recover
   52         ]
   53 
   54 -- pretty ugly, to ensure the gaps are correctly generated
   55 prop_asm_gaps :: (String,[Int]) -> Bool
   56 prop_asm_gaps (s,gs) = let (str,gaps) = (BC.pack $ filter (/='-') s
   57                                         , nub $ takeWhile (<= BC.length str) $ dropWhile (<0) $ map fromIntegral $ sort gs)
   58                        in (str,gaps) == (extractGaps . insertGaps '*' $ (str,gaps))
   59 
   60 -- these aren't entirely equivalent, as the different column generators or selector
   61 -- sometimes make different, but equally scoring, choices.  Should be fixed, but in
   62 -- the meantime, checking scores to within an epsilot should be okay.
   63 -- (May fail if two n's are aligned?)
   64 prop_AAlign_QAlign_g (E s1) (E s2) = 
   65     let qmx = qualMx 22 22
   66         a = (A.global_align qmx g s1 s2)
   67         q = (Q.global_align mx g s1 s2)
   68     in if abs (fst a - fst q) < 0.001 then True 
   69        else error ("\n"++show (fst a)++"\n"++ showalign (snd a)
   70                    ++"\n"++show (fst q) ++ "\n"++ showalign (snd q))
   71 
   72 prop_AAlign_QAlign_l (E s1) (E s2) =
   73     let qmx = qualMx 22 22
   74         a = (A.local_align qmx g s1 s2)
   75         q = (Q.local_align mx g s1 s2)
   76     in if abs (fst a - fst q) < 0.001 then True 
   77        else error ("\n"++show (fst a)++"\n"++ showalign (snd a)
   78                    ++"\n"++show (fst q) ++ "\n"++ showalign (snd q))
   79 
   80 -- Check reverse without quality values
   81 prop_reverse_g mx g (E s1) (E s2) =
   82     abs (Q.global_score mx g s1 s2 
   83          - Q.global_score mx g (revcompl s1) (revcompl s2)) < 0.1
   84 
   85 prop_reverse_l mx g (E s1) (E s2) =
   86     abs (Q.local_score mx g s1 s2 
   87          - Q.local_score mx g (revcompl s1) (revcompl s2)) < 0.1
   88 
   89 -- Check reverse with quality
   90 prop_reverse_g_qual mx g (Eq s1) (Eq s2) =
   91     abs (Q.global_score mx g s1 s2 
   92            - Q.global_score mx g (revcompl s1) (revcompl s2)) < 0.1
   93 
   94 prop_reverse_l_qual mx g (Eq s1) (Eq s2) =
   95     abs (Q.local_score mx g s1 s2 
   96            - Q.local_score mx g (revcompl s1) (revcompl s2)) < 0.1
   97 
   98 -- Check that alignments are produced from the correct sequences
   99 prop_recover_g mx g (Eq s1) (Eq s2) =
  100     let (s1',s2') = toStrings $ snd $ Q.global_align mx g s1 s2
  101     in filter (/='-') s1' == toStr (seqdata s1)
  102            && filter (/='-') s2' == toStr (seqdata s2)
  103 
  104 -- global <= overlap <= local
  105 -- Global score never exceeds optimal local score
  106 prop_global_local mx g (Eq s1) (Eq s2) =
  107     Q.global_score mx g s1 s2 <= Q.local_score mx g s1 s2
  108 
  109 -- Global score never exceeds optimal overlap score
  110 prop_global_overlap mx g (Eq s1) (Eq s2) =
  111     Q.global_score mx g s1 s2 <= Q.overlap_score mx g s1 s2
  112 
  113 -- Overlap score never exceeds optimal local score
  114 prop_overlap_local mx g (Eq s1) (Eq s2) =
  115     Q.overlap_score mx g s1 s2 <= Q.local_score mx g s1 s2
  116 
  117 prop_overlap_score_align mx g (Eq s1) (Eq s2) =
  118     abs (Q.overlap_score mx g s1 s2 - fst (Q.overlap_align mx g s1 s2)) < 0.1
  119 
  120 prop_equal_scores mx g (Eq s1) = 
  121     let qg = Q.global_score mx g s1 s1
  122         ql = Q.local_score mx g s1 s1
  123         qo = Q.overlap_score mx g s1 s1
  124     in qg == ql && ql == qo
  125 
  126 -- Sinking score with sinking quality - local (same as global)
  127 prop_quality_dec_l mx g (E (Seq h d _)) = let 
  128     q10 = Just $ B.map (const 10) d
  129     q20 = Just $ B.map (const 20) d
  130     in Q.local_score mx g (Seq h d q10) (Seq h d q10) 
  131            <= Q.local_score mx g (Seq h d q20) (Seq h d q20)
  132 
  133 -- Sinking score with sinking quality - global
  134 -- NB: score is always positive (aligning seq. with itself)
  135 prop_quality_dec_g mx g (E (Seq h d _)) = let 
  136     q10 = Just $ B.map (const 10) d
  137     q20 = Just $ B.map (const 20) d
  138     in Q.global_score mx g (Seq h d q10) (Seq h d q10)
  139            <= Q.global_score mx g (Seq h d q20) (Seq h d q20)
  140 
  141 -- Sinking avg score with sinking quality 
  142 -- This is not correct, low quality reduces the penalty for mismatch
  143 -- much harder than the reward for a match.  
  144 -- (E.g. mmmxmmm may score higher with poor quality)
  145 prop_quality_dec2 mx g (E (Seq h d _)) (E (Seq h' d' _)) = let 
  146     q10 = Just $ B.map (const 10) d
  147     q20 = Just $ B.map (const 20) d
  148     (s1,a1) = Q.local_align mx g (Seq h d q10) (Seq h' d' q10) 
  149     (s2,a2) = Q.local_align mx g (Seq h d q20) (Seq h' d' q20)
  150     in if null a1 && null a2 then True 
  151        else s1 / fromIntegral (length a1) <= s2 / fromIntegral (length a2)
  152 
  153 -- not expected to pass(?), but useful to debug indirect alignments
  154 prop_indirect (E s1) (E s2) (E s3) = let a1 = snd $ A.global_align amx g s1 s2
  155                                          a2 = snd $ A.global_align amx g s2 s3
  156                                          a3 = snd $ A.global_align amx g s1 s3
  157                                          i1 = indirect a1 a2
  158                                      in if a3 == i1 then True 
  159                                         else error ("\n"++unlines [showalign a1, showalign a2,"",showalign a3,"",showalign i1])
  160 
  161 prop_indirect_recover (E s1) (E s2) (E s3) = 
  162     let a1 = snd $ A.global_align amx g s1 s2
  163         a2 = snd $ A.global_align amx g s2 s3
  164         i1 = indirect a1 a2
  165         (s1',s3') = toStrings i1
  166         f = filter (/= '-')
  167     in if f s1' == toStr (seqdata s1) && f s3' == toStr (seqdata s3)
  168        then True else error ("\n"++(unlines [showalign a1, showalign a2,"",showalign i1]))
  169 
  170 {-
  171 -- | Affine = Simple when go == ge
  172 prop_affine1 s1 s2 = S.global_align mx g s1 s2 == A.global_align mx (g,g) s1 s2
  173 prop_affine2 s1 s2 = S.global_score mx g s1 s2 == A.global_score mx (g,g) s1 s2
  174 -}
  175 
  176 {-
  177 -- | For a given scoring scheme, scoring must correspond to alignments
  178 prop_score1 mx g (Eq s1) (Eq s2) = 
  179     sum . S.eval mx g . snd . S.global_align mx g s1 $ s2 
  180     == S.global_score mx g s1 s2
  181 -- -> todo: ditto for A and Q, ditto for local
  182 -}
  183 
  184 {-
  185 -- | Any global alignment should return the exact same sequences
  186 prop_recover mx g s1 s2 = undefined
  187 
  188 -- | Qual = Affine when no quality data are supplied
  189 prop_qual s1 s2 = undefined
  190 
  191 prop_local s1 s2 = S.global_score mx g s1 s2 <= S.local_score mx g s1 s2
  192 -- -> todo: ditto for A and Q
  193 
  194 -- | Verify that all columns have the same lenght (ie. it should be a square matrix)
  195 --   (Check vs sequence lengths)  Both QAlign.columns and AlignData.columns
  196 prop_columns = undefined
  197 -}
  198 
  199 
  200 bench :: [Test]
  201 --           .........o.........o.........o
  202 bench = [ T "local score, short" bench_local_rev_s
  203         , T "overlap score, short" bench_overlap_rev_s
  204         , T "global score, short" bench_global_rev_s
  205 
  206         , T "local score, long" bench_local_rev_l
  207         , T "overlap score, long" bench_overlap_rev_l
  208         , T "global score, long" bench_global_rev_l
  209         ]
  210 
  211 bench_global_rev_s (ES x) (ES y) = 
  212     abs (Q.global_score mx g x y
  213          - Q.global_score mx g (revcompl x) (revcompl y)) < 0.1
  214 
  215 bench_local_rev_s (ES x) (ES y) = 
  216     abs (Q.local_score mx g x y
  217          - Q.local_score mx g (revcompl x) (revcompl y)) < 0.1
  218 
  219 bench_overlap_rev_s (ES x'@(Seq h d _)) (ES y'@(Seq h2 d2 _)) = 
  220     let x = Seq h d Nothing
  221         y = Seq h2 d2 Nothing
  222         f = Q.overlap_align mx g x y
  223         r = Q.overlap_align mx g (revcompl x) (revcompl y)
  224     in if abs (fst f-fst r) < 0.1 then True
  225     else error ("\nFwd: "++show (fst f)++"\n"++ showalign (snd f)
  226                    ++"\nRev: "++show (fst r)++"\n"++ showalign (snd r)
  227                    ++"\n"++toStr (seqdata x)++"\n"++(toStr $ seqdata y))
  228 
  229 bench_overlap_rev_l (EL x) (EL y) = 
  230     abs (Q.overlap_score mx g x y
  231          - Q.overlap_score mx g (revcompl x) (revcompl y)) < 0.1
  232 
  233 bench_global_rev_l (EL x) (EL y) = 
  234     abs (Q.global_score mx g x y
  235          - Q.global_score mx g (revcompl x) (revcompl y)) < 0.1
  236 
  237 bench_local_rev_l (EL x) (EL y) = 
  238     abs (Q.local_score mx g x y
  239          - Q.local_score mx g (revcompl x) (revcompl y)) < 0.1