1 {- |
    2    Quality-aware alignments
    3 
    4    Generally, quality data are ignored for alignment\/pattern searching
    5    like Smith-Waterman, Needleman-Wunsch, or BLAST(p|n|x).  I believe
    6    that accounting for quality will at the very least affect things like
    7    BLAST statistics, and e.g. is crucial for good EST annotation using Blastx.
    8 
    9    This module performs sequences alignments, takes quality values into
   10    account.
   11 
   12    See also <http://bioinformatics.oxfordjournals.org/cgi/content/abstract/btn052v1>.
   13 -}
   14 {-# LANGUAGE CPP, ParallelListComp #-}
   15 {-# OPTIONS_GHC -fexcess-precision #-}
   16 
   17 -- #define DEBUG
   18 
   19 module Bio.Alignment.QAlign (
   20    -- * Smith-Waterman
   21    -- | Locally optimal alignment with affine gaps, i.e. best infix match.
   22    local_score, local_align
   23 
   24    -- * Needleman-Wunsch 
   25    -- | Globally optimal alignment with affine gaps, the whole sequences are matched.
   26    , global_score, global_align
   27 
   28    -- * Overlapping alignment.
   29    -- | The suffix of one sequence matches a prefix of another.
   30    , overlap_score, overlap_align
   31 
   32    -- * Matrix construction
   33    , qualMx
   34 
   35    -- * Interactive testing of alignments
   36    , test
   37    ) where
   38 
   39 import Data.List (maximumBy,partition,unfoldr,zip4,tails)
   40 import qualified Data.ByteString.Lazy as B
   41 import qualified Data.ByteString.Lazy.Char8 as BC
   42 import Data.Array.Unboxed
   43 
   44 import Bio.Sequence.SeqData hiding ((!))
   45 import Bio.Alignment.AlignData (Chr,Edit(..),SubstMx,EditList,on,isRepl,showalign)
   46 
   47 -- | The selector must take into account the quality of the sequences
   48 --   on Ins\/Del, the average of qualities surrounding the gap is (should be) used
   49 type QSelector a = [(a,Edit,Qual,Qual)] -> a
   50 
   51 -- using 21 as default corresponds to blastn default ratio (1,-3)
   52 columns :: QSelector a -> a -> Sequence -> Sequence -> [[a]]
   53 columns sel z s1 s2 = columns' sel (z,r0,c0) s1' s2'
   54     where (s1',s2') = (zup s1, zup s2)
   55           zup :: Sequence -> [(Chr,Qual)]
   56           zup (Seq _ sd Nothing) = map (\c -> (c,22)) $ B.unpack sd
   57           zup (Seq _ sd (Just qd)) = zip (B.unpack sd) (B.unpack qd)
   58 
   59           -- the first row consists of increasing numbers of deletions
   60           r0 = map (sel . return)
   61                (zip4 (z:r0) (map (Del . fst) s1') (repeat (snd $ head s2')) (map snd s1'))
   62           -- the first column consists of increasing numbers of insertions
   63           c0 = map (sel . return)
   64                (zip4 (z:c0) (map (Ins . fst) s2') (repeat (snd $ head s1')) (map snd s2'))
   65 
   66 columns' :: QSelector a -> (a,[a],[a]) -> [(Chr,Qual)] -> [(Chr,Qual)] -> [[a]]
   67 columns' f (topleft,top,left) s1 s2 = let
   68         c0 = (topleft : left)
   69         -- given the previous column, and the remains of s2, calculate the next column
   70         mkcol (ts, p0:prev, x) = if null x then Nothing else
   71                let (xi,qi) = head x
   72                    c  = head ts : [f [del,ins,rep]
   73                           | del <- zip4 prev (repeat $ Del xi) (repeat qi) (avg2 $ map snd s2)
   74                           | ins <- zip4 c (map (Ins . fst) s2) (repeat $ head $ avg2 $ map snd x) (map snd s2)
   75                           | rep <- zip4 (p0:prev) (map (Repl xi . fst) s2) (repeat qi) (map snd s2)]
   76                in Just (c, (tail ts, c, tail x))
   77     in c0 : unfoldr mkcol (top,c0,s1)
   78 
   79 avg2 :: [Qual] -> [Qual]
   80 avg2 = map f . tails
   81     where f (x1:x2:_) = (x1+x2) `div` 2
   82           f [x] = x
   83           f _ = error "Nasty - incorrect column lenght"
   84 
   85 
   86 
   87 -- | Minus infinity (or an approximation thereof)
   88 minf :: Double
   89 minf = -100000000
   90 
   91 type QualMx a = Qual -> Qual -> SubstMx a
   92 
   93 qualMx :: Qual -> Qual -> (Chr,Chr) -> Double
   94 qualMx q1 q2 (x,y) = if isN x || isN y then 0.0 else
   95 #ifdef DEBUG
   96                         if q1 < 0 || q1 > 99 || q2 < 0 || q2 > 99
   97                         then error ("Qualities out of range: "++show (q1,q2))
   98                         else
   99 #endif
  100                         if x==y || x+32==y || x-32==y
  101                         then matchtbl!(q1,q2) else mismatchtbl!(q1,q2)
  102   where matchtbl, mismatchtbl :: UArray (Qual,Qual) Double
  103         matchtbl = array ((0,0),(99,99))    [((x,y),adjust True x y) | x <- [0..99], y <- [0..99]]
  104         mismatchtbl = array ((0,0),(99,99)) [((x,y),adjust False x y) | x <- [0..99], y <- [0..99]]
  105         isN c = c `elem` [78,88,110,120]
  106 
  107 adjust :: Bool -> Qual -> Qual -> Double
  108 adjust s q1 q2 =
  109     let fromQual x = 10**(-fromIntegral x/10)
  110         e1 = fromQual q1
  111         e2 = fromQual q2
  112         e  = (e1+e2-4/3*e1*e2)
  113     in logBase 2 (if s then 4*(1-e) else 4/3*e)
  114 
  115 -- ------------------------------------------------------------
  116 -- Edit distances
  117 
  118 -- | Calculate global edit distance (Needleman-Wunsch alignment score)
  119 global_score :: QualMx Double -> (Double,Double) -> Sequence -> Sequence -> Double
  120 global_score mx g s1 s2 = uncurry max . last . last
  121                           $ columns (score_select minf mx g) (0,fst g) s1 s2
  122 
  123 -- | Calculate local edit distance (Smith-Waterman alignment score)
  124 local_score :: QualMx Double -> (Double,Double) -> Sequence -> Sequence -> Double
  125 local_score mx g s1 s2 = maximum . map (uncurry max) . concat
  126                          $ columns (score_select 0 mx g) (0,fst g) s1 s2
  127 
  128 -- | Calucalte best overlap score, where gaps at the edges are free
  129 --   The starting point is like for local score (0 cost for initial indels),
  130 --   the result is the maximum anywhere in the last column or bottom row of the matrix.
  131 
  132 -- Oh. local score_select will not work, since we cannot replace any matrix entry
  133 -- with zero in order to initiate a new alignment.  So we need 'minf', except in the
  134 -- initial row/column.  Damn.
  135 overlap_score :: QualMx Double -> (Double,Double) -> Sequence -> Sequence -> Double
  136 overlap_score mx g s1 s2 = maximum $ map (uncurry max) $ sel cols
  137     where cols   = columns overlap_score_select (0,fst g) s1 s2 
  138           sel cs = map last cs ++ last cs
  139           -- well - edges have less than three options, so we can set them to zero
  140           overlap_score_select cds@[_,_,_] = score_select minf mx g cds
  141           overlap_score_select [_] = (0,minf)
  142 
  143 -- | Generic scoring and selection function for global and local scoring
  144 score_select :: Double -> QualMx Double -> (Double,Double) -> QSelector (Double,Double)
  145 score_select minf mx (go,ge) cds =
  146     let (reps,ids) = partition (isRepl.snd') cds
  147         s = maximum $ minf:[max sub gap + mx q1 q2 (x,y) | ((sub,gap),Repl x y,q1,q2) <- reps]
  148         g = maximum $ minf:[max (sub+go) (gap+ge) | ((sub,gap),_op,_q1,_q2) <- ids]
  149     in s `seq` g `seq` (s,g) -- (s,g) 
  150              -- seq makes local slower, but overlap faster(!?)
  151 
  152 -- ------------------------------------------------------------
  153 -- Alignments (rip from AAlign)
  154 
  155 -- maximum...
  156 max' :: (Double,EditList) -> (Double,EditList) -> (Double,EditList)
  157 max' (x,ax) (y,yx) = if x>=y then (x,ax) else (y,yx)
  158 
  159 -- ... and addition for compound values
  160 fp :: (Double,EditList) -> (Double,Edit) -> (Double,EditList)
  161 fp (x,ax) (s,e) = (x+s,e:ax)
  162 
  163 -- | Calculate global alignment (Needleman-Wunsch)
  164 global_align :: QualMx Double -> (Double,Double) -> Sequence -> Sequence -> (Double,EditList)
  165 global_align mx g s1 s2 = revsnd . uncurry max' . last . last
  166                $ columns (align_select minf mx g) ((0,[]),(fst g,[])) s1 s2
  167 
  168 -- | Calculate local alignment (Smith-Waterman)
  169 --   (can we replace uncurry max' with fst - a local alignment must always end on a subst, no?)
  170 local_align :: QualMx Double -> (Double,Double) -> Sequence -> Sequence -> (Double, EditList)
  171 local_align mx g s1 s2 = revsnd . maximumBy (compare `on` fst)
  172                          . map (uncurry max') . concat
  173                          $ columns (align_select 0 mx g) ((0,[]),(fst g,[])) s1 s2
  174 
  175 -- | Calucalte best overlap score, where gaps at the edges are free
  176 --   The starting point is like for local score (0 cost for initial indels),
  177 --   the result is the maximum anywhere in the last column or bottom row of the matrix.
  178 overlap_align :: QualMx Double -> (Double,Double) -> Sequence -> Sequence -> (Double,EditList)
  179 overlap_align mx g s1 s2 = revsnd . maximumBy (compare `on` fst) . map (uncurry max') $ sel cols
  180     where cols   = columns overlap_align_select ((0,[]),(minf,[])) s1 s2 
  181           sel cs = map last cs ++ last cs
  182           -- again, the edges have less than three options, so we can set them to zero
  183           overlap_align_select cds@[_,_,_] = align_select minf mx g cds
  184           overlap_align_select [_] = ((0,[]),(minf,[]))
  185 
  186 -- | Variant that retains indels to retain the entire sequence in the result
  187 overlap_align' :: QualMx Double -> (Double,Double) -> Sequence -> Sequence -> (Double,EditList)
  188 overlap_align' mx g s1 s2 = revsnd . maximumBy (compare `on` fst) . map (uncurry max') $ sel cols
  189     where cols   = columns overlap_align_select ((0,[]),(fst g,[])) s1 s2 
  190           sel cs = map last cs ++ last cs
  191           -- the old "fewer choices" trick
  192           overlap_align_select cds@[_,_,_] = align_select minf mx g cds
  193           overlap_align_select [(((s1,es1),(s2,es2)),e,_,_)] = ((0,e:es2),(0,e:es2))
  194 
  195 -- (maybe better to reverse the inputs for global?)
  196 revsnd (s,a) = (s,reverse a)
  197 
  198 -- | Generic scoring and selection for global and local alignment
  199 align_select :: Double -> QualMx Double -> (Double,Double) -> QSelector ((Double,EditList),(Double,EditList))
  200 align_select minf mx (go,ge) cds =
  201     let (reps,ids) = partition (isRepl.snd') cds
  202         s = maximumBy (compare `on` fst)
  203           $ (minf,[]):[max' sub gap `fp` (mx q1 q2 (x,y),e) | ((sub,gap),e@(Repl x y),q1,q2) <- reps]
  204         g = maximumBy (compare `on` fst)
  205           $ (minf,[]):[max' (sub `fp` (go,e)) (gap `fp` (ge,e)) | ((sub,gap),e,_q1,_q2) <- ids]
  206     in (s,g)
  207 
  208 snd' (_,x,_,_) = x
  209 
  210 test :: IO ()
  211 test = do
  212   putStrLn "Enter two strings:"
  213   s1' <- getLine
  214   s2' <- getLine
  215   let s1 = Seq (BC.pack "foo") (BC.pack s1') Nothing
  216       s2 = Seq (BC.pack "bar") (BC.pack s2') Nothing
  217       mx = qualMx
  218       g  = (-5,-2)
  219   let ga = global_align mx g s1 s2
  220       la = local_align mx g s1 s2
  221       oa = overlap_align mx g s1 s2
  222       or = overlap_align mx g (revcompl s1) (revcompl s2)
  223   putStrLn ("GLOBAL:   " ++ show (fst ga))
  224   putStrLn $ showalign $ snd ga
  225   putStrLn ("OVERLAP:  " ++ show (fst oa))
  226   putStrLn $ showalign $ snd oa
  227   putStrLn ("OVERLAP (rc):  " ++ show (fst or))
  228   putStrLn $ showalign $ snd or
  229   putStrLn ("LOCAL:    " ++ show (fst la))
  230   putStrLn $ showalign $ snd la