1 {- | 2 Quality-aware alignments 3 4 Generally, quality data are ignored for alignment\/pattern searching 5 like Smith-Waterman, Needleman-Wunsch, or BLAST(p|n|x). I believe 6 that accounting for quality will at the very least affect things like 7 BLAST statistics, and e.g. is crucial for good EST annotation using Blastx. 8 9 This module performs sequences alignments, takes quality values into 10 account. 11 12 See also <http://bioinformatics.oxfordjournals.org/cgi/content/abstract/btn052v1>. 13 -} 14 {-# LANGUAGE CPP, ParallelListComp #-} 15 {-# OPTIONS_GHC -fexcess-precision #-} 16 17 -- #define DEBUG 18 19 module Bio.Alignment.QAlign ( 20 -- * Smith-Waterman 21 -- | Locally optimal alignment with affine gaps, i.e. best infix match. 22 local_score, local_align 23 24 -- * Needleman-Wunsch 25 -- | Globally optimal alignment with affine gaps, the whole sequences are matched. 26 , global_score, global_align 27 28 -- * Overlapping alignment. 29 -- | The suffix of one sequence matches a prefix of another. 30 , overlap_score, overlap_align 31 32 -- * Matrix construction 33 , qualMx 34 35 -- * Interactive testing of alignments 36 , test 37 ) where 38 39 import Data.List (maximumBy,partition,unfoldr,zip4,tails) 40 import qualified Data.ByteString.Lazy as B 41 import qualified Data.ByteString.Lazy.Char8 as BC 42 import Data.Array.Unboxed 43 44 import Bio.Sequence.SeqData hiding ((!)) 45 import Bio.Alignment.AlignData (Chr,Edit(..),SubstMx,EditList,on,isRepl,showalign) 46 47 -- | The selector must take into account the quality of the sequences 48 -- on Ins\/Del, the average of qualities surrounding the gap is (should be) used 49 type QSelector a = [(a,Edit,Qual,Qual)] -> a 50 51 -- using 21 as default corresponds to blastn default ratio (1,-3) 52 columns :: QSelector a -> a -> Sequence -> Sequence -> [[a]] 53 columns sel z s1 s2 = columns' sel (z,r0,c0) s1' s2' 54 where (s1',s2') = (zup s1, zup s2) 55 zup :: Sequence -> [(Chr,Qual)] 56 zup (Seq _ sd Nothing) = map (\c -> (c,22)) $ B.unpack sd 57 zup (Seq _ sd (Just qd)) = zip (B.unpack sd) (B.unpack qd) 58 59 -- the first row consists of increasing numbers of deletions 60 r0 = map (sel . return) 61 (zip4 (z:r0) (map (Del . fst) s1') (repeat (snd $ head s2')) (map snd s1')) 62 -- the first column consists of increasing numbers of insertions 63 c0 = map (sel . return) 64 (zip4 (z:c0) (map (Ins . fst) s2') (repeat (snd $ head s1')) (map snd s2')) 65 66 columns' :: QSelector a -> (a,[a],[a]) -> [(Chr,Qual)] -> [(Chr,Qual)] -> [[a]] 67 columns' f (topleft,top,left) s1 s2 = let 68 c0 = (topleft : left) 69 -- given the previous column, and the remains of s2, calculate the next column 70 mkcol (ts, p0:prev, x) = if null x then Nothing else 71 let (xi,qi) = head x 72 c = head ts : [f [del,ins,rep] 73 | del <- zip4 prev (repeat $ Del xi) (repeat qi) (avg2 $ map snd s2) 74 | ins <- zip4 c (map (Ins . fst) s2) (repeat $ head $ avg2 $ map snd x) (map snd s2) 75 | rep <- zip4 (p0:prev) (map (Repl xi . fst) s2) (repeat qi) (map snd s2)] 76 in Just (c, (tail ts, c, tail x)) 77 in c0 : unfoldr mkcol (top,c0,s1) 78 79 avg2 :: [Qual] -> [Qual] 80 avg2 = map f . tails 81 where f (x1:x2:_) = (x1+x2) `div` 2 82 f [x] = x 83 f _ = error "Nasty - incorrect column lenght" 84 85 86 87 -- | Minus infinity (or an approximation thereof) 88 minf :: Double 89 minf = -100000000 90 91 type QualMx a = Qual -> Qual -> SubstMx a 92 93 qualMx :: Qual -> Qual -> (Chr,Chr) -> Double 94 qualMx q1 q2 (x,y) = if isN x || isN y then 0.0 else 95 #ifdef DEBUG 96 if q1 < 0 || q1 > 99 || q2 < 0 || q2 > 99 97 then error ("Qualities out of range: "++show (q1,q2)) 98 else 99 #endif 100 if x==y || x+32==y || x-32==y 101 then matchtbl!(q1,q2) else mismatchtbl!(q1,q2) 102 where matchtbl, mismatchtbl :: UArray (Qual,Qual) Double 103 matchtbl = array ((0,0),(99,99)) [((x,y),adjust True x y) | x <- [0..99], y <- [0..99]] 104 mismatchtbl = array ((0,0),(99,99)) [((x,y),adjust False x y) | x <- [0..99], y <- [0..99]] 105 isN c = c `elem` [78,88,110,120] 106 107 adjust :: Bool -> Qual -> Qual -> Double 108 adjust s q1 q2 = 109 let fromQual x = 10**(-fromIntegral x/10) 110 e1 = fromQual q1 111 e2 = fromQual q2 112 e = (e1+e2-4/3*e1*e2) 113 in logBase 2 (if s then 4*(1-e) else 4/3*e) 114 115 -- ------------------------------------------------------------ 116 -- Edit distances 117 118 -- | Calculate global edit distance (Needleman-Wunsch alignment score) 119 global_score :: QualMx Double -> (Double,Double) -> Sequence -> Sequence -> Double 120 global_score mx g s1 s2 = uncurry max . last . last 121 $ columns (score_select minf mx g) (0,fst g) s1 s2 122 123 -- | Calculate local edit distance (Smith-Waterman alignment score) 124 local_score :: QualMx Double -> (Double,Double) -> Sequence -> Sequence -> Double 125 local_score mx g s1 s2 = maximum . map (uncurry max) . concat 126 $ columns (score_select 0 mx g) (0,fst g) s1 s2 127 128 -- | Calucalte best overlap score, where gaps at the edges are free 129 -- The starting point is like for local score (0 cost for initial indels), 130 -- the result is the maximum anywhere in the last column or bottom row of the matrix. 131 132 -- Oh. local score_select will not work, since we cannot replace any matrix entry 133 -- with zero in order to initiate a new alignment. So we need 'minf', except in the 134 -- initial row/column. Damn. 135 overlap_score :: QualMx Double -> (Double,Double) -> Sequence -> Sequence -> Double 136 overlap_score mx g s1 s2 = maximum $ map (uncurry max) $ sel cols 137 where cols = columns overlap_score_select (0,fst g) s1 s2 138 sel cs = map last cs ++ last cs 139 -- well - edges have less than three options, so we can set them to zero 140 overlap_score_select cds@[_,_,_] = score_select minf mx g cds 141 overlap_score_select [_] = (0,minf) 142 143 -- | Generic scoring and selection function for global and local scoring 144 score_select :: Double -> QualMx Double -> (Double,Double) -> QSelector (Double,Double) 145 score_select minf mx (go,ge) cds = 146 let (reps,ids) = partition (isRepl.snd') cds 147 s = maximum $ minf:[max sub gap + mx q1 q2 (x,y) | ((sub,gap),Repl x y,q1,q2) <- reps] 148 g = maximum $ minf:[max (sub+go) (gap+ge) | ((sub,gap),_op,_q1,_q2) <- ids] 149 in s `seq` g `seq` (s,g) -- (s,g) 150 -- seq makes local slower, but overlap faster(!?) 151 152 -- ------------------------------------------------------------ 153 -- Alignments (rip from AAlign) 154 155 -- maximum... 156 max' :: (Double,EditList) -> (Double,EditList) -> (Double,EditList) 157 max' (x,ax) (y,yx) = if x>=y then (x,ax) else (y,yx) 158 159 -- ... and addition for compound values 160 fp :: (Double,EditList) -> (Double,Edit) -> (Double,EditList) 161 fp (x,ax) (s,e) = (x+s,e:ax) 162 163 -- | Calculate global alignment (Needleman-Wunsch) 164 global_align :: QualMx Double -> (Double,Double) -> Sequence -> Sequence -> (Double,EditList) 165 global_align mx g s1 s2 = revsnd . uncurry max' . last . last 166 $ columns (align_select minf mx g) ((0,[]),(fst g,[])) s1 s2 167 168 -- | Calculate local alignment (Smith-Waterman) 169 -- (can we replace uncurry max' with fst - a local alignment must always end on a subst, no?) 170 local_align :: QualMx Double -> (Double,Double) -> Sequence -> Sequence -> (Double, EditList) 171 local_align mx g s1 s2 = revsnd . maximumBy (compare `on` fst) 172 . map (uncurry max') . concat 173 $ columns (align_select 0 mx g) ((0,[]),(fst g,[])) s1 s2 174 175 -- | Calucalte best overlap score, where gaps at the edges are free 176 -- The starting point is like for local score (0 cost for initial indels), 177 -- the result is the maximum anywhere in the last column or bottom row of the matrix. 178 overlap_align :: QualMx Double -> (Double,Double) -> Sequence -> Sequence -> (Double,EditList) 179 overlap_align mx g s1 s2 = revsnd . maximumBy (compare `on` fst) . map (uncurry max') $ sel cols 180 where cols = columns overlap_align_select ((0,[]),(minf,[])) s1 s2 181 sel cs = map last cs ++ last cs 182 -- again, the edges have less than three options, so we can set them to zero 183 overlap_align_select cds@[_,_,_] = align_select minf mx g cds 184 overlap_align_select [_] = ((0,[]),(minf,[])) 185 186 -- | Variant that retains indels to retain the entire sequence in the result 187 overlap_align' :: QualMx Double -> (Double,Double) -> Sequence -> Sequence -> (Double,EditList) 188 overlap_align' mx g s1 s2 = revsnd . maximumBy (compare `on` fst) . map (uncurry max') $ sel cols 189 where cols = columns overlap_align_select ((0,[]),(fst g,[])) s1 s2 190 sel cs = map last cs ++ last cs 191 -- the old "fewer choices" trick 192 overlap_align_select cds@[_,_,_] = align_select minf mx g cds 193 overlap_align_select [(((s1,es1),(s2,es2)),e,_,_)] = ((0,e:es2),(0,e:es2)) 194 195 -- (maybe better to reverse the inputs for global?) 196 revsnd (s,a) = (s,reverse a) 197 198 -- | Generic scoring and selection for global and local alignment 199 align_select :: Double -> QualMx Double -> (Double,Double) -> QSelector ((Double,EditList),(Double,EditList)) 200 align_select minf mx (go,ge) cds = 201 let (reps,ids) = partition (isRepl.snd') cds 202 s = maximumBy (compare `on` fst) 203 $ (minf,[]):[max' sub gap `fp` (mx q1 q2 (x,y),e) | ((sub,gap),e@(Repl x y),q1,q2) <- reps] 204 g = maximumBy (compare `on` fst) 205 $ (minf,[]):[max' (sub `fp` (go,e)) (gap `fp` (ge,e)) | ((sub,gap),e,_q1,_q2) <- ids] 206 in (s,g) 207 208 snd' (_,x,_,_) = x 209 210 test :: IO () 211 test = do 212 putStrLn "Enter two strings:" 213 s1' <- getLine 214 s2' <- getLine 215 let s1 = Seq (BC.pack "foo") (BC.pack s1') Nothing 216 s2 = Seq (BC.pack "bar") (BC.pack s2') Nothing 217 mx = qualMx 218 g = (-5,-2) 219 let ga = global_align mx g s1 s2 220 la = local_align mx g s1 s2 221 oa = overlap_align mx g s1 s2 222 or = overlap_align mx g (revcompl s1) (revcompl s2) 223 putStrLn ("GLOBAL: " ++ show (fst ga)) 224 putStrLn $ showalign $ snd ga 225 putStrLn ("OVERLAP: " ++ show (fst oa)) 226 putStrLn $ showalign $ snd oa 227 putStrLn ("OVERLAP (rc): " ++ show (fst or)) 228 putStrLn $ showalign $ snd or 229 putStrLn ("LOCAL: " ++ show (fst la)) 230 putStrLn $ showalign $ snd la