1 {- | Data structures and helper functions for calculating alignments 2 3 There are two ways to view an alignment: either as a list of edits 4 (i.e., insertions, deletions, or substitutions), or as a set of sequences 5 with inserted gaps. 6 7 The edit list approach is perhaps more restrictive model but doesn't generalize 8 to multiple alignments. 9 10 The gap approach is more general, and probably more commonly used by other software 11 (see e.g. the ACE file format). 12 13 -} 14 15 {-# OPTIONS -fglasgow-exts #-} 16 17 module Bio.Alignment.AlignData ( 18 -- * Data types for gap-based alignemnts 19 Dir(..), Gaps, Alignment 20 -- * Helper functions 21 , extractGaps, insertGaps 22 -- * Data types for edit-based alignments 23 , Edit(..), EditList, SubstMx, Selector, Chr 24 -- * Helper functions 25 , columns, eval, isRepl, on 26 , showalign, toStrings 27 ) where 28 29 import qualified Data.ByteString.Lazy as B 30 import qualified Data.ByteString.Lazy.Char8 as BC 31 import Bio.Sequence.SeqData 32 import Data.List (unfoldr) 33 import Data.Word 34 import Data.Char (chr) 35 36 -- ---------------------------------------- 37 38 data Dir = Fwd | Rev deriving (Eq,Show) 39 type Gaps = [Offset] 40 type Alignment = [(Offset,Dir,Sequence,Gaps)] 41 42 -- | Gaps are coded as '*'s, this function removes them, and returns 43 -- the sequence along with the list of gap positions. 44 extractGaps :: SeqData -> (SeqData,Gaps) 45 extractGaps str = (BC.filter (/='*') str,BC.elemIndices '*' str) 46 47 -- todo: faster to lift concat out of the inner loop? 48 insertGaps :: Char -> (SeqData,Gaps) -> SeqData 49 insertGaps c (str',gaps) = go str' B.empty 0 gaps 50 where go str acc p (next:rest) = let (a,b) = BC.splitAt (next-p) str 51 in go b (BC.concat [acc,a,BC.pack [c]]) (next+1) rest 52 go str acc _ [] = BC.append acc str 53 54 -- ---------------------------------------- 55 56 -- Q&D helper function 57 showalign a = let (s1,s2) = toStrings a in s1++"\n"++s2 58 59 -- | turn an alignment into sequences with '-' representing gaps 60 -- (for checking, filtering out the '-' characters should return 61 -- the original sequences, provided '-' isn't part of the sequence 62 -- alphabet) 63 toStrings :: EditList -> (String,String) 64 toStrings [] = ("","") 65 toStrings (x:xs) = let (a1',a2') = toStrings xs 66 chr' = chr . fromIntegral 67 in case x of Ins c -> ('-':a1', chr' c:a2') 68 Del c -> (chr' c:a1', '-':a2') 69 Repl c1 c2 -> (chr' c1:a1', chr' c2:a2') 70 71 -- | The sequence element type, used in alignments. 72 type Chr = Word8 73 74 -- | An Edit is either the insertion, the deletion, 75 -- or the replacement of a character. 76 data Edit = Ins Chr | Del Chr | Repl Chr Chr deriving (Show,Eq) 77 78 -- | An alignment is a sequence of edits. 79 type EditList = [Edit] 80 81 -- | True if the Edit is a Repl. 82 isRepl :: Edit -> Bool 83 isRepl (Repl _ _) = True 84 isRepl _ = False 85 86 -- | A substitution matrix gives scores for replacing a character with another. 87 -- Typically, it will be symmetric. 88 type SubstMx a = (Chr,Chr) -> a 89 90 -- | Evaluate an Edit based on SubstMx and gap penalty 91 eval :: SubstMx a -> a -> Edit -> a 92 eval mx g c = case c of Ins _ -> g; Del _ -> g; Repl x y -> mx (x,y) 93 94 -- | A Selector consists of a zero element, and a funcition 95 -- that chooses a possible Edit operation, and generates an updated result. 96 type Selector a = [(a,Edit)] -> a 97 98 -- ------------------------------------------------------------ 99 -- | Calculate a set of columns containing scores 100 -- This represents the columns of the alignment matrix, but will only require linear space 101 -- for score calculation. 102 columns :: Selector a -> a -> Sequence -> Sequence -> [[a]] 103 columns f z (Seq _ s1 _) (Seq _ s2 _) = columns' f z s1 s2 104 105 columns' :: Selector a -> a -> SeqData -> SeqData -> [[a]] 106 columns' f zero s1 s2 = let 107 -- the first column consists of increasing numbers of insertions 108 c0 = zero : map (f.return) (zip c0 (map Ins (B.unpack s2))) 109 -- given the previous column, and the remains of s2, calculate the next column 110 mkcol (p0:prev,x) = if B.null x then Nothing 111 else let xi = B.head x 112 ys = B.unpack s2 113 c = f [(p0,Del xi)] : [f [del,ins,rep] | del <- zip prev $ repeat (Del xi) 114 | ins <- zip c $ map Ins ys 115 | rep <- zip (p0:prev) $ map (Repl xi) ys] 116 in Just (c,(c,B.tail x)) 117 in c0 : unfoldr mkcol (c0,s1) 118 119 on c f x y = c (f x) (f y)