1 {- | Data structures and helper functions for calculating alignments
    2 
    3    There are two ways to view an alignment: either as a list of edits
    4    (i.e., insertions, deletions, or substitutions), or as a set of sequences
    5    with inserted gaps.
    6 
    7    The edit list approach is perhaps more restrictive model but doesn't generalize
    8    to multiple alignments.
    9 
   10    The gap approach is more general, and probably more commonly used by other software
   11    (see e.g. the ACE file format).
   12 
   13 -}
   14 
   15 {-# OPTIONS -fglasgow-exts #-}
   16 
   17 module Bio.Alignment.AlignData (
   18     -- * Data types for gap-based alignemnts
   19     Dir(..), Gaps, Alignment
   20     -- * Helper functions
   21     , extractGaps, insertGaps
   22     -- * Data types for edit-based alignments
   23     , Edit(..), EditList, SubstMx, Selector, Chr
   24     -- * Helper functions
   25     , columns, eval, isRepl, on
   26     , showalign, toStrings
   27     ) where
   28 
   29 import qualified Data.ByteString.Lazy as B
   30 import qualified Data.ByteString.Lazy.Char8 as BC
   31 import Bio.Sequence.SeqData
   32 import Data.List (unfoldr)
   33 import Data.Word
   34 import Data.Char (chr)
   35 
   36 -- ----------------------------------------
   37 
   38 data Dir = Fwd | Rev deriving (Eq,Show)
   39 type Gaps = [Offset]
   40 type Alignment = [(Offset,Dir,Sequence,Gaps)]
   41 
   42 -- | Gaps are coded as '*'s, this function removes them, and returns
   43 --   the sequence along with the list of gap positions.
   44 extractGaps :: SeqData -> (SeqData,Gaps)
   45 extractGaps str = (BC.filter (/='*') str,BC.elemIndices '*' str)
   46 
   47 -- todo: faster to lift concat out of the inner loop?
   48 insertGaps :: Char -> (SeqData,Gaps) -> SeqData
   49 insertGaps c (str',gaps) = go str' B.empty 0 gaps
   50     where go str acc p (next:rest) = let (a,b) = BC.splitAt (next-p) str
   51                                      in go b (BC.concat [acc,a,BC.pack [c]]) (next+1) rest
   52           go str acc _ [] = BC.append acc str
   53 
   54 -- ----------------------------------------
   55 
   56 -- Q&D helper function
   57 showalign a = let (s1,s2) = toStrings a in s1++"\n"++s2
   58 
   59 -- | turn an alignment into sequences with '-' representing gaps
   60 -- (for checking, filtering out the '-' characters should return
   61 -- the original sequences, provided '-' isn't part of the sequence
   62 -- alphabet)
   63 toStrings :: EditList -> (String,String)
   64 toStrings [] = ("","")
   65 toStrings (x:xs) = let (a1',a2') = toStrings xs
   66                        chr' = chr . fromIntegral
   67                    in case x of Ins c -> ('-':a1', chr' c:a2')
   68                                 Del c -> (chr' c:a1', '-':a2')
   69                                 Repl c1 c2 -> (chr' c1:a1', chr' c2:a2')
   70 
   71 -- | The sequence element type, used in alignments.
   72 type Chr = Word8
   73 
   74 -- | An Edit is either the insertion, the deletion,
   75 --   or the replacement of a character.
   76 data Edit = Ins Chr | Del Chr | Repl Chr Chr deriving (Show,Eq)
   77 
   78 -- | An alignment is a sequence of edits.
   79 type EditList = [Edit]
   80 
   81 -- | True if the Edit is a Repl.
   82 isRepl :: Edit -> Bool
   83 isRepl (Repl _ _) = True
   84 isRepl _ = False
   85 
   86 -- | A substitution matrix gives scores for replacing a character with another.
   87 --   Typically, it will be symmetric.
   88 type SubstMx a = (Chr,Chr) -> a
   89 
   90 -- | Evaluate an Edit based on SubstMx and gap penalty
   91 eval :: SubstMx a -> a -> Edit -> a
   92 eval mx g c = case c of Ins _ -> g; Del _ -> g; Repl x y -> mx (x,y)
   93 
   94 -- | A Selector consists of a zero element, and a funcition
   95 --   that chooses a possible Edit operation, and generates an updated result.
   96 type Selector a = [(a,Edit)] -> a
   97 
   98 -- ------------------------------------------------------------
   99 -- | Calculate a set of columns containing scores
  100 --   This represents the columns of the alignment matrix, but will only require linear space
  101 --   for score calculation.
  102 columns :: Selector a -> a -> Sequence -> Sequence -> [[a]]
  103 columns f z (Seq _ s1 _) (Seq _ s2 _) = columns' f z s1 s2
  104 
  105 columns' :: Selector a -> a -> SeqData -> SeqData -> [[a]]
  106 columns' f zero s1 s2 = let
  107         -- the first column consists of increasing numbers of insertions
  108         c0 = zero : map (f.return) (zip c0 (map Ins (B.unpack s2)))
  109         -- given the previous column, and the remains of s2, calculate the next column
  110         mkcol (p0:prev,x) = if B.null x then Nothing
  111                             else let xi = B.head x
  112                                      ys = B.unpack s2
  113                                      c  = f [(p0,Del xi)] : [f [del,ins,rep] | del <- zip prev $ repeat (Del xi)
  114                                                                              | ins <- zip c $ map Ins ys
  115                                                                              | rep <- zip (p0:prev) $ map (Repl xi) ys]
  116                                  in Just (c,(c,B.tail x))
  117     in c0 : unfoldr mkcol (c0,s1)
  118 
  119 on c f x y = c (f x) (f y)