1 {- |
    2    Data structures for manipulating (biological) sequences.
    3 
    4    Generally supports both nucleotide and protein sequences, some functions,
    5    like @revcompl@, only makes sense for nucleotides.
    6 -}
    7 
    8 module Bio.Sequence.SeqData 
    9     (
   10       -- * Data structure
   11       -- | A sequence is a header, sequence data itself, and optional quality data.
   12       --   All items are lazy bytestrings.  The Offset type can be used for indexing.
   13       Sequence(..), Offset, SeqData,
   14 
   15       -- | Quality data is normally associated with nucleotide sequences
   16       Qual, QualData
   17 
   18       -- * Accessor functions
   19     , (!), seqlength,seqlabel,seqheader,seqdata
   20     , (?), hasqual, seqqual
   21 
   22       -- * Adding information to header
   23     , appendHeader, setHeader
   24 
   25       -- * Converting to and from [Char]
   26     , fromStr, toStr
   27 
   28       -- * Nucleotide functionality
   29       -- | Nucleotide sequences contain the alphabet [A,C,G,T].
   30       -- IUPAC specifies an extended nucleotide alphabet with wildcards, but
   31       -- it is not supported at this point.
   32     , compl, revcompl
   33 
   34       -- * Protein functionality
   35       -- | Proteins are chains of amino acids, represented by the IUPAC alphabet.
   36     , Amino(..), translate, fromIUPAC, toIUPAC -- amino acids
   37     ) where
   38 
   39 import Data.List (unfoldr)
   40 import Data.Char (toUpper)
   41 import Data.Int
   42 import Data.Word
   43 import Data.Maybe (fromJust, isJust)
   44 import qualified Data.ByteString.Lazy.Char8 as B
   45 import qualified Data.ByteString.Lazy as BB
   46 
   47 
   48 -- | An offset, index, or length of a 'SeqData'
   49 type Offset  = Int64
   50 
   51 -- | The basic data type used in 'Sequence's
   52 type SeqData = B.ByteString
   53 
   54 -- | Basic type for quality data.  Range 0..255.  Typical Phred output is in
   55 --   the range 6..50, with 20 as the line in the sand separating good from bad.
   56 type Qual = Word8
   57 
   58 -- | Quality data is a 'Qual' vector, currently implemented as a 'ByteString'.
   59 type QualData = BB.ByteString -- mild abuse
   60 
   61 -- | A sequence consists of a header, the sequence data itself, and optional quality data.
   62 data Sequence = Seq !SeqData !SeqData !(Maybe QualData) -- ^ header and actual sequence
   63                 deriving (Show,Eq)
   64 
   65 -- | Convert a String to 'SeqData'
   66 fromStr :: String -> SeqData
   67 fromStr = B.pack
   68 
   69 -- | Convert a 'SeqData' to a String
   70 toStr :: SeqData -> String
   71 toStr   = B.unpack
   72 
   73 -- | Read the character at the specified position in the sequence.
   74 {-# INLINE (!) #-}
   75 (!) :: Sequence -> Offset -> Char
   76 (!) (Seq _ bs _) = B.index bs
   77 
   78 {-# INLINE (?) #-}
   79 (?) :: Sequence -> Offset -> Qual
   80 (?) (Seq _ _ (Just qs)) = BB.index qs
   81 (?) (Seq _ _ Nothing) = 
   82     error "Attempting to use 'seqqual' on sequence without associated quality data"
   83 
   84 -- | Return sequence length.
   85 seqlength :: Sequence -> Offset
   86 seqlength (Seq _ bs _) = B.length bs
   87 
   88 -- | Return sequence label (first word of header)
   89 seqlabel :: Sequence -> SeqData
   90 seqlabel (Seq l _ _) = head (B.words l)
   91 
   92 -- | Return full header.
   93 seqheader :: Sequence -> SeqData
   94 seqheader (Seq l _ _) = l
   95 
   96 -- | Return the sequence data.
   97 seqdata :: Sequence -> SeqData
   98 seqdata (Seq _ d _) = d
   99 
  100 -- | Return the quality data, or error if none exist.  Use hasqual if in doubt.
  101 seqqual :: Sequence -> QualData
  102 seqqual (Seq _ _ (Just q)) = q
  103 seqqual (Seq _ _ Nothing) = 
  104     error "Attempting to use 'seqqual' on sequence without associated quality data"
  105 
  106 -- | Check whether the sequence has associated quality data.
  107 hasqual :: Sequence -> Bool
  108 hasqual (Seq _ _ q) = isJust q
  109 
  110 -- | Modify the header by appending text, or by replacing
  111 --   all but the sequence label (i.e. first word).
  112 appendHeader, setHeader :: Sequence -> String -> Sequence
  113 appendHeader (Seq h d q) t = (Seq (B.append h (B.pack (" "++t))) d q)
  114 setHeader (Seq h d q) t = (Seq (B.append (head $ B.words h) (B.pack (" "++t))) d q)
  115 
  116 ------------------------------------------------------------
  117 -- Nucleotide (DNA, RNA) specific stuff
  118 ------------------------------------------------------------
  119 
  120 -- | Calculate the reverse complement.
  121 --   This is only relevant for the nucleotide alphabet, 
  122 --   and it leaves other characters unmodified.  
  123 revcompl :: Sequence -> Sequence
  124 revcompl (Seq l bs qs) = Seq l (B.reverse $ B.map compl bs)
  125                         (maybe Nothing (Just . BB.reverse) qs)
  126 
  127 -- | Complement a single character.  I.e. identify the nucleotide it 
  128 --   can hybridize with.  Note that for multiple nucleotides, you usually
  129 --   want the reverse complement (see 'revcompl' for that).
  130 compl :: Char -> Char
  131 compl 'A' = 'T'
  132 compl 'T' = 'A'
  133 compl 'C' = 'G'
  134 compl 'G' = 'C'
  135 compl 'a' = 't'
  136 compl 't' = 'a'
  137 compl 'c' = 'g'
  138 compl 'g' = 'c'
  139 compl  x  =  x
  140 
  141 ------------------------------------------------------------
  142 -- Amino acid (protein) stuff
  143 ------------------------------------------------------------
  144 
  145 -- | Translate a nucleotide sequence into the corresponding protein
  146 --   sequence.  This works rather blindly, with no attempt to identify ORFs
  147 --   or otherwise QA the result.
  148 translate :: Sequence -> Offset -> [Amino]
  149 translate s' o' = unfoldr triples (s',o')
  150    where triples (s,o) = 
  151              if o > seqlength s - 3 then Nothing
  152              else Just (trans1 (map (s!) [o,o+1,o+2]),(s,o+3))
  153 
  154 -- prop_trans s = tail (translate s 0) == translate s 3
  155 
  156 trans1 :: String -> Amino
  157 trans1 = maybe Xaa id . flip lookup trans_tbl . map (repUT . toUpper)
  158     where repUT x = if x == 'U' then 'T' else x -- RNA uses U for T
  159 
  160 data Amino = Ala | Arg | Asn | Asp | Cys | Gln | Glu | Gly
  161            | His | Ile | Leu | Lys | Met | Phe | Pro | Ser
  162            | Thr | Tyr | Trp | Val 
  163            | STP | Asx | Glx | Xle | Xaa -- unknowns
  164      deriving (Show,Eq)
  165 
  166 trans_tbl :: [(String,Amino)]
  167 trans_tbl = [("GCT",Ala),("GCC",Ala),("GCA",Ala),("GCG",Ala),
  168              ("CGT",Arg),("CGC",Arg),("CGA",Arg),("CGG",Arg),("AGA",Arg),("AGG",Arg),
  169              ("AAT",Asn),("AAC",Asn),
  170              ("GAT",Asp),("GAC",Asp),
  171 --             ("RAT",Asx),("RAC",Asx), -- IUPAC: R is purine (A or G)
  172              ("TGT",Cys),("TGC",Cys),
  173              ("CAA",Gln),("CAG",Gln),
  174              ("GAA",Glu),("GAG",Glu),
  175 --             ("SAA",Glx),("SAG",Glx), -- IUPAC: S is C or G
  176              ("GGT",Gly),("GGC",Gly),("GGA",Gly),("GGG",Gly),
  177              ("CAT",His),("CAC",His),
  178              ("ATT",Ile),("ATC",Ile),("ATA",Ile),
  179              ("TTA",Leu),("TTG",Leu),("CTT",Leu),("CTC",Leu),("CTA",Leu),("CTG",Leu),
  180              ("AAA",Lys),("AAG",Lys),
  181              ("ATG",Met),
  182              ("TTT",Phe),("TTC",Phe),
  183              ("CCT",Pro),("CCC",Pro),("CCA",Pro),("CCG",Pro),
  184              ("TCT",Ser),("TCC",Ser),("TCA",Ser),("TCG",Ser),("AGT",Ser),("AGC",Ser),
  185              ("ACT",Thr),("ACC",Thr),("ACA",Thr),("ACG",Thr),
  186              ("TAT",Tyr),("TAC",Tyr),
  187              ("TGG",Trp),
  188              ("GTT",Val),("GTC",Val),("GTA",Val),("GTG",Val),
  189              ("TAG",STP),("TGA",STP),("TAA",STP)
  190             ]
  191 -- todo: extend with more IUPAC nucleotide wildcards?
  192 
  193 -- | Convert a list of amino acids to a sequence in IUPAC format.
  194 toIUPAC :: [Amino] -> SeqData
  195 toIUPAC = B.pack . map (fromJust . flip lookup iupac)
  196 
  197 -- | Convert a sequence in IUPAC format to a list of amino acids.
  198 fromIUPAC :: SeqData -> [Amino]
  199 fromIUPAC = map (maybe Xaa id . flip lookup iupac') . B.unpack
  200 
  201 iupac :: [(Amino,Char)]
  202 iupac = [(Ala,'A')        ,(Arg,'R')        ,(Asn,'N')
  203         ,(Asp,'D')        ,(Cys,'C')        ,(Gln,'Q')
  204         ,(Glu,'E')        ,(Gly,'G')        ,(His,'H')
  205         ,(Ile,'I')        ,(Leu,'L')        ,(Lys,'K')
  206         ,(Met,'M')        ,(Phe,'F')        ,(Pro,'P')
  207         ,(Ser,'S')        ,(Thr,'T')        ,(Tyr,'Y')
  208         ,(Trp,'W')        ,(Val,'V')
  209         ,(Asx,'B') -- Asn or Asp
  210         ,(Glx,'Z') -- Gln or Glu
  211         ,(Xle,'J') -- Ile or Leu
  212         ,(STP,'X')        ,(Xaa,'X')
  213         ]
  214 iupac' :: [(Char,Amino)]
  215 iupac' = map (\(a,b)->(b,a)) iupac