1 {- | 2 Data structures for manipulating (biological) sequences. 3 4 Generally supports both nucleotide and protein sequences, some functions, 5 like @revcompl@, only makes sense for nucleotides. 6 -} 7 8 module Bio.Sequence.SeqData 9 ( 10 -- * Data structure 11 -- | A sequence is a header, sequence data itself, and optional quality data. 12 -- All items are lazy bytestrings. The Offset type can be used for indexing. 13 Sequence(..), Offset, SeqData, 14 15 -- | Quality data is normally associated with nucleotide sequences 16 Qual, QualData 17 18 -- * Accessor functions 19 , (!), seqlength,seqlabel,seqheader,seqdata 20 , (?), hasqual, seqqual 21 22 -- * Adding information to header 23 , appendHeader, setHeader 24 25 -- * Converting to and from [Char] 26 , fromStr, toStr 27 28 -- * Nucleotide functionality 29 -- | Nucleotide sequences contain the alphabet [A,C,G,T]. 30 -- IUPAC specifies an extended nucleotide alphabet with wildcards, but 31 -- it is not supported at this point. 32 , compl, revcompl 33 34 -- * Protein functionality 35 -- | Proteins are chains of amino acids, represented by the IUPAC alphabet. 36 , Amino(..), translate, fromIUPAC, toIUPAC -- amino acids 37 ) where 38 39 import Data.List (unfoldr) 40 import Data.Char (toUpper) 41 import Data.Int 42 import Data.Word 43 import Data.Maybe (fromJust, isJust) 44 import qualified Data.ByteString.Lazy.Char8 as B 45 import qualified Data.ByteString.Lazy as BB 46 47 48 -- | An offset, index, or length of a 'SeqData' 49 type Offset = Int64 50 51 -- | The basic data type used in 'Sequence's 52 type SeqData = B.ByteString 53 54 -- | Basic type for quality data. Range 0..255. Typical Phred output is in 55 -- the range 6..50, with 20 as the line in the sand separating good from bad. 56 type Qual = Word8 57 58 -- | Quality data is a 'Qual' vector, currently implemented as a 'ByteString'. 59 type QualData = BB.ByteString -- mild abuse 60 61 -- | A sequence consists of a header, the sequence data itself, and optional quality data. 62 data Sequence = Seq !SeqData !SeqData !(Maybe QualData) -- ^ header and actual sequence 63 deriving (Show,Eq) 64 65 -- | Convert a String to 'SeqData' 66 fromStr :: String -> SeqData 67 fromStr = B.pack 68 69 -- | Convert a 'SeqData' to a String 70 toStr :: SeqData -> String 71 toStr = B.unpack 72 73 -- | Read the character at the specified position in the sequence. 74 {-# INLINE (!) #-} 75 (!) :: Sequence -> Offset -> Char 76 (!) (Seq _ bs _) = B.index bs 77 78 {-# INLINE (?) #-} 79 (?) :: Sequence -> Offset -> Qual 80 (?) (Seq _ _ (Just qs)) = BB.index qs 81 (?) (Seq _ _ Nothing) = 82 error "Attempting to use 'seqqual' on sequence without associated quality data" 83 84 -- | Return sequence length. 85 seqlength :: Sequence -> Offset 86 seqlength (Seq _ bs _) = B.length bs 87 88 -- | Return sequence label (first word of header) 89 seqlabel :: Sequence -> SeqData 90 seqlabel (Seq l _ _) = head (B.words l) 91 92 -- | Return full header. 93 seqheader :: Sequence -> SeqData 94 seqheader (Seq l _ _) = l 95 96 -- | Return the sequence data. 97 seqdata :: Sequence -> SeqData 98 seqdata (Seq _ d _) = d 99 100 -- | Return the quality data, or error if none exist. Use hasqual if in doubt. 101 seqqual :: Sequence -> QualData 102 seqqual (Seq _ _ (Just q)) = q 103 seqqual (Seq _ _ Nothing) = 104 error "Attempting to use 'seqqual' on sequence without associated quality data" 105 106 -- | Check whether the sequence has associated quality data. 107 hasqual :: Sequence -> Bool 108 hasqual (Seq _ _ q) = isJust q 109 110 -- | Modify the header by appending text, or by replacing 111 -- all but the sequence label (i.e. first word). 112 appendHeader, setHeader :: Sequence -> String -> Sequence 113 appendHeader (Seq h d q) t = (Seq (B.append h (B.pack (" "++t))) d q) 114 setHeader (Seq h d q) t = (Seq (B.append (head $ B.words h) (B.pack (" "++t))) d q) 115 116 ------------------------------------------------------------ 117 -- Nucleotide (DNA, RNA) specific stuff 118 ------------------------------------------------------------ 119 120 -- | Calculate the reverse complement. 121 -- This is only relevant for the nucleotide alphabet, 122 -- and it leaves other characters unmodified. 123 revcompl :: Sequence -> Sequence 124 revcompl (Seq l bs qs) = Seq l (B.reverse $ B.map compl bs) 125 (maybe Nothing (Just . BB.reverse) qs) 126 127 -- | Complement a single character. I.e. identify the nucleotide it 128 -- can hybridize with. Note that for multiple nucleotides, you usually 129 -- want the reverse complement (see 'revcompl' for that). 130 compl :: Char -> Char 131 compl 'A' = 'T' 132 compl 'T' = 'A' 133 compl 'C' = 'G' 134 compl 'G' = 'C' 135 compl 'a' = 't' 136 compl 't' = 'a' 137 compl 'c' = 'g' 138 compl 'g' = 'c' 139 compl x = x 140 141 ------------------------------------------------------------ 142 -- Amino acid (protein) stuff 143 ------------------------------------------------------------ 144 145 -- | Translate a nucleotide sequence into the corresponding protein 146 -- sequence. This works rather blindly, with no attempt to identify ORFs 147 -- or otherwise QA the result. 148 translate :: Sequence -> Offset -> [Amino] 149 translate s' o' = unfoldr triples (s',o') 150 where triples (s,o) = 151 if o > seqlength s - 3 then Nothing 152 else Just (trans1 (map (s!) [o,o+1,o+2]),(s,o+3)) 153 154 -- prop_trans s = tail (translate s 0) == translate s 3 155 156 trans1 :: String -> Amino 157 trans1 = maybe Xaa id . flip lookup trans_tbl . map (repUT . toUpper) 158 where repUT x = if x == 'U' then 'T' else x -- RNA uses U for T 159 160 data Amino = Ala | Arg | Asn | Asp | Cys | Gln | Glu | Gly 161 | His | Ile | Leu | Lys | Met | Phe | Pro | Ser 162 | Thr | Tyr | Trp | Val 163 | STP | Asx | Glx | Xle | Xaa -- unknowns 164 deriving (Show,Eq) 165 166 trans_tbl :: [(String,Amino)] 167 trans_tbl = [("GCT",Ala),("GCC",Ala),("GCA",Ala),("GCG",Ala), 168 ("CGT",Arg),("CGC",Arg),("CGA",Arg),("CGG",Arg),("AGA",Arg),("AGG",Arg), 169 ("AAT",Asn),("AAC",Asn), 170 ("GAT",Asp),("GAC",Asp), 171 -- ("RAT",Asx),("RAC",Asx), -- IUPAC: R is purine (A or G) 172 ("TGT",Cys),("TGC",Cys), 173 ("CAA",Gln),("CAG",Gln), 174 ("GAA",Glu),("GAG",Glu), 175 -- ("SAA",Glx),("SAG",Glx), -- IUPAC: S is C or G 176 ("GGT",Gly),("GGC",Gly),("GGA",Gly),("GGG",Gly), 177 ("CAT",His),("CAC",His), 178 ("ATT",Ile),("ATC",Ile),("ATA",Ile), 179 ("TTA",Leu),("TTG",Leu),("CTT",Leu),("CTC",Leu),("CTA",Leu),("CTG",Leu), 180 ("AAA",Lys),("AAG",Lys), 181 ("ATG",Met), 182 ("TTT",Phe),("TTC",Phe), 183 ("CCT",Pro),("CCC",Pro),("CCA",Pro),("CCG",Pro), 184 ("TCT",Ser),("TCC",Ser),("TCA",Ser),("TCG",Ser),("AGT",Ser),("AGC",Ser), 185 ("ACT",Thr),("ACC",Thr),("ACA",Thr),("ACG",Thr), 186 ("TAT",Tyr),("TAC",Tyr), 187 ("TGG",Trp), 188 ("GTT",Val),("GTC",Val),("GTA",Val),("GTG",Val), 189 ("TAG",STP),("TGA",STP),("TAA",STP) 190 ] 191 -- todo: extend with more IUPAC nucleotide wildcards? 192 193 -- | Convert a list of amino acids to a sequence in IUPAC format. 194 toIUPAC :: [Amino] -> SeqData 195 toIUPAC = B.pack . map (fromJust . flip lookup iupac) 196 197 -- | Convert a sequence in IUPAC format to a list of amino acids. 198 fromIUPAC :: SeqData -> [Amino] 199 fromIUPAC = map (maybe Xaa id . flip lookup iupac') . B.unpack 200 201 iupac :: [(Amino,Char)] 202 iupac = [(Ala,'A') ,(Arg,'R') ,(Asn,'N') 203 ,(Asp,'D') ,(Cys,'C') ,(Gln,'Q') 204 ,(Glu,'E') ,(Gly,'G') ,(His,'H') 205 ,(Ile,'I') ,(Leu,'L') ,(Lys,'K') 206 ,(Met,'M') ,(Phe,'F') ,(Pro,'P') 207 ,(Ser,'S') ,(Thr,'T') ,(Tyr,'Y') 208 ,(Trp,'W') ,(Val,'V') 209 ,(Asx,'B') -- Asn or Asp 210 ,(Glx,'Z') -- Gln or Glu 211 ,(Xle,'J') -- Ile or Leu 212 ,(STP,'X') ,(Xaa,'X') 213 ] 214 iupac' :: [(Char,Amino)] 215 iupac' = map (\(a,b)->(b,a)) iupac