1 {- | 2 Module: Bio.Sequence.Fasta 3 4 This module incorporates functionality for reading and writing 5 sequence data in the Fasta format. 6 Each sequence consists of a header (with a '>' prefix) 7 and a set of lines containing the sequence data. 8 -} 9 10 module Bio.Sequence.Fasta 11 ( 12 -- * Reading and writing plain FASTA files 13 readFasta, writeFasta, hReadFasta, hWriteFasta 14 -- * Reading and writing quality files 15 , readQual, writeQual, hWriteQual 16 -- * Combining FASTA and quality files 17 , readFastaQual, hWriteFastaQual, writeFastaQual 18 -- * Counting sequences in a FASTA file 19 , countSeqs 20 -- * Helper function for reading your own sequences 21 , mkSeqs 22 ) where 23 24 25 -- import Data.Char (isSpace) 26 import Data.List (groupBy,intersperse) 27 import Data.Int 28 import Data.Maybe 29 import System.IO 30 import System.IO.Unsafe 31 import Control.Monad 32 33 import qualified Data.ByteString.Char8 as BS 34 import qualified Data.ByteString.Lazy.Char8 as B 35 import qualified Data.ByteString.Lazy as BB 36 import Data.ByteString.Lazy.Char8 (ByteString) 37 38 import Bio.Sequence.SeqData 39 40 splitsAt :: Offset -> ByteString -> [ByteString] 41 splitsAt n s = let (s1,s2) = B.splitAt n s 42 in if B.null s2 then [s1] else s1 : splitsAt n s2 43 44 {- 45 -- ugly? 46 class SeqType sd where 47 toSeq :: sd -> sd -> Sequence 48 fromSeq :: Sequence -> (sd,sd) 49 50 instance SeqType B.ByteString where 51 toSeq = Seq 52 fromSeq (Seq x y) = (x,y) 53 54 instance SeqType BS.ByteString where 55 toSeq h s = Seq (B.fromChunks [h]) (B.fromChunks [s]) 56 fromSeq (Seq x y) = (c x, c y) where c = BS.concat . B.toChunks 57 -} 58 59 -- | Lazily read sequences from a FASTA-formatted file 60 readFasta :: FilePath -> IO [Sequence] 61 readFasta f = do 62 ls <- getLines f 63 return (mkSeqs ls) 64 65 -- | Write sequences to a FASTA-formatted file. 66 -- Line length is 60. 67 writeFasta :: FilePath -> [Sequence] -> IO () 68 writeFasta f ss = do 69 h <- openFile f WriteMode 70 hWriteFasta h ss 71 hClose h 72 73 -- | Read quality data for sequences to a file. 74 readQual :: FilePath -> IO [Sequence] 75 readQual f = do 76 ls <- getLines f 77 return (mkQual ls) 78 79 -- | Write quality data for sequences to a file. 80 writeQual :: FilePath -> [Sequence] -> IO () 81 writeQual f ss = do 82 h <- openFile f WriteMode 83 hWriteQual h ss 84 hClose h 85 86 -- | Read sequence and associated quality. Will error if 87 -- the sequences and qualites do not match one-to-one in sequence. 88 readFastaQual :: FilePath -> FilePath -> IO [Sequence] 89 readFastaQual s q = do 90 ss <- readFasta s 91 qs <- readQual q 92 return ss 93 -- warning: assumes correct qual file! 94 let mkseq s1@(Seq x y _) s2@(Seq _ _ (Just z)) 95 | seqlabel s1 /= seqlabel s2 = error ("mismatching sequence and quality: " 96 ++toStr (seqlabel s1)++","++toStr (seqlabel s2)) 97 | B.length y /= B.length z = error ("mismatching sequence and quality lengths:" 98 ++ toStr (seqlabel s1)++","++show (B.length y,B.length z)) 99 | otherwise = Seq x y (Just z) 100 mkseq _ _ = error "readFastaQual: could not combine Fasta and Qual information" 101 return $ zipWith mkseq ss qs 102 103 -- | Write sequence and quality data simulatnously 104 -- This may be more laziness-friendly. 105 writeFastaQual :: FilePath -> FilePath -> [Sequence] -> IO () 106 writeFastaQual f q ss = do 107 hf <- openFile f WriteMode 108 hq <- openFile q WriteMode 109 hWriteFastaQual hf hq ss 110 hClose hq 111 hClose hf 112 113 hWriteFastaQual :: Handle -> Handle -> [Sequence] -> IO () 114 hWriteFastaQual hf hq = mapM_ wFQ 115 where wFQ s = wFasta hf s >> wQual hq s 116 117 -- | Lazily read sequence from handle 118 hReadFasta :: Handle -> IO [Sequence] 119 hReadFasta h = do 120 ls <- hGetLines h 121 return (mkSeqs ls) 122 123 -- | Write sequences in FASTA format to a handle. 124 hWriteFasta :: Handle -> [Sequence] -> IO () 125 hWriteFasta h = mapM_ (wFasta h) 126 127 wHead :: Handle -> SeqData -> IO () 128 wHead h l = do 129 B.hPut h $ B.pack ">" 130 B.hPut h l 131 B.hPut h $ B.pack "\n" 132 133 wFasta :: Handle -> Sequence -> IO () 134 wFasta h (Seq l d _) = do 135 wHead h l 136 let ls = splitsAt 60 d 137 mapM_ (B.hPut h) $ intersperse (B.pack "\n") ls 138 B.hPut h $ B.pack "\n" 139 140 hWriteQual :: Handle -> [Sequence] -> IO () 141 hWriteQual h = mapM_ (wQual h) 142 143 wQual :: Handle -> Sequence -> IO () 144 wQual h (Seq l _ (Just q)) = do 145 wHead h l 146 let qls = splitsAt 20 q 147 qs = B.pack . unwords . map show . BB.unpack 148 mapM_ ((\l' -> B.hPut h l' >> B.hPut h (B.pack "\n")) . qs) qls 149 wQual _ (Seq _ _ Nothing) = return () 150 151 -- ByteString operations 152 -- These aren't (or possible weren't) provided by the FPS library. 153 -- Implement line-based IO (in retrospect it'd be simpler 154 -- and better to just use 'lines'.) 155 156 -- lazily read lines from file 157 getLines :: FilePath -> IO [ByteString] 158 getLines f = do 159 h <- openFile f ReadMode 160 hGetLines' (hClose h) h 161 162 -- lazily read lines from handle 163 hGetLines :: Handle -> IO [ByteString] 164 hGetLines = hGetLines' (return ()) 165 166 -- add an optional handle-closing parameter 167 hGetLines' :: IO () -> Handle -> IO [ByteString] 168 hGetLines' c h = do 169 e <- hIsEOF h 170 if e then c >> return [] 171 else do l' <- BS.hGetLine h 172 let l = B.fromChunks $ if BS.null l' then [] else [l'] 173 ls <- unsafeInterleaveIO $ hGetLines' c h 174 return (l:ls) 175 176 -- | Convert a list of FASTA-formatted lines into a list of sequences. 177 -- Blank lines are ignored. 178 -- Comment lines start with "#" are allowed between sequences (and ignored). 179 -- Lines starting with ">" initiate a new sequence. 180 mkSeqs :: [ByteString] -> [Sequence] 181 mkSeqs = map mkSeq . blocks 182 183 mkSeq :: [ByteString] -> Sequence 184 mkSeq (l:ls) = Seq (B.drop 1 l) (B.concat $ takeWhile isSeq ls) Nothing 185 where isSeq s = (not . B.null) s && ((flip elem) (['A'..'Z']++['a'..'z']) . B.head) s 186 mkSeq [] = error "empty input to mkSeq" 187 188 mkQual :: [ByteString] -> [Sequence] 189 mkQual = map f . blocks 190 where f (l:ls) = Seq (B.drop 1 l) B.empty 191 (Just $ BB.pack (map int (B.words $ B.unlines ls))) 192 f [] = error "mkQual: empty quality data" 193 int = fromIntegral . fst . fromJust' . B.readInt 194 fromJust' = maybe (error "Error in qual format") id 195 196 -- | Split lines into blocks starting with '>' characters 197 -- Filter out # comments (but not semicolons?) 198 blocks :: [ByteString] -> [[ByteString]] 199 blocks = groupBy (const (('>' /=) . B.head)) . filter ((/='#') . B.head) . filter (not . B.null) 200 201 countSeqs :: FilePath -> IO Int 202 countSeqs f = do 203 ss <- B.readFile f 204 let hdrs = filter (('>'==).B.head) $ filter (not . B.null) $ B.lines ss 205 return (length hdrs) 206