1 {- |
    2    Module: Bio.Sequence.Fasta
    3 
    4    This module incorporates functionality for reading and writing
    5    sequence data in the Fasta format.
    6    Each sequence consists of a header (with a '>' prefix)
    7    and a set of lines containing the sequence data.
    8 -}
    9 
   10 module Bio.Sequence.Fasta
   11     (
   12     -- * Reading and writing plain FASTA files
   13     readFasta, writeFasta, hReadFasta, hWriteFasta
   14     -- * Reading and writing quality files
   15     , readQual, writeQual, hWriteQual
   16     -- * Combining FASTA and quality files
   17     , readFastaQual, hWriteFastaQual, writeFastaQual
   18     -- * Counting sequences in a FASTA file
   19     , countSeqs
   20     -- * Helper function for reading your own sequences
   21     , mkSeqs
   22 ) where
   23 
   24 
   25 -- import Data.Char (isSpace)
   26 import Data.List (groupBy,intersperse)
   27 import Data.Int
   28 import Data.Maybe
   29 import System.IO
   30 import System.IO.Unsafe
   31 import Control.Monad
   32 
   33 import qualified Data.ByteString.Char8 as BS
   34 import qualified Data.ByteString.Lazy.Char8 as B
   35 import qualified Data.ByteString.Lazy as BB
   36 import Data.ByteString.Lazy.Char8 (ByteString)
   37 
   38 import Bio.Sequence.SeqData
   39 
   40 splitsAt :: Offset -> ByteString -> [ByteString]
   41 splitsAt n s = let (s1,s2) = B.splitAt n s
   42                in if B.null s2 then [s1] else s1 : splitsAt n s2
   43 
   44 {-
   45 -- ugly?
   46 class SeqType sd where
   47     toSeq :: sd -> sd -> Sequence
   48     fromSeq :: Sequence -> (sd,sd)
   49 
   50 instance SeqType B.ByteString where
   51     toSeq = Seq
   52     fromSeq (Seq x y) = (x,y)
   53 
   54 instance SeqType BS.ByteString where
   55     toSeq h s = Seq (B.fromChunks [h]) (B.fromChunks [s])
   56     fromSeq (Seq x y) = (c x, c y) where c = BS.concat . B.toChunks
   57 -}
   58 
   59 -- | Lazily read sequences from a FASTA-formatted file
   60 readFasta :: FilePath -> IO [Sequence]
   61 readFasta f = do
   62   ls <- getLines f
   63   return (mkSeqs ls)
   64 
   65 -- | Write sequences to a FASTA-formatted file.
   66 --   Line length is 60.
   67 writeFasta :: FilePath -> [Sequence] -> IO ()
   68 writeFasta f ss = do
   69   h <- openFile f WriteMode
   70   hWriteFasta h ss
   71   hClose h
   72 
   73 -- | Read quality data for sequences to a file.
   74 readQual :: FilePath -> IO [Sequence]
   75 readQual f = do
   76   ls <- getLines f
   77   return (mkQual ls)
   78 
   79 -- | Write quality data for sequences to a file.
   80 writeQual :: FilePath -> [Sequence] -> IO ()
   81 writeQual f ss = do
   82   h <- openFile f WriteMode
   83   hWriteQual h ss
   84   hClose h
   85 
   86 -- | Read sequence and associated quality.  Will error if
   87 --   the sequences and qualites do not match one-to-one in sequence.
   88 readFastaQual :: FilePath -> FilePath -> IO [Sequence]
   89 readFastaQual  s q = do
   90   ss <- readFasta s
   91   qs <- readQual q
   92   return ss
   93   -- warning: assumes correct qual file!
   94   let mkseq s1@(Seq x y _) s2@(Seq _ _ (Just z))
   95           | seqlabel s1 /= seqlabel s2 = error ("mismatching sequence and quality: "
   96                                                 ++toStr (seqlabel s1)++","++toStr (seqlabel s2))
   97           | B.length y /= B.length z   = error ("mismatching sequence and quality lengths:"
   98                                                 ++ toStr (seqlabel s1)++","++show (B.length y,B.length z))
   99           | otherwise   = Seq x y (Just z)
  100       mkseq _ _ = error "readFastaQual: could not combine Fasta and Qual information"
  101   return $ zipWith mkseq ss qs
  102 
  103 -- | Write sequence and quality data simulatnously
  104 --   This may be more laziness-friendly.
  105 writeFastaQual :: FilePath -> FilePath -> [Sequence] -> IO ()
  106 writeFastaQual f q ss = do
  107   hf <- openFile f WriteMode
  108   hq <- openFile q WriteMode
  109   hWriteFastaQual hf hq ss
  110   hClose hq
  111   hClose hf
  112 
  113 hWriteFastaQual :: Handle -> Handle -> [Sequence] -> IO ()
  114 hWriteFastaQual hf hq = mapM_ wFQ
  115     where wFQ s = wFasta hf s >> wQual hq s
  116 
  117 -- | Lazily read sequence from handle
  118 hReadFasta :: Handle -> IO [Sequence]
  119 hReadFasta h = do
  120   ls <- hGetLines h
  121   return (mkSeqs ls)
  122 
  123 -- | Write sequences in FASTA format to a handle.
  124 hWriteFasta :: Handle -> [Sequence] -> IO ()
  125 hWriteFasta h = mapM_ (wFasta h)
  126 
  127 wHead :: Handle -> SeqData -> IO ()
  128 wHead h l = do
  129    B.hPut h $ B.pack ">"
  130    B.hPut h l
  131    B.hPut h $ B.pack "\n"
  132 
  133 wFasta :: Handle -> Sequence -> IO ()
  134 wFasta h (Seq l d _) = do
  135   wHead h l
  136   let ls = splitsAt 60 d
  137   mapM_ (B.hPut h) $ intersperse (B.pack "\n") ls
  138   B.hPut h $ B.pack "\n"
  139 
  140 hWriteQual :: Handle -> [Sequence] -> IO ()
  141 hWriteQual h = mapM_ (wQual h)
  142 
  143 wQual :: Handle -> Sequence -> IO ()
  144 wQual h (Seq l _ (Just q)) = do
  145   wHead h l
  146   let qls = splitsAt 20 q
  147       qs = B.pack . unwords . map show . BB.unpack
  148   mapM_ ((\l' -> B.hPut h l' >> B.hPut h (B.pack "\n")) . qs) qls
  149 wQual _ (Seq _ _ Nothing) = return ()
  150 
  151 -- ByteString operations
  152 -- These aren't (or possible weren't) provided by the FPS library.
  153 -- Implement line-based IO (in retrospect it'd be simpler
  154 -- and better to just use 'lines'.)
  155 
  156 -- lazily read lines from file
  157 getLines :: FilePath -> IO [ByteString]
  158 getLines f = do
  159   h <- openFile f ReadMode
  160   hGetLines' (hClose h) h
  161 
  162 -- lazily read lines from handle
  163 hGetLines :: Handle -> IO [ByteString]
  164 hGetLines = hGetLines' (return ())
  165 
  166 -- add an optional handle-closing parameter
  167 hGetLines' :: IO () -> Handle -> IO [ByteString]
  168 hGetLines' c h = do
  169   e <- hIsEOF h
  170   if e then c >> return []
  171        else do l' <- BS.hGetLine h
  172                let l = B.fromChunks $ if BS.null l' then [] else  [l']
  173                ls <- unsafeInterleaveIO $ hGetLines' c h
  174                return (l:ls)
  175 
  176 -- | Convert a list of FASTA-formatted lines into a list of sequences.
  177 --   Blank lines are ignored.
  178 --   Comment lines start with "#" are allowed between sequences (and ignored).
  179 --   Lines starting with ">" initiate a new sequence.
  180 mkSeqs :: [ByteString] -> [Sequence]
  181 mkSeqs = map mkSeq . blocks
  182 
  183 mkSeq :: [ByteString] -> Sequence
  184 mkSeq (l:ls) = Seq (B.drop 1 l) (B.concat $ takeWhile isSeq ls) Nothing
  185     where isSeq s = (not . B.null) s && ((flip elem) (['A'..'Z']++['a'..'z']) . B.head) s
  186 mkSeq [] = error "empty input to mkSeq"
  187 
  188 mkQual :: [ByteString] -> [Sequence]
  189 mkQual = map f . blocks
  190     where f (l:ls) = Seq (B.drop 1 l) B.empty
  191                      (Just $ BB.pack (map int (B.words $ B.unlines ls)))
  192           f [] = error "mkQual: empty quality data"
  193           int = fromIntegral . fst . fromJust' . B.readInt
  194           fromJust' = maybe (error "Error in qual format") id
  195 
  196 -- | Split lines into blocks starting with '>' characters
  197 --   Filter out # comments (but not semicolons?)
  198 blocks :: [ByteString] -> [[ByteString]]
  199 blocks = groupBy (const (('>' /=) . B.head)) . filter ((/='#') . B.head) . filter (not . B.null)
  200 
  201 countSeqs :: FilePath -> IO Int
  202 countSeqs f = do
  203   ss <- B.readFile f
  204   let hdrs = filter (('>'==).B.head) $ filter (not . B.null) $ B.lines ss
  205   return (length hdrs)
  206