1 {- | 
    2    Module: Bio.Sequence.Phd
    3 
    4    Parse phd files (phred base calling output).
    5 -}
    6 
    7 module Bio.Sequence.Phd (readPhd,hReadPhd) where
    8 
    9 import Bio.Sequence.SeqData
   10 
   11 import qualified Data.ByteString.Lazy.Char8 as B
   12 import qualified Data.ByteString.Lazy as BB
   13 import qualified Data.ByteString as BBB  -- this is getting silly!
   14 
   15 import System.IO
   16 
   17 -- | Parse a .phd file, extracting the contents as a Sequence
   18 readPhd :: FilePath -> IO Sequence
   19 readPhd f = return . mkPhd =<< B.readFile f
   20 
   21 -- | Parse .phd contents from a handle
   22 hReadPhd :: Handle -> IO Sequence
   23 hReadPhd h = return . mkPhd =<< B.hGetContents h
   24 
   25 -- | The actual phd parser.
   26 
   27 --  aesthetics is not a major design goal...
   28 --  but error checking really should have been.  Sigh.
   29 mkPhd :: B.ByteString -> Sequence
   30 mkPhd inp = 
   31   let (hd:fs) = filter (not . B.null) . B.lines $ inp
   32       (comment,sd) = break (==B.pack "BEGIN_DNA") fs
   33       label = B.drop 15 hd
   34       fields = B.words . B.unlines
   35                . filter (not . isSubstr (B.pack "_COMMENT")) $ comment
   36       sdata = filter ((==3).length) . map B.words $ sd
   37       err = error "failed to parse quality value"
   38       qual = BB.fromChunks [BBB.pack . map (maybe err (fromIntegral . fst) . B.readInt . (!!1)) $ sdata]
   39   in qual `seq` (Seq (compact $ B.unwords (label:fields)) 
   40       (compact $ B.concat $ map head sdata)
   41       (Just qual))
   42 --   Todo: check that we start with "BEGIN_SEQUENCE", and that we
   43 --   have a BEGIN_DNA/END_DNA region there.
   44 
   45 isSubstr :: B.ByteString -> B.ByteString -> Bool
   46 isSubstr s = any (B.isPrefixOf s) . B.tails
   47 
   48 -- | Pack bytestring segments into a single bytestring
   49 --   Allows the (rest of the) file contents to be GC'ed
   50 compact :: B.ByteString -> B.ByteString
   51 compact = B.fromChunks . return . BBB.concat . B.toChunks