1 {- | 2 Module: Bio.Sequence.Phd 3 4 Parse phd files (phred base calling output). 5 -} 6 7 module Bio.Sequence.Phd (readPhd,hReadPhd) where 8 9 import Bio.Sequence.SeqData 10 11 import qualified Data.ByteString.Lazy.Char8 as B 12 import qualified Data.ByteString.Lazy as BB 13 import qualified Data.ByteString as BBB -- this is getting silly! 14 15 import System.IO 16 17 -- | Parse a .phd file, extracting the contents as a Sequence 18 readPhd :: FilePath -> IO Sequence 19 readPhd f = return . mkPhd =<< B.readFile f 20 21 -- | Parse .phd contents from a handle 22 hReadPhd :: Handle -> IO Sequence 23 hReadPhd h = return . mkPhd =<< B.hGetContents h 24 25 -- | The actual phd parser. 26 27 -- aesthetics is not a major design goal... 28 -- but error checking really should have been. Sigh. 29 mkPhd :: B.ByteString -> Sequence 30 mkPhd inp = 31 let (hd:fs) = filter (not . B.null) . B.lines $ inp 32 (comment,sd) = break (==B.pack "BEGIN_DNA") fs 33 label = B.drop 15 hd 34 fields = B.words . B.unlines 35 . filter (not . isSubstr (B.pack "_COMMENT")) $ comment 36 sdata = filter ((==3).length) . map B.words $ sd 37 err = error "failed to parse quality value" 38 qual = BB.fromChunks [BBB.pack . map (maybe err (fromIntegral . fst) . B.readInt . (!!1)) $ sdata] 39 in qual `seq` (Seq (compact $ B.unwords (label:fields)) 40 (compact $ B.concat $ map head sdata) 41 (Just qual)) 42 -- Todo: check that we start with "BEGIN_SEQUENCE", and that we 43 -- have a BEGIN_DNA/END_DNA region there. 44 45 isSubstr :: B.ByteString -> B.ByteString -> Bool 46 isSubstr s = any (B.isPrefixOf s) . B.tails 47 48 -- | Pack bytestring segments into a single bytestring 49 -- Allows the (rest of the) file contents to be GC'ed 50 compact :: B.ByteString -> B.ByteString 51 compact = B.fromChunks . return . BBB.concat . B.toChunks