hunk ./flower.cabal 47 -Executable frecover - Main-Is: FRecover.hs - Hs-Source-Dirs: src - Build-Depends: bio >= 0.4.9, base >= 3 && < 5 - Ghc-Options: -Wall - -Executable frename - Main-Is: FRename.hs - Hs-Source-Dirs: src - Build-Depends: bio >= 0.4.9, base >= 3 && < 5, bytestring >= 0.9.1 - Ghc-Options: -Wall - hunk ./src/FRecover.hs 1 - -module Main where -import Bio.Sequence.SFF -import System.Environment (getArgs) - -main :: IO () -main = mapM_ recoverFile =<< getArgs - where recoverFile f = writeSFF (f++"_recovered") =<< recoverSFF f - rmfile ./src/FRecover.hs hunk ./src/FRename.hs 1 - -{-| - Rename reads in .SFF files to avoid name clashes. - Apparently, reads with the same name crashes Newbler, and is - in any case a bad idea. This ensures uniqueness by appending a serial number to each read name in a set of files. --} - -module Main where - -import Bio.Sequence.SFF -import System.Environment (getArgs) -import qualified Data.ByteString.Char8 as B - -main :: IO () -main = do - fs <- getArgs - if null fs then putStrLn "Usage: frename file1.sff [file2.sff ...]" - else renameSFFs fs - -renameSFFs :: [FilePath] -> IO () -renameSFFs = go 0 - where go _ [] = return () - go current (f:fs) = do - (SFF h rs) <- readSFF f - writeSFF ("r_"++f) (SFF h $ renameFrom current rs) - go (current+num_reads h) fs - renameFrom i rs = zipWith update [i..] rs - where update j r = let h = read_header r - rn = B.concat [read_name h, B.pack "_", B.pack (show j)] - in r { read_header = h { name_length = fromIntegral $ B.length rn, read_name = rn }} rmfile ./src/FRename.hs hunk ./flower.cabal 2 -Version: 0.7 +Version: 0.7.1 hunk ./flower.cabal 17 - The fselect executable extracts reads from SFF-files, generating a new - SFF-file with a subset of the reads based on various criteria. - . - Sometimes SFF files will appear to be corrupted, with all-zero blocks in the - file. The frecover program ignores these and tries to resync with the file after an - invalid region. 
This was likely a one-time bug in the 454 software, so this program
- is probably not so useful any more.
-
+ The flowselect executable extracts reads from SFF-files, generating a new
+ SFF-file with a subset of the reads based on various quality criteria.
+ .
+ The flowt program removes (artificial) duplicates from SFF files. It's currently a work
+ in progress, but included if you'd like to play with it. It's faster than other approaches
+ (e.g. CD-HIT), and ought to be more sensitive and specific, but this needs to be proven.
hunk ./src/Print.hs 21
- (a,r) -> putInt a `append` putInt3 r
+ (a,r) -> putInt a `append` putInt3_0 r
hunk ./src/Print.hs 33
-ints, int2s, int3s :: Array Int ByteString
+putInt3_0 :: Int -> Builder -- emits the three-character, zero-padded form of x; "xxx" when x is outside 0..999
+putInt3_0 x | x >= 0 && x < 1000 = fromByteString (int3s0!x) -- check both bounds: (!) fails on an index outside (0,999)
+            | otherwise = fromByteString (pack "xxx")
+
+ints, int2s, int3s, int3s0 :: Array Int ByteString
hunk ./src/Print.hs 41
+int3s0 = listArray (0,999) (map (pack . ('0':) . ('0':) . show) [0..9::Int] ++ map (pack . ('0':) . show ) [10..99::Int] ++ map (pack . show) [100..999::Int]) -- table of 0..999 rendered as "000".."999"