
Extract sequences matching a list of labels

\begin{code}

module Main where

import Data.Set hiding (null,filter,partition)
import System.Environment (getArgs)
import Data.List (foldl',partition)
import System.IO

usage :: String
usage = "xcerpt [-r|-f] [<labels>] <input>\n\n" ++
        "where <labels> is the name of a file containing the IDs\n" ++
        "of sequences to extract from the <input> file.\n" ++
        "The default is to output matching sequences, -r outputs the non-matching sequences" ++
	"With -f, the result ends up in xcerpt.match and xcerpt.rest instead of standard output."

main :: IO ()
main =  do
	(opts,args) <- return . partition (\a->not (null a) && head a=='-') =<< getArgs
	let dict = case length args of 
                  1 -> hGetContents stdin
                  2 -> readFile (args!!0)
	          _ -> error usage
        d <- return . mkdict =<< dict
        (match_out,rest_out) <- case opts of 
                                  []     -> return (Just stdout,Nothing)
                                  ["-r"] -> return (Nothing,Just stdout)
                                  ["-f"] -> do m <- openFile "xcerpt.match" WriteMode 
                                               r <- openFile "xcerpt.rest" WriteMode
                                               return (Just m, Just r)
                                  _      -> error usage
        xcerpt d match_out rest_out (last args)
        maybe (return ()) hClose match_out
        maybe (return ()) hClose rest_out

mkdict :: String -> Set String
mkdict = foldl' (flip insert) empty . words

xcerpt :: Set String -> Maybe Handle -> Maybe Handle -> String -> IO ()
xcerpt dict m r input = do
		    i <- readFile input
		    xtr dict m r $ filter (\l->(not.null) l && head l /= '#') 
			    $ lines i

xtr _ m r [] = return ()
xtr _ _ _ [x] = error ("Odd number of lines?\n"++x)
xtr d m r (l1:ls) = if head l1 == '>' then
		       let f = if (drop 1 $ head $ words l1) `member` d
			       then m else r
			   in do
                              let (sequence,rest) = break ((=='>').head) ls
			      case f of Just x -> hPutStr x $ unlines (l1:sequence)
                                        Nothing -> return ()
			      xtr d m r rest
		       else error ("Not a FASTA header:\n"++l1)

\end{code}
