
RSelect - randomly select a number of sequences from a FASTA file

Usage: rselect m < file.seq  # prints m random sequences to stdout
       rselect m n < file.seq # print m of the first n seq.s (faster)

\begin{code}

module Main where

import Data.Set hiding (partition)
import System.Environment (getArgs)
import System.Random
import System.IO
import Data.List (isPrefixOf,partition)

import Bio.Sequence.Fasta (countSeqs)
import Bio.Sequence

import Unigene

main :: IO ()
main = do
  (opts,as) <- partition (isPrefixOf "-") `fmap` getArgs
  case as of [m,f] ->   select opts (read m) Nothing f
             [m,n,f] -> select opts (read m) (Just (read n)) f
             _ -> error "Usage: rselect [-r] n [m] input.seq > output.seq"

-- | Select m out of maybe n sequences from f
--   Not specifying a maximum increases memory consumption
select :: [String] -> Integer -> Maybe Integer -> FilePath -> IO ()
select os m mn f = do
  seed <- newStdGen
  hPutStrLn stderr ("Random seed: "++show seed)
  ss <- ugRead f
  n <- case mn of Nothing -> do hPutStrLn stderr "Counting sequences..."
                                n' <- countSeqs f
                                return (fromIntegral n')
                  Just n' -> return n'

  rds <- randoms `fmap` newStdGen  :: IO [Bool]
  let randomize_dirs = if "-r" `elem` os then zipWith maybe_reverse rds else id

      ds = if m>n then error "rselect: m is larger than n, that won't work."
           else if m==n then take (fromIntegral n) $ repeat 1
           else if m<=n `div` 2 then diffs $ pick m $ randomRs (1,n) seed
           else diffs $ invert n 1 $ pick (n-m) $ randomRs (1,n) seed
      out = Prelude.filter (not.Prelude.null) $ Prelude.map randomize_dirs $ choose ds ss
  n `seq` hPutStrLn stderr ("N = "++show n++", M = "++show m)
  mapM_ (\o -> putStrLn "# cluster delimiter" >> hWriteFasta stdout o)  out

maybe_reverse :: Bool -> Sequence -> Sequence
maybe_reverse b = if b then revcompl else id

-- | choose elements according to a list of deltas
choose :: Show a => [Integer] -> [[a]] -> [[a]]
choose = choose' []

choose' :: Show a => [a] -> [Integer] -> [[a]] -> [[a]]
choose' acc [] _ = [reverse acc]
choose' acc (0:ds) xss = choose' acc ds xss
choose' acc (d:ds) (xs:xss) = let ys = drop (fromIntegral d-1) xs
                             in if not (Prelude.null ys) then choose' (head ys:acc) ds (tail ys:xss)
                                else reverse acc : choose' [] ((d-fromIntegral (length xs)):ds) xss
choose' _ _ [] = error "rselect: cannot choose from empty list.\nPerhaps you specified 'n' too large?"

-- | convert indices to deltas
diffs :: [Integer] -> [Integer]
diffs xs = diffs' (0:xs)
    where
      diffs' (a:b:cs) = if b<=a then error ("diffs needs a monotonically increasing sequence"++show (take 10 (a:b:cs)))
                        else (b-a) : diffs' (b:cs)
      diffs' [_] = []
      diffs' [] = []

-- | convert a list to the list of missing values
invert :: Integer -> Integer -> [Integer] -> [Integer]
invert n i (a:cs) = [i..a-1] ++ invert n (a+1) cs
invert n i [] = [i..n]


-- | pick m unique integers
pick :: Integer -> [Integer] -> [Integer]
pick = pick' empty
    where pick' tmp 0 _ = elems tmp
          pick' tmp m (i:is) = if member i tmp then pick' tmp m is
                               else pick' (insert i tmp) (m-1) is

\end{code}
