-- split bam into matching and unmatching

module Main where

import Bio.SamTools.Bam
import System.Environment (getArgs)
import Data.List (isPrefixOf)
import System.IO
import System.IO.Unsafe
import qualified Data.ByteString.Char8 as BS

-- unordered-containers
import qualified Data.HashMap.Strict as M
import Data.IORef

-- ugly with global data, but... used in decide
cache_l, cache_r :: IORef (M.HashMap BS.ByteString (BS.ByteString,BS.ByteString,BS.ByteString))
cache_l = unsafePerformIO $ newIORef M.empty
cache_r = unsafePerformIO $ newIORef M.empty

main :: IO ()
main = do
  [f] <- getArgs
  let (o1,o2) = outnames f
  -- fixme: use withFile instead
  h1 <- openFile o1 WriteMode 
  h2 <- openFile o2 WriteMode 
  withBamInFile f (splitBam $ decide h1 h2)
  hFlush h1
  hFlush h2
  hClose h1
  hClose h2

-- Heavy lifting here.
splitBam :: (Bam1 -> IO ()) -> InHandle -> IO ()
splitBam out h = do
  b <- get1 h
  case b of Nothing -> return ()
            Just b' -> do
              out b'
              splitBam out h

extract :: Bam1 -> (BS.ByteString,(BS.ByteString,BS.ByteString,BS.ByteString))
extract b = ( queryName b
            , ( maybe (error "no sequence data") id $ {-# SCC "extract/querySeq" #-} querySeq b
              , maybe (error "no quality data") id $ {-# SCC "extract/queryQual" #-} queryQual b
              , if isRead1 b then BS.pack "/1" else if isRead2 b then BS.pack "/2" else BS.empty
            ))

decide :: Handle -> Handle -> Bam1 -> IO ()
decide left right b | isUnmap b && isMateUnmap b = return () -- discard
                    | isSecondary b              = return ()
                    | otherwise = let (qn,stuff) = extract b
                                  in if isRead1 b then do
                                       mr <- M.lookup qn `fmap` readIORef cache_r
                                       case mr of Nothing -> do
                                                    modifyIORef cache_l (M.insert qn stuff)
                                                  Just r -> do
                                                    modifyIORef cache_r (M.delete qn)
                                                    hPutFq left qn stuff
                                                    hPutFq right qn r
                                     else if isRead2 b then do
                                       ml <- M.lookup qn `fmap` readIORef cache_l
                                       case ml of Nothing -> do
                                                    modifyIORef cache_r (M.insert qn stuff)
                                                  Just l -> do
                                                    modifyIORef cache_l (M.delete qn)
                                                    hPutFq left qn l
                                                    hPutFq right qn stuff
                                     else err b
                                     
err :: Bam1 -> IO ()
err b = hPutStrLn stderr ("couldn't classify read: "++ BS.unpack (queryName b))

hPutFq :: Handle -> BS.ByteString -> (BS.ByteString,BS.ByteString,BS.ByteString) -> IO ()
hPutFq h qn (qd,qq,end) = mapM_ (BS.hPut h) [at,qn,end,nl
                          ,qd,nl
                          ,plus,nl
                          ,qq,nl
                          ]
  where nl = BS.pack "\n"
        at = BS.pack "@"
        plus = BS.pack "+"
        -- gt = BS.pack ">"

outnames :: [Char] -> ([Char], [Char])
outnames fn = (basename fn ++ ".1.txt", basename fn ++ ".2.txt")
  where basename = reverse . takeWhile (/='/') . dropPrefix (reverse ".bam") . reverse
        dropPrefix pf str = if pf `isPrefixOf` str 
                            then drop (length pf) str else str
