{-# LANGUAGE DeriveDataTypeable #-}

module Main where

import Bio.Sequence.SFF hiding (trim)
import Bio.Util (countIO)
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as BC

import qualified Data.IntMap as M
import Data.IntMap (IntMap)

-- import Bloom
import qualified Data.IntSet as S
import Data.IntSet (IntSet)
import Text.Printf

import System.IO
import Control.Monad (when)

import System.Console.CmdArgs

type FingerPrints = IntSet
type DupMap = IntMap [ReadBlock]
  
data ResultList = Then ReadBlock ResultList | EndWith DupMap

splitRes :: ResultList -> ([ReadBlock], DupMap)
splitRes (Then x rs) = let (ys,e) = splitRes rs in (x:ys,e)
splitRes (EndWith e) = ([],e)

trim :: ReadBlock -> ReadBlock
trim = id -- trimFromTo 4 10000 <- this trims to last base called position! (use trimKey?)

data Options = O { thresh :: Double
                 , fplen  :: Int
                 , summarize :: FilePath
                 , clusters  :: Bool
                 , input  :: [FilePath]
                 } deriving (Data,Typeable,Show,Eq)

mymode :: Options
mymode = O { thresh = 50  &= help "similarity threshold"
           , fplen  = 20  &= help "fingerprint size"
           , summarize = def &= {- empty "-" &= -} help "output cluster summary"
           , clusters  = True &= help "output complete clusters"
           , input  = def  &= args &= typFile }
         &= program "flowt"
         &= summary "flowt v0.7 - filter out reads from duplicate clones in 454 sequencing."

vlog :: Bool -> String -> IO ()
vlog v s = when v (hPutStr stderr s >> hFlush stderr)

main :: IO ()
main = do
  opts <- cmdArgs mymode
  -- putStrLn $ show opts
  verb <- isLoud
  vlog verb "Building the fingerprint index"
  let sff = case input opts of [x] -> x; _ -> error "You need to specify a (single) file name!\n(or use 'flowt -h' for help)."
  dups <- mkbf opts sff
  vlog verb (seq dups "...done!")
  SFF hs rs <- readSFF sff
  let (uqs,ds) = splitRes $ filter_unique opts dups rs

  writeSFF' "unique.sff" =<< SFF hs `fmap`
    (if verb then countIO "Output unique: " "..done!" 100 else return) uqs

  when (not . null . summarize $ opts) (gensum (summarize opts) ds)

  let cg = concatMap (clusterGroups opts) (M.elems ds)
  writeSFF' "duplic.sff" =<< SFF hs `fmap` 
    (if verb then countIO "Output cluster reps: " "..done!" 100 else return)
    (map head cg)
  when (clusters opts) $ BC.writeFile "clusters.txt" $ 
    BC.unlines $ map (BC.unwords . map (read_name . read_header)) $ cg
  return ()

gensum :: String -> IntMap [ReadBlock] -> IO ()
gensum f ds = write $ unlines $ map showcluster $ M.assocs ds
  where write = if f == "-" then putStrLn else writeFile f
        showcluster (k,v) = let a = averageflow v 
                            in printf "%16x" k++":\t" ++ show (length v) ++ unwords (map (BC.unpack . read_name . read_header) v) ++ "\n" ++ 
                               unlines [let fs = flowgram (trim x) in printf "%6.1f " (dist a $ map fromIntegral fs) ++ concatMap (printf "%3d ") fs | x <- v]
        
filter_unique :: Options -> FingerPrints -> [ReadBlock] -> ResultList
filter_unique opts dups = go M.empty 
  where go dm (r:rs) = let fp = fingerprint (fplen opts) r 
                       in if fp `S.member` dups then let dm' = myinsert fp r dm in dm' `seq` go dm' rs
                          else r `Then` go dm rs
        go dm [] = EndWith dm
        myinsert fp r dm = let v = M.findWithDefault [] fp dm
                               v' = r:v
                           in v `seq` v' `seq` dm `seq` M.insert fp v' dm

mkbf :: Options -> FilePath -> IO FingerPrints
mkbf opts sff = do 
  SFF _ rs <- readSFF sff
  let go seen dup (fp:rest) = if fp `S.member` seen then go seen (S.insert fp dup) rest
                              else go (S.insert fp seen) dup rest
      go _ dup [] = dup
  return $ go S.empty S.empty $ map (fingerprint $ fplen opts) rs

{-
mkbf sff = do 
  SFF _ rs <- readSFF sff
  return $ snd $ mkFilters 1000000 $ map fingerprint rs
-}

-- | Calculating a fingerprint - basically just a hash of the first 20 elements of the flow_index.  On 64 bits, it is 
--   possible to use more, this is a sensitivity/specificity tradeoff.
fingerprint :: Int -> ReadBlock -> Int
fingerprint fpl = foldr (\x y -> y*3+x-1) 0 . map fromIntegral . B.unpack . B.take fpl . B.filter (/=0) . flow_index . trim
              -- trim == drop 4 for the TCAG key, then 3^20 ~ 2^32 for range (or should that be 4^16?)

-- | Align each cluster and merge reads that appear to be from the same clone.
cluster :: Options -> DupMap -> [ReadBlock]
cluster opts =  concatMap (map mergeCG . clusterGroups opts) . M.elems
  
mergeCG :: [ReadBlock] -> ReadBlock
mergeCG = head -- todo: build a consensus flowgram and base call it.

clusterGroups :: Options -> [ReadBlock] -> [[ReadBlock]]
clusterGroups opts (c:cs) = let (this,rest) = span ((<= thresh opts) . matches c) cs 
                       in (c:this) : clusterGroups opts rest
clusterGroups _ [] = []

-- crude flowgram match check, valid range?
matches :: ReadBlock -> ReadBlock -> Double
matches old new = let f = map fromIntegral . flowgram . trim in dist (f old) (f new)

dist :: [Double] -> [Double] -> Double
dist = (.) sum . zipWith (\x y -> (6*(x-y)/(x+y+4))^(2::Int))

averageflow :: [ReadBlock] -> [Double]
averageflow = go . map (map fromIntegral . flowgram . trim)
  where go [] = []
        go xs = avg (map head xs) : go (filter (not . null) $ map tail xs)
        avg xs = sum xs / fromIntegral (length xs)