-- | Support for reading various data 
module Formats where
       
import Text.Regex
import Data.List (elemIndex,sortBy)
import Data.Map hiding (map)

type LibTable = [(Regex,String)]  -- | pattern and name
type Cluster = (String,[String])  -- | name and list of reads

-- --------------------------------------------------
-- | Classify a read according to the LibTable
classify :: LibTable -> String -> String
classify ps str = case concatMap (class1 str) ps of
                    [] -> error ("no match for "++str++" in library table")
                    [x] -> x
                    s@(_:_) -> error ("multiple matches for "++str++": "++show s)
    where class1 st (r,s) = maybe [] (const [s]) (matchRegex r st)

-- --------------------------------------------------
-- | Read a LibTable from a file of wsp-separated columns, first line is
--   a header, and only columns "Pattern" and "Name" are used.
readPatternTable :: FilePath -> IO LibTable
readPatternTable f = do
  (h:ls) <- return . map words . lines =<< readFile f
  let (p,n) = case (elemIndex "Pattern" h, elemIndex "Name" h) of
                (Just x,Just y) -> (x,y)
                _ -> error ("Need both 'Pattern' and 'Name' headers in '"++f++"'")
      z l | length l < max p n = error ("Line in library table too short:\n"++show l)
          | otherwise          = (mkRegex (l!!p),l!!n)
  return $ map z ls

-- --------------------------------------------------
-- | Acquire total counts by library from a clustering
totalsByLib :: LibTable -> FilePath -> IO (String,[Int])
totalsByLib lt f = do 
  cs <- readClusters f
  let unify cs = [("total", concatMap snd cs)]
  case countClusters lt (unify cs) of
    [x] -> return x
    _   -> error "totalsByLib: this is impossible"

-- count by classification
countClusters :: LibTable -> [Cluster] -> [(String,[Int])]
countClusters lt = map count1
    where count1 (c,ss) = (c,map (\k -> findWithDefault 0 k (counts ss)) (map snd lt))
          counts ss = fromListWith (+) $ zip (map (classify lt) ss) $ repeat 1

-- --------------------------------------------------
-- | Parse clusters
readClusters :: FilePath -> IO [Cluster]
readClusters f = readFile f >>= return . rc . lines
    where rc []          = []
          rc [_]         = error "odd number of cluster lines!"
          rc (c:ss:rest) = case head $ words c of
                              ('>':name) -> (name,words ss) : rc rest
                              _          -> error ("Cluster '"++c++"' didn't start with '>'")
