-- count homopolymer distributions in a fasta file
module HPLCount where

import Bio.Core.Sequence
import Bio.Sequence.Fasta
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.Map as M
import Text.Printf
import Data.Char (toUpper)
import Data.List (intersperse)
import Data.Maybe (fromJust)
import System.Environment

type HPLprob = Char -> Int -> Double

-- lookup and return probability of this hpl length for this letter
mkHPLprobs :: M.Map (Char,Int) Int -> HPLprob
mkHPLprobs ft = let totals = [(c, fromIntegral $ sum [M.findWithDefault 0 (c,x) ft | x <- [0..50]]) | c <- "ACGT" ]
                in \ch n -> fromIntegral (M.findWithDefault 0 (ch,n) ft) / fromJust (lookup ch totals)

main :: IO ()
main = do
  [f] <- getArgs
  display "tacg" . freqtable . concatMap (hpls "tacg" . unSD . seqdata) =<< readFasta f

display :: [Char] -> M.Map (Char,Int) Int -> IO ()
display fs m = let imax = maximum $ map snd $ M.keys m
                   header = concat ("len\t   " : intersperse "\t   " (map (return . toUpper) fs))
               in putStrLn $ unlines (header : [ printf "%3d" i ++ concat [ printf "\t%5d" (M.findWithDefault 0 (c,i) m) |  c <- map toUpper fs ] | i <- [0..imax]])

freqtable :: [(Char,Int)] -> M.Map (Char,Int) Int
freqtable = go M.empty 
  where
    go m [] = m
    go m (i:is) = let v = M.findWithDefault 0 i m 
                      v' = v+1
                      m' = M.insert i v' m
                  in v' `seq` m' `seq` go m' is
  
hpls :: [Char] -> B.ByteString -> [(Char,Int)]
hpls xs = go (cycle xs')
  where xs' = map toUpper xs
        go (f:fs) sd | B.null sd = []
                     | not (elem (toUpper $ B.head sd) xs') = go (f:fs) (B.tail sd)
                     | otherwise = let (this,rest) = B.span ((==f).toUpper) sd
                                   in (f,fromIntegral (B.length this)) : go fs rest
        go [] _ = error "should never happen"