-- convert .aln (clustalw output) to .txt (one sequence per line)
{-# Language OverloadedStrings #-}

module Clustal where

import qualified Data.ByteString.Lazy.Char8 as B
import Data.List (sortBy, group, sort)
import Data.Maybe (fromJust)

-- | Stitch the alignment blocks back together, one output line per sequence.
--
-- Each element of the argument is one block of the alignment (a list of
-- @name residues@ lines). The n-th output line is the name taken from the
-- n-th line of the /first/ block (padded by 'splitbs'), a tab, and then the
-- residues of the n-th line of every block, concatenated in block order.
-- Names in later blocks are not compared against the first block.
--
-- Calls 'error' if the blocks do not all contain the same number of lines.
merge :: [[B.ByteString]] -> [B.ByteString]
merge blocks
  | all null blocks = []
  | otherwise = case traverse popLine blocks of
      -- At least one block ran out of lines before the others did.
      Nothing -> error "Uneven number of entries - truncated input?"
      Just popped ->
        let (firstLines, remaining) = unzip popped
            parsed = map splitbs firstLines
            label = case parsed of
              (name, _) : _ -> name
              -- 'all null []' is True, so 'blocks' is non-empty here.
              [] -> error "merge: impossible, blocks proven non-empty"
         in B.concat (label : "\t" : map snd parsed) : merge remaining
  where
    -- Separate the first line of a block from the rest of it.
    popLine [] = Nothing
    popLine (l : ls) = Just (l, ls)
                       
-- | Split one alignment line into its sequence name and its residues.
--
-- The line must consist of exactly two whitespace-separated words. The name
-- is right-padded with spaces to 34 characters: clustalw pads names to 35
-- columns, and 'merge' supplies the last column as a tab instead of a space.
-- A name that is already 34 characters or longer is returned unpadded.
--
-- Calls 'error' (echoing the offending line) when the line has more or
-- fewer than two words; the two cases are reported with distinct messages.
splitbs :: B.ByteString -> (B.ByteString, B.ByteString)
splitbs bs = case B.words bs of
  [name, residues] -> (pad 34 name, residues)
  (_ : _ : _ : _) -> error ("More than two words on one line?\n" ++ B.unpack bs)
  -- Zero or one word, e.g. an empty line or a line holding only markers.
  _ -> error ("Fewer than two words on one line?\n" ++ B.unpack bs)
  where
    -- 'B.replicate' yields an empty string for a non-positive count.
    pad n x = B.append x (B.replicate (n - B.length x) ' ')

-- | Group the lines of an alignment into blocks.
--
-- A block ends at an empty line. The line immediately before that empty line
-- is discarded together with it (the original author notes it is a line of
-- spaces; presumably clustalw's conservation-marker line). The very last
-- line of the input is discarded in the same way.
--
-- The result always contains at least one (possibly empty) block.
chunks :: [B.ByteString] -> [[B.ByteString]]
chunks = collect []
  where
    -- 'acc' holds the lines of the current block, most recent first.
    collect acc remaining = case remaining of
      (cur : next : rest)
        -- 'cur' precedes a blank line: drop both and close the block.
        | B.null next -> reverse acc : collect [] rest
        | otherwise -> collect (cur : acc) (next : rest)
      -- One trailing line (dropped) or nothing left: emit the final block.
      _ -> [reverse acc]

-- | Residue frequencies for every column of an alignment.
--
-- Each input line must be a @name residues@ pair as accepted by 'splitbs';
-- only the residues are used. The result has one entry per column, numbered
-- from 0, holding that column's tally as computed by 'freq'.
--
-- Calls 'error' if the residue strings do not all have the same length.
col_freq :: [B.ByteString] -> [(Int,[(Char,Int)])]
col_freq = columns 0 . map (snd . splitbs)
  where
    columns idx rows
      | all B.null rows = []
      | otherwise = case traverse B.uncons rows of
          -- Some rows are exhausted while others still have residues.
          Nothing -> error "Uneven lengths!"
          Just split ->
            let (column, rests) = unzip split
             in freq idx column : columns (idx + 1) rests

-- | Tally the residues of a single alignment column.
--
-- Gap characters (@\'-\'@) are ignored. The result pairs the given column
-- index with @(residue, count)@ entries ordered by descending count;
-- residues with equal counts stay in ascending character order, because
-- 'sortBy' is stable and the tallies are built from sorted input.
freq :: Int -> [Char] -> (Int,[(Char,Int)])
freq i column = (i, sortBy mostFrequentFirst tallies)
  where
    residues = filter (/= '-') column
    -- 'group' never yields an empty run, so the pattern always matches.
    tallies = [(c, length run) | run@(c : _) <- group (sort residues)]
    mostFrequentFirst (_, m) (_, n) = compare n m
