\section{MaskCount}

Examine each nucleotide from two data sets, and count the Ns (or Xs?).
Separate into A AB B 0.  Also (optionally?) count masked region sizes
to generate gnuplot'able histograms.

TODO:
   - parameter to specify lower case/N/X as masked
   - justify columns in output
   - output percentages

\begin{code}
{-# LANGUAGE BangPatterns, FlexibleContexts #-}
module Main where

import Bio.Sequence

import Lib.TextFormat

import Control.Monad (when)
import Data.Char (isLower)
import Data.List (foldl',intersperse,partition)
import System.Environment (getArgs)
import System.Exit (exitWith,ExitCode(..))
import Text.Printf (printf, PrintfType)

main :: IO ()
main = do
       (opts,[a,b]) <- parseArgs
       as <- readFasta a
       bs <- readFasta b
       let res = zipComp as bs
       when (elem 'l' opts)
         (do
          putStrLn ("# fraction of masked positions: "++a++" both "++b++"none")
          putStrLn $ unlines $ map printres res)
       when (elem 's' opts)
         (do
          putStrLn "# Summary:"
          putStrLn $ printSum a b $ foldl' add (CS 0 0 0 0) (map snd res))

parseArgs :: IO (String, [String])
parseArgs = do
       as <- getArgs
       let (opts,files) = partition ((=='-').head) as
       when (length files /= 2) usage
       let short = concat $ map tail opts
       return (short,files)

usage :: IO a
usage = do putStrLn "mc: compare two FASTA files for masked nucleotides"
           putStrLn "usage: mc [-s|-l] file_1 file_2"
           exitWith $ ExitFailure 1

data CompStruct = CS !Int !Int !Int !Int

add :: CompStruct -> CompStruct -> CompStruct
add (CS a ab b o) (CS a' ab' b' o') = CS (a+a') (ab+ab') (b+b') (o+o')

-- construct the percentage similar nucleotides
printres :: (String, CompStruct) -> String
printres (s,CS a ab b o) = let tot = a+ab+b+o
    in concat $ intersperse " "
       $ s : map (percent tot) [a,ab,b,o]

printSum :: String -> String -> CompStruct -> String
printSum a b (CS ca cab cb co) = unlines (["Summary for "++a++" vs. "++b++":\n"]
   ++ columns " " [ map ("  "++) ["unmasked in both:","masked in both:"
                                 ,"masked in "++a++":","masked in "++b++":"]
                  , map show [co, cab, ca, cb]
                  , map (paren . percent tot) [co,cab,ca,cb ]])
   ++ "Pearson's phi: "++printf "%.3f" (phi :: Double)
  where tot = co+cab+ca+cb
        phi = let [a,b,ab,o] = map ((/(fromIntegral tot)) . fromIntegral) [ca,cb,cab,co]
              in (ab*o-a*b)/sqrt((ab+a)*(ab+b)*(a+o)*(b+o))

paren :: String -> String
paren s = "("++s++"%)"

percent :: (PrintfType (Double -> t), Integral a, Integral a1) => a1 -> a -> t
percent tot x = printf "%.2f" (100*fromIntegral x/fromIntegral tot :: Double)

zipComp :: [Sequence a] -> [Sequence a] -> [(String,CompStruct)]
zipComp as bs = map (z1 (CS 0 0 0 0)) $ zipSeqs as bs

z1 :: CompStruct -> (Sequence a, Sequence a) -> (String, CompStruct)
z1 cs (a,b) = (toStr $ seqlabel a, foldl' collect cs $ zip (get a) (get b))
    where get s = map isLower $ map (s!) [0..seqlength s-1]
                 -- s/isLower/isN/ for N detect.

zipSeqs :: [Sequence a] -> [Sequence a] -> [(Sequence a,Sequence a)]
zipSeqs [] [] = []
zipSeqs (a:as) (b:bs) =
    if seqlabel a == seqlabel b then (a,b):zipSeqs as bs
    else error ("sequence sets differ (offending sequences: '"
             ++ toStr (seqlabel a) ++ "' and '" ++ toStr (seqlabel b) ++"'.")
zipSeqs _ _ = error "extraneous sequences"

collect :: CompStruct -> (Bool, Bool) -> CompStruct
collect (CS a ab b o) (True,True)   = (CS a (ab+1) b o)
collect (CS a ab b o) (True,False)  = (CS (a+1) ab b o)
collect (CS a ab b o) (False,True)  = (CS a ab (b+1) o)
collect (CS a ab b o) (False,False) = (CS a ab b (o+1))

\end{code}
