
Calculate the contingency matrix for two clusterings.
For clusterings $C = {c_1,c_2,...c_n} and $K = {k_1, k_2,..k_m}$
the matrix is defined as $ a_ij = c_i U k_j $.

The matrix is calculated lazily, and arbitrary functions can
be applied to it to derive various scores: edit distance,
jaccard index, exact count, etc.

\begin{code}

module Main where

import qualified Data.ByteString.Lazy.Char8 as B

import Data.Map hiding (map,filter,null)
import qualified Data.Map as M
import Data.List
import Data.Maybe
import System.Environment (getArgs)
import Text.Printf

type ByteString = B.ByteString

main :: IO ()
main = do
   (a:args) <- getArgs
   a' <- B.readFile a
   let ref = mkC $ readC a'
       n   = maximum $ map length ("clustering":args)
   putStrLn (pad n "clustering" ++ "\tVI\tMIF\tJaccard\tRand\tFowMal\tSkew")
   mapM_ (comp n ref) args

pad :: Int -> String -> String
pad i s = s ++ replicate (i-length s) ' '

comp :: Int -> Clustering -> FilePath -> IO ()
comp n k f = do
    f' <- B.readFile f
    let mx = matrix k (readC f')
        ct = mkPairs k mx
        sh = printf "%.3f"
    putStrLn $ concat $ intersperse "\t" $ (pad n f) : map sh
            [fst $ iscore mx -- VI
            ,snd $ iscore mx -- MIF
            ,jaccard ct, rand ct, fowmal ct, skew ct] --  ++"\tmatrix: "++show ct)

type CID = Int

type Sequence     = ByteString            -- hash for efficiency?
type Cluster = [Sequence]

type Clustering   = (Map CID Cluster
                    ,Map Sequence CID)  -- optimization option

cid :: Clustering -> Sequence -> Maybe CID
cid k x = M.lookup x (snd k)

seqs :: Clustering -> CID -> Maybe Cluster
seqs k c = M.lookup c (fst k)

-- read a clustering from file
readC :: ByteString -> [Cluster]
readC = filter (not.null) . map B.words . B.lines

-- build the Clustering from a set of clusters
mkC :: [Cluster] -> Clustering
mkC  = foldr ins (M.empty,M.empty) . zip [0..]
  where ins (n,ss) (i2c,s2i) = (M.insert n ss i2c, foldr (\s -> M.insert s n) s2i ss)

\end{code}

Matrix construction.

\begin{code}

type Row    = [(Maybe CID,[Sequence])]
type Matrix = [Row]

-- reads a clustering and produces the confusion (-tingency) matrix
matrix :: Clustering -> [[Sequence]] -> Matrix
matrix c = map (matrixRow c)

matrixRow :: Clustering -> [Sequence] -> Row
matrixRow k c1 = comb $ sort $ map (\x -> (cid k x, [x])) c1 -- cid,seq
    where comb ((a,b):(c,d):xs) | a==c      = comb ((a,b++d):xs)
                                | otherwise = (a,b):comb ((c,d):xs)
          comb [(a,b)] = [(a,b)]
          comb []      = []

\end{code}

\subsection{Jaccard index, and friends}

Indices based on pair counts.  This includes jaccard, rand, ...?
"skew" is based on MacNemar's test.

\begin{code}

type Pairs = (Double,Double,Double,Double) -- a b c, and d

jaccard, rand, fowmal, hubert, skew :: Pairs -> Double
jaccard (a,b,c,_) = a/(a+b+c)
rand    (a,b,c,d) = (a+d)/(a+b+c+d)
fowmal  (a,b,c,_) = a/sqrt((a+b)*(a+c))  -- Fowlkes-Mallows index
-- this is the same as fowmal (why?)!
hubert  (a,b,c,d) = (a*d - b*c)/sqrt((a+b)*(c+d)*(a+c)*(b+d)) -- Gamma stat?

skew     (_,b,c,d) = (b-c)/sqrt(b+c)/sqrt d

mkPairs :: Clustering -> Matrix -> Pairs
mkPairs k m = let (a,b,c,ct) = foldl (jc1 k) (0,0,0,0) m
            in (a/2,b/2,c/2,(ct*(ct-1)-a-b-c)/2)

-- this is pretty ugly!
jc1 :: Clustering -> Pairs -> Row -> Pairs
jc1 k j row = let row' = filter (isJust.fst) row
                  l = sum $ map (len.snd) row'
              in foldr (jc2 k l) j row'

jc2 :: Clustering -> Double -> (Maybe CID,[Sequence]) -> Pairs -> Pairs
jc2 k lc (Just i,ss) (a,b,c,ct) = (a+ls*(ls-1),b+(ls*(lk-ls)),c+(ls*(lc-ls)),ct+ls)
    where ls = len ss
          lk = len $ fromJust (seqs k i)  -- fixme! only seqs found in C!

-- todo: accumulate column sizes until end?  how does clusqual deal?

\end{code}

\subsection{Entropy-based measures}

The \texttt{iscore} function calculates the variation of information
between the two clustrings. Given clusterings $C$ and $K$ matrix $H$, it is
calculated as:

\(
  \sum_C H_c\log H_c + \sum_K H_k\log H_k -2\sumH_{K,C} H_{k,c} \log H_{k,c}
\)

Todo: implement \[I/H = (H_c - H_k)/H_{c,k} - 1\]

I.e. collect H_k, H_c,k, and H_c, and use it to calculate both measures.

\begin{code}

type Imx = (Double,Double,Double,Double)

-- iscore calculates mutual information fraction and variation of information
iscore :: Matrix -> (Double,Double) -- MIF and VI
iscore mx = let (n,hc,hk,hck) = iscores empty (0,0,0,0) mx
                vi  = 1/n*(-2*hck+hc+hk)
                mif = (hc+hk-2*mlog n)/(hck-mlog n)-1
            in (vi,mif)

-- iscores - calculate n, H_c, H_k, H_c,k
iscores:: Map Int Double -> Imx -> Matrix -> Imx
iscores m (n,hc,hk,hck) (r:rs) =
    let r' = map (len.snd) $ filter (isJust.fst) r
    in iscores (accum m r)
               (n+sum r',hc,hk+mlog (sum r'),hck+sum (map mlog r')) rs
iscores m (n,_hc,hk,hck) [] =
    (n,(sum $ map mlog $ elems m),hk,hck)

-- accumulate the column sums for the matrix.  This must be done
-- explicitly, since one clustering may contain sequences not present
-- in the other
accum :: Map Int Double -> Row -> Map Int Double
accum m' row = foldr accum1 m' row
   where accum1 (Just c,ss) m = insertWith (+) c (len ss) m
         accum1 (Nothing,_)   m = m

mlog :: Double -> Double
mlog 0 = 0 -- error "log 0!"
mlog x = x*log x/log 2

len :: [a] -> Double
len = fromIntegral . length

\end{code}

