1 {- | Implement clustering
    2 
    3 -}
    4 
    5 module Bio.Clustering where
    6 
    7 import qualified Data.Set as S
    8 import Data.List (partition,foldl')
    9 
   10 -- | Data structure for storing hierarchical clusters
   11 data Clustered score datum = Branch score (Clustered score datum) (Clustered score datum)
   12                            | Leaf datum  deriving Show
   13 
   14 -- | Single linkage agglomerative clustering.
   15 --   Cluster elements by slurping a sorted list of pairs with score (i.e. triples :-)
   16 --   Keeps a set of contained elements at each branch's root, so O(n log n),
   17 --   and requires elements to be in Ord.
   18 --   For this to work, the triples must be sorted on score. Earlier scores in the list will
   19 --   make up the lower nodes, so sort descending for similarity, ascending for distance.
   20 cluster_sl  :: (Ord a, Ord s) => [(s,a,a)] -> [Clustered s a]
   21 cluster_sl = map fst . foldl' csl []
   22     where csl cs (s,a,b) = 
   23               -- can be short circuited for more performance
   24               let (acs,tmp)   = partition (\(_,objs) -> a `S.member` objs) cs
   25                   (bcs,rest)  = partition (\(_,objs) -> b `S.member` objs) tmp
   26               in case (acs,bcs) of 
   27                    ([(ac,ao)],[(bc,bo)]) -> (Branch s ac bc,S.union ao bo):rest
   28                    ([(ac,ao)],[])        -> if b `S.member` ao then (ac,ao):rest
   29                                             else (Branch s ac (Leaf b),S.insert b ao):rest
   30                    ([],[(bc,bo)])        -> if a `S.member` bo then (bc,bo):rest
   31                                             else (Branch s bc (Leaf a),S.insert a bo):rest
   32                    ([],[])               -> (Branch s (Leaf a) (Leaf b),S.fromList [a,b]):rest
   33                    _                     -> error "Grave mistake"
   34 
   35 
   36 -- cluster_gen :: [a] -> (a->a->Bool) ->