1 {- | Implement clustering 2 3 -} 4 5 module Bio.Clustering where 6 7 import qualified Data.Set as S 8 import Data.List (partition,foldl') 9 10 -- | Data structure for storing hierarchical clusters 11 data Clustered score datum = Branch score (Clustered score datum) (Clustered score datum) 12 | Leaf datum deriving Show 13 14 -- | Single linkage agglomerative clustering. 15 -- Cluster elements by slurping a sorted list of pairs with score (i.e. triples :-) 16 -- Keeps a set of contained elements at each branch's root, so O(n log n), 17 -- and requires elements to be in Ord. 18 -- For this to work, the triples must be sorted on score. Earlier scores in the list will 19 -- make up the lower nodes, so sort descending for similarity, ascending for distance. 20 cluster_sl :: (Ord a, Ord s) => [(s,a,a)] -> [Clustered s a] 21 cluster_sl = map fst . foldl' csl [] 22 where csl cs (s,a,b) = 23 -- can be short circuited for more performance 24 let (acs,tmp) = partition (\(_,objs) -> a `S.member` objs) cs 25 (bcs,rest) = partition (\(_,objs) -> b `S.member` objs) tmp 26 in case (acs,bcs) of 27 ([(ac,ao)],[(bc,bo)]) -> (Branch s ac bc,S.union ao bo):rest 28 ([(ac,ao)],[]) -> if b `S.member` ao then (ac,ao):rest 29 else (Branch s ac (Leaf b),S.insert b ao):rest 30 ([],[(bc,bo)]) -> if a `S.member` bo then (bc,bo):rest 31 else (Branch s bc (Leaf a),S.insert a bo):rest 32 ([],[]) -> (Branch s (Leaf a) (Leaf b),S.fromList [a,b]):rest 33 _ -> error "Grave mistake" 34 35 36 -- cluster_gen :: [a] -> (a->a->Bool) ->