hunk ./clustertools.cabal 2 -Version: 0.1.2 +Version: 0.1.3 hunk ./clustertools.cabal 65 - Build-depends: bio>=0.3.3.4 + Build-depends: bio>=0.4 hunk ./src/Ace2Fasta.hs 19 -pad_all :: Assembly -> [Sequence] +pad_all :: Assembly -> [Sequence Nuc] hunk ./src/Ace2Fasta.hs 24 -pad :: Offset -> (Sequence,Gaps) -> Sequence +pad :: Offset -> (Sequence Nuc,Gaps) -> Sequence Nuc hunk ./clustertools.cabal 2 -Version: 0.1.3 +Version: 0.1.4 hunk ./clustertools.cabal 75 + Other-modules: Statistics hunk ./clustertools.cabal 36 -Tested-With: GHC==6.8.2 +Tested-With: GHC==6.10.4 hunk ./clustertools.cabal 45 - Build-Depends: base>3, containers, simpleargs>=0.1 + Build-Depends: base>3 && <4, containers, simpleargs>=0.1 hunk ./src/Statistics.hs 8 -choose :: Integer -> Integer -> Double -n `choose` k = (fromIntegral $ product [k+1..n])/(fromIntegral $ product [1..n-k]) +choose :: Integral i => i -> i -> Integer +n' `choose` k' = let (n,k) = (fromIntegral n', fromIntegral k') + in product [k+1..n] `div` product [1..n-k] hunk ./src/Statistics.hs 15 -binomial p n k | k <= n = (fromIntegral n `choose` fromIntegral k) * (p**fromIntegral k) * ((1-p)** fromIntegral (n-k)) +binomial p n k -- | p*fromIntegral n > 20 && (1-p)*fromIntegral n > 20 = normal_apporx p n k + | k <= n = fromIntegral (n `choose` k) * (p**fromIntegral k) * ((1-p)** fromIntegral (n-k)) hunk ./src/Statistics.hs 29 +{- crap! this won't work, you can't just sum normals, goddamit. +normal_approx p n k = pdf_normal (fromIntegral n*p) (fromIntegral n*p*(1-p)) (fromIntegral k) +pdf_normal mu sigma x = exp(negate((x-mu)^2/(2*sigma^2)))/(sigma*sqrt(2*pi)) + +invcumnorm mu sigma z = mu + search (-limit*sigma) (limit*sigma) + where search a b = let c = (a+b)/2 + cn = cumnorm 0 sigma c + in if abs (z - cn) < 10*epsilon || abs (a-b) < epsilon then c + else if cn > z then search a c + else search c b +-} addfile ./src/Formats.hs hunk ./clustertools.cabal 28 - clustering (TGICL-format), output a table of clusters with the library name - prepended to the sequences. + clustering (TGICL-format), output a table of cluster sizes per library. hunk ./clustertools.cabal 74 - Other-modules: Statistics + Other-modules: Statistics, Formats hunk ./src/ClusterLibs.hs 5 +module Main where + hunk ./src/ClusterLibs.hs 8 -import Text.Regex -import Data.List (elemIndex,sortBy,intersperse) -import Data.Map hiding (map) +import Data.List (intersperse) hunk ./src/ClusterLibs.hs 12 - -type LibTable = [(Regex,String)] -type Cluster = (String,[String]) +import Formats (LibTable, Cluster + , readPatternTable, totalsByLib, readClusters, countClusters, classify) hunk ./src/ClusterLibs.hs 25 --- counts is +-- | Using count by library, takes a cluster, calculates significance scores +-- and formats it for output. hunk ./src/ClusterLibs.hs 35 -{- - clus <- classClusters pat cs - writeTable clus --} - hunk ./src/ClusterLibs.hs 41 --- -------------------------------------------------- --- | Read a LibTable from a file of wsp-separated columns, first line is --- a header, and only columns "Pattern" and "Name" are used. -readPatternTable :: FilePath -> IO LibTable -readPatternTable f = do - (h:ls) <- return . map words . lines =<< readFile f - let (p,n) = case (elemIndex "Pattern" h, elemIndex "Name" h) of - (Just x,Just y) -> (x,y) - _ -> error ("Need both 'Pattern' and 'Name' headers in '"++f++"'") - z l | length l < max p n = error ("Line in library table too short:\n"++show l) - | otherwise = (mkRegex (l!!p),l!!n) - return $ map z ls - --- -------------------------------------------------- --- | Acquire total counts by library from a clustering -totalsByLib :: LibTable -> FilePath -> IO (String,[Int]) -totalsByLib lt f = do - cs <- readClusters f - let unify cs = [("total", concatMap snd cs)] - case countClusters lt (unify cs) of - [x] -> return x - _ -> error "totalsByLib: this is impossible" - --- -------------------------------------------------- --- | Parse clusters -readClusters :: FilePath -> IO [Cluster] -readClusters f = readFile f >>= return . rc . lines - where rc [] = [] - rc [_] = error "odd number of cluster lines!" - rc (c:ss:rest) = case head $ words c of - ('>':name) -> (name,words ss) : rc rest - _ -> error ("Cluster '"++c++"' didn't start with '>'") - -classify :: LibTable -> String -> String -classify ps str = case concatMap (class1 str) ps of - [] -> error ("no match for "++str++" in library table") - [x] -> x - s@(_:_) -> error ("multiple matches for "++str++": "++show s) - where class1 st (r,s) = maybe [] (const [s]) (matchRegex r st) - - hunk ./src/ClusterLibs.hs 54 --- count by classification -countClusters :: LibTable -> [Cluster] -> [(String,[Int])] -countClusters lt = map count1 - where count1 (c,ss) = (c,map (\k -> findWithDefault 0 k (counts ss)) (map snd lt)) - counts ss = fromListWith (+) $ zip (map (classify lt) ss) $ repeat 1 hunk ./src/Formats.hs 1 +-- | Support for reading various data +module Formats where + +import Text.Regex +import Data.List (elemIndex,sortBy) +import Data.Map hiding (map) + +type LibTable = [(Regex,String)] -- | pattern and name +type Cluster = (String,[String]) -- | name and list of reads + +-- -------------------------------------------------- +-- | Classify a read according to the LibTable +classify :: LibTable -> String -> String +classify ps str = case concatMap (class1 str) ps of + [] -> error ("no match for "++str++" in library table") + [x] -> x + s@(_:_) -> error ("multiple matches for "++str++": "++show s) + where class1 st (r,s) = maybe [] (const [s]) (matchRegex r st) + +-- -------------------------------------------------- +-- | Read a LibTable from a file of wsp-separated columns, first line is +-- a header, and only columns "Pattern" and "Name" are used. +readPatternTable :: FilePath -> IO LibTable +readPatternTable f = do + (h:ls) <- return . map words . lines =<< readFile f + let (p,n) = case (elemIndex "Pattern" h, elemIndex "Name" h) of + (Just x,Just y) -> (x,y) + _ -> error ("Need both 'Pattern' and 'Name' headers in '"++f++"'") + z l | length l < max p n = error ("Line in library table too short:\n"++show l) + | otherwise = (mkRegex (l!!p),l!!n) + return $ map z ls + +-- -------------------------------------------------- +-- | Acquire total counts by library from a clustering +totalsByLib :: LibTable -> FilePath -> IO (String,[Int]) +totalsByLib lt f = do + cs <- readClusters f + let unify cs = [("total", concatMap snd cs)] + case countClusters lt (unify cs) of + [x] -> return x + _ -> error "totalsByLib: this is impossible" + +-- count by classification +countClusters :: LibTable -> [Cluster] -> [(String,[Int])] +countClusters lt = map count1 + where count1 (c,ss) = (c,map (\k -> findWithDefault 0 k (counts ss)) (map snd lt)) + counts ss = fromListWith (+) $ zip (map (classify lt) ss) $ repeat 1 + +-- -------------------------------------------------- +-- | Parse clusters +readClusters :: FilePath -> IO [Cluster] +readClusters f = readFile f >>= return . rc . lines + where rc [] = [] + rc [_] = error "odd number of cluster lines!" + rc (c:ss:rest) = case head $ words c of + ('>':name) -> (name,words ss) : rc rest + _ -> error ("Cluster '"++c++"' didn't start with '>'") addfile ./src/Diversity.hs hunk ./src/Diversity.hs 1 - +-- | Calculate library diversity from a clustering and a library table + +module Main where + +import System.Environment (getArgs) +import Data.Set as S hiding (map) +import Data.Map as M hiding (map) +import Data.List (sort, intersperse) + +import Formats + +main = do + [lib,cs] <- getArgs -- catch etc + lt <- readPatternTable lib + tot <- totalsByLib lt cs + cls <- countClusters lt `fmap` readClusters cs + writeColumns lt $ mkColumns lt cls + +-- | Use cluster counts to generate a set of columns, indexed by library +mkColumns :: LibTable -> [(String,[Int])] -> M.Map String [Int] +mkColumns lt = M.fromListWith append . concatMap mkentry + where append old new = new++old + mkentry row = zip (map snd lt) (map return $ snd row) + +-- | ouput columns, in a format suitable for gnuplotting: +writeColumns lt cm = do + let cols = map ((flip (++) (repeat 0)) . reverse . sort . mylookup cm . snd) lt + rows = transpose cols + writeRow $ map snd lt + mapM_ writeRow $ map (map show) $ takeWhile (not . all (==0)) rows + +mylookup m v = case M.lookup v m of Just x -> x + Nothing -> error ("Couldn't find library "++show v + ++"\namong: "++show (M.keys m)) + +-- transpose a set of infinite lists +transpose xs = map head xs : transpose (map tail xs) +writeRow = putStrLn . concat . intersperse "\t" hunk ./src/Diversity.hs 27 - let cols = map ((flip (++) (repeat 0)) . reverse . sort . mylookup cm . snd) lt - rows = transpose cols + let cols = map (interpolate 1 . cumpercent . reverse . sort . mylookup cm . snd) lt + rows = transpose ([0..100]:cols) hunk ./src/Diversity.hs 30 - mapM_ writeRow $ map (map show) $ takeWhile (not . all (==0)) rows + mapM_ writeRow $ map (map show) $ rows hunk ./src/Diversity.hs 36 --- transpose a set of infinite lists +interpolate :: Double -> [(Double,Double)] -> [Double] +interpolate delta cs = go 0 cs + where go x xs@(a:b:rest) = if fst a<=x then if fst b>x then snd a + (x-fst a)*(snd b-snd a)/(fst b-fst a) : go (x+delta) xs + else go x (b:rest) + else error "interpolation failed" + go _ [(_,y)] = [y] -- last should always be 100% + +-- calculate cumulative percentages of total +cumpercent :: [Int] -> [(Double,Double)] +cumpercent xs = let total = fromIntegral $ sum xs + cum = scanl (+) 0 $ map ((/total) . (*100) . fromIntegral) xs + step = 100 / fromIntegral (length xs) + in zip (iterate (+step) 0) cum + +-- transpose a set of infinite/equally-sized lists hunk ./clustertools.cabal 80 - Build-depends: haskell98 + -- Build-depends: haskell98 hunk ./src/Xcerpt.lhs 9 -import System +import System.Environment (getArgs) hunk ./src/Xcerpt.lhs 23 - (opts,args) <- return . partition (\a->not (null a) && head a=='-') =<< System.getArgs + (opts,args) <- return . partition (\a->not (null a) && head a=='-') =<< getArgs hunk ./clustertools.cabal 80 - -- Build-depends: haskell98 hunk ./src/Xcerpt.lhs 11 -import IO +import System.IO hunk ./src/Diversity.hs 2 +-- TODO: remove zeros! hunk ./src/Diversity.hs 28 - let cols = map (interpolate 1 . cumpercent . reverse . sort . mylookup cm . snd) lt + let cols = map (interpolate 1 . cumpercent . reverse . sort . Prelude.filter (/=0) . mylookup cm . snd) lt