{- IDEA: annotate a transcript by doing blastx vs NR, then blastp NR vs SwissProt
Align transcript vs SP by mapping each position via NR hits to positions in SP.
How to test?
Generalize:
annotate a_vs_b.xml b_vs_c.xml c_vs_d.xml => result: d_vs_a (or: a_vs_d?)
-}
{-# Language TypeFamilies #-}
module Align where
import qualified Data.IntMap.Strict as M
import Data.List (sort,sortBy)
import Data.Int (Int32)
type Pos = Int32
type Score = Float
collect_aligns :: (usid -> [(ssid, Alignment s q r)]) -> [(usid, Alignment s p q)] -> [(ssid,Alignment s p r)]
collect_aligns sp_lu ups = do -- list monad
(uhit,ualign) <- ups
(shit,salign) <- sp_lu uhit
return (shit,trans_align ualign salign)
-- A proper 'Alignment' links positions in one sequence to positions in another,
-- assigning a score to each link. The position links are co-linear and one-to-one.
data A = A {-# UNPACK #-} !Pos {-# UNPACK #-} !Pos {-# UNPACK #-} !Score deriving Show
type Alignment score pos1 pos2 = [A]
score :: Alignment s p q -> Score -- s
score = sum . map (\(A _ _ s) -> s)
-- | Reverse the alignment
invert :: Alignment s p q -> Alignment s q p
invert = map (\(A x y s) -> (A y x s))
-- | Follow positions via the first 'Alignment', through the second to
-- form the transitive alignment. Note that if the alignments have
-- nothing in common, this may produce an empty alignment.
trans_align :: Alignment s p q -> Alignment s q r -> Alignment s p r
trans_align x@((A xp xq xs):xx) y@((A yq yr ys):yy)
| xq == yq = (A xp yr (min xs ys)) : trans_align xx yy
| xq < yq = trans_align xx y
| xq > yq = trans_align x yy
trans_align _ _ = []
-- NOTE: this is exactly the >>> operator for the "Alignment s" arrow!
-- collect hits against the same sequence, and calculate a consensus alignment
merge_aligns :: (Show ssid, Ord ssid) => [(ssid,Alignment s p q)] -> [(ssid,Alignment s p q)]
merge_aligns = map merge_group . groups . filter is_not_empty
where is_not_empty = not . null . snd
merge_group :: (Show ssid) => (ssid, [Alignment s p q]) -> (ssid, Alignment s p q)
merge_group (tgt,as) = (tgt , go [] $ group_al''' as) -- $ map (\(p,xs) -> (p,collect xs))
where
-- this is just a regular local alignment using the
-- scores from the provided blast alignments. Using the (sparse) matrix from group_al,
-- no gap penalties, just the best increasing path
go [] [] = error ("last will fail! " ++ show tgt ++ " " ++ show as)
go [] ((p,xs):rest) = go (map (\(q,s)->(s,[(A p q s)])) xs) rest -- build the first columnt
go prev_col (qss:rest) = go (sort_column $ merge1 prev_col qss) rest -- merge, and sort by q - i.e. target position
go prev_col [] = reverse $ snd $ last $ prev_col -- done: return the resulting alignment
-- Merge two columns (the accumulated column and the next one) from the alignment DP matrix
-- The accumulated column is a list of (scores,partial alignment)-pairs. Each pair (cell) from the next
-- column is added to the highest scoring compatible entry in the accumulating column.
-- (t ~ Pos, p ~ Pos, q ~ Pos, a ~ Pos, s ~ Score, t1 ~ Score, Num t1, Ord t1, Ord a) =>
merge1 :: [(Score, [A])] -> (Pos, [(Pos, Score)]) -> [(Score, [A])]
merge1 [] _ = error "empty prev in merge1"
merge1 ((_, []) : _) (_, _ : _) = error "merge1: complex pattern missing"
merge1 ps (_,[]) = ps
merge1 prev@((ts,(A p1 q1 s1):qs1):pc) (p,(q,s):nc)
| q <= q1 = (s,[(A p q s)]) : continue -- q is the shortest alignment, and we just add this
| null pc = (ts+s,(A p q s):(A p1 q1 s1):qs1) : continue -- we exhausted prev, tack this pair on to the last entry
| otherwise = let (_,(A _ q2 _):_) = head pc -- now: let's see
in if q2 < q {- && ts2 > ts -}
then merge1 pc (p,(q,s):nc) -- prune this position, we're better off with the next one(?)
else (ts+s,(A p q s):(A p1 q1 s1):qs1) : continue -- this is the best candidate, add it here
where continue = merge1 prev (p,nc) -- recurse (so this is really just a fold)
-- warning: required invariant: total-score ts increases down the prev_column! (because the
-- q-position increases, and for any alignment to be useful, it must score higher than shorter
-- alternatives.)
-- sort on q-coordinate and filter so scores are increasing
sort_column :: [(Score, [A])] -> [(Score, [A])]
sort_column = filter_scores . sortBy (compare `on` qval)
where qval (_ts,(A _p q _s):_qs) = q
f `on` g = \x y -> f (g x) (g y)
filter_scores ((xs,xss):(ys,yss):rest)
| ys < xs = filter_scores ((xs,xss):rest)
| otherwise = (xs,xss):filter_scores ((ys,yss):rest)
filter_scores [x] = [x]
filter_scores [] = []
-- Add scores for all (q,s) pairs mapping to the same q. Input must be sorted on q.
collect :: [(Pos, Score)] -> [(Pos, Score)]
collect ((q1,s1):(q2,s2):rest)
| q1==q2 = collect ((q1,s1+s2):rest)
| q1 < q2 = (q1,s1) : collect ((q2,s2):rest)
| otherwise = error ("collect: unsorted input!\n"++show ((q1,s1):(q2,s2):take 2 rest))
collect [x] = [x]
collect [] = []
-- Group alignments against the same target
-- Basically a quicksort
groups :: Ord sid => [(sid, align)] -> [(sid,[align])]
groups ((s,a):xs) = this : groups less ++ groups more
where this = (s, a : map snd (filter ((s==).fst) xs))
less = filter ((s<).fst) xs
more = filter ((s>).fst) xs
groups [] = []
-- prop: length = length . concat . groups
-- Group alignments on first coordinate (p), using a direct merge of
-- the alignments. In other words, we take a list of input alignments, and populate a
-- a complete, sparse, DP matrix. (The group_alX functions, with X being a number
-- of primes, are semantically equivalent implementations.
group_al :: [Alignment s p q] -> [(Pos,[(Pos,Score)])]
group_al = join . foldr1 merge
where -- merge :: Alignment s p q -> Alignment s p q -> Alignment s p q
merge one@((A p1 q1 s1):one_rest) two@((A p2 q2 s2):two_rest)
| p1 < p2 = (A p1 q1 s1):merge one_rest two
| p1 == p2 = (A p1 q1 s1):(A p2 q2 s2):merge one_rest two_rest
| p1 > p2 = (A p2 q2 s2):merge one two_rest
merge [] xs = xs
merge xs [] = xs
join ((A p q s):xs) = go p [(q,s)] xs
go p0 acc ((A p q s):xs)
| p0 == p = go p0 ((q,s):acc) xs
| otherwise = (p0,sort acc) : join ((A p q s):xs)
go p0 acc [] = [(p0,sort acc)]
{-
-- Using a Map is slightly better than using quicksort OR mergesort(!)
group_al' :: (Ord s) => [Alignment s Int Int] -> [(Int,[(Int,s)])]
group_al' = merge . concatMap (map (\(p,q,s) -> (p,[(q,s)])))
where merge = M.toList . M.map sort . M.fromListWith (flip (++))
-- As alignmnents are all sorted already, quicksort (from groups) is a
-- *very* bad choice. This is hopelessly inefficient.
group_al'' :: (Ord p, Ord q, Ord s) => [Alignment s p q] -> [(p,[(q,s)])]
group_al'' = sort . map (\(x,ys) -> (x,sort ys)) . groups . map (\(p,q,s) -> (p,(q,s))) . concat
-}
-- Use a Map p (Map q s): this is faster and uses less memory than the others.
group_al''' :: [Alignment s p q] -> [(Pos,[(Pos,Score)])]
group_al''' = toList . M.unionsWith merge . map toMap
where merge = M.unionWith max -- (+)
toMap :: Alignment s p q -> M.IntMap (M.IntMap Score)
toMap = M.fromAscList . map (\(A p q s) -> (fromIntegral p,M.singleton (fromIntegral q) s))
toList :: M.IntMap (M.IntMap Score) -> [(Pos, [(Pos, Score)])]
toList = map fst2int . M.toAscList . M.map (map fst2int . M.toAscList)
fst2int (f,r) = (fromIntegral f,r)
-- testing
{-
a1, a2 :: Alignment Double Int Int
a1 = [(0,2,0.5),(1,3,0.4),(2,5,0.2)]
a2 = [(x,x+2,0.3) | x <- [0..3]]
test :: [(SeqId Int, Alignment Double Int Int)]
test = undefined "doit" [(undefined,a1)] (const [(undefined,a2),(undefined,a1)])
-- | Check colinearity and one-to-oneness by asserting that
-- both sets of positions are sorted and without duplicates.
-- This won't necessarily work for alignments of a sequence to its reverse
-- complement, one way to do it is to represent a reverse match to position
-- k as (negate k-1), essentially laying out the sequence as:
-- <-----gcat-- ==atgc====>
-- -10 .... -1 0123456789
-- We still need to check that an alignment is either
-- all negative, or all non-negative.
prop_alignment :: (Ord p, Ord q) => Alignment s p q -> Bool
prop_alignment a = isSorted (map first a) && isSorted (map second a)
where first (x,_,_) = x
second (_,y,_) = y
isSorted (x:y:rest) = x < y && isSorted (y:rest)
isSorted _ = True
-}