1 | 2 This module contains a collection of functions for calculate hash values from nucleotide words. 3 4 (Actually, it mostly packs nucleotide words from into integers, so it's a one-to-one relationship.) 5 It is intended to be used for indexing large sequence collections. 6 7 \begin{code} 8 module Bio.Sequence.HashWord where 9 10 import Bio.Sequence.SeqData 11 import Data.List (sort) 12 import Data.Char (toUpper) 13 import Data.Int 14 15 import qualified Data.ByteString.Lazy.Char8 as B 16 17 -- | This is a struct for containing a set of hashing functions 18 data HashF k = HF { hash :: SeqData -> Offset -> Maybe k -- ^ calculates the hash at a given offset in the sequence 19 , hashes :: SeqData -> [(k,Offset)] -- ^ calculate all hashes from a sequence, and their indices 20 , ksort :: [k] -> [k] -- ^ for sorting hashes 21 } 22 23 -- | Adds a default "hashes" function to a @HashF@, when "hash" is defined. 24 genkeys :: HashF k -> HashF k 25 genkeys kf = kf { hashes = gkeys } 26 where gkeys s = mkkeys (hash kf $ s) [0..B.length s-1] 27 mkkeys f (p:ps) = case f p of Nothing -> mkkeys f ps 28 Just k -> (k,p) : mkkeys f ps 29 mkkeys _ [] = [] 30 31 \end{code} 32 33 \subsection{Generating hashes} 34 35 Contigous hashes. 36 37 \begin{code} 38 39 -- | Contigous constructs an int\/eger from a contigous k-word. 40 contigous :: Integral k => Int -> HashF k 41 contigous k' = let 42 k = fromIntegral k' 43 44 c_key s i | k+i > B.length s = Nothing 45 | otherwise = let s' = B.take k $ B.drop i s 46 in if B.find isN s' == Nothing then Just (n2k k' s') 47 else Nothing 48 c_keys s = c_keys' 0 s 49 c_keys' c s | k > B.length s = [] 50 | otherwise = case B.findIndex isN w0 of 51 Nothing -> let cur = n2k (fromIntegral k) s 52 in (cur,c) : c_keys'' cur c (B.drop k s) 53 Just i -> c_keys' (c+i+1) (B.drop (i+1) s) 54 where w0 = B.take k s 55 56 c_keys'' cur p s | B.null s = [] 57 | isN (B.head s) = c_keys' (p+k+1) (B.tail s) 58 | otherwise = let new = (cur `mod` 4^(k-1))*4+val (B.head s) 59 in (new,p+1) : c_keys'' new (p+1) (B.tail s) 60 61 in HF { hash = c_key 62 , hashes = c_keys 63 , ksort = Data.List.sort 64 } 65 \end{code} 66 67 \begin{code} 68 69 -- | Like 'contigous', but returns the same hash for a word and its reverse complement. 70 {-# SPECIALIZE rcontig :: Int -> HashF Int #-} 71 rcontig :: Integral k => Int -> HashF k 72 rcontig k' = let 73 k = fromIntegral k' 74 75 c_key s i | k+i > B.length s = Nothing 76 | B.find isN s' /= Nothing = Nothing 77 | otherwise = Just $ min (n2k k' s') (n2k k' rs') 78 where s' = B.take k $ B.drop i s 79 rs' = B.reverse $ B.map complement s' 80 81 c_keys s = c_keys' 0 s 82 c_keys' c s | k > B.length s = [] 83 | otherwise = case B.findIndex isN w0 of 84 Nothing -> let c1 = n2k (fromIntegral k) w0 85 c2 = n2k (fromIntegral k) w0r 86 in (min c1 c2,c) : c_keys'' (c1,c2) c (B.drop k s) 87 Just i -> c_keys' (c+i+1) (B.drop (i+1) s) 88 where w0 = B.take k s 89 w0r = B.reverse $ B.map complement w0 90 91 c_keys'' (c1,c2) p s | B.null s = [] 92 | isN (B.head s) = c_keys' (p+k+1) (B.tail s) 93 | otherwise = let n1 = (c1 `mod` 4^(k-1))*4+val (B.head s) 94 n2 = c2 `div` 4+4^(k-1)*(val . complement) (B.head s) 95 in (min n1 n2,p+1) : c_keys'' (n1,n2) (p+1) (B.tail s) 96 97 in HF { hash = c_key 98 , hashes = c_keys 99 , ksort = Data.List.sort 100 } 101 102 \end{code} 103 104 \section{454-hashes} 105 106 454 sequencing results in relatively few substitution errors, but has problems 107 with mulitplicity of mononucleotides. One option is to ignore multiplicity, and 108 simply hash nucleotide changes. 109 110 Note that genkeys won't work on this one. 111 112 \begin{code} 113 114 -- what about Ns? 115 compact :: SeqData -> [SeqData] 116 compact bs | B.null bs = [] 117 | otherwise = let (h,t) = B.break (/=B.head bs) bs in h : compact t 118 119 -- | Like @rcontig@, but ignoring monomers (i.e. arbitrarily long runs of a single nucelotide 120 -- are treated the same a single nucleotide. 121 rcpacked :: Integral k => Int -> HashF k 122 rcpacked k' = let 123 k = fromIntegral k' 124 125 c_key s i | k > B.length s' = Nothing 126 | B.any isN s' = Nothing 127 | otherwise = Just $ min (n2k k' s') (n2k k' rs') 128 where s1 = take k' $ compact $ B.drop i s 129 s' = fromStr $ map B.head s1 130 rs' = B.reverse $ B.map complement s' 131 132 c_keys = c_keys' 0 . compact -- :: Integral k => SeqData -> [(k,Offset)] 133 -- c_keys' :: Offset -> [SeqData] -> [(k,Offset)] 134 c_keys' i ss | k' > length ss = [] 135 | any isN $ map B.head $ take k' ss = c_keys' (i+(B.length . head) ss) (tail ss) 136 | otherwise = let s' = fromStr $ map B.head $ take k' ss 137 key1 = n2k k' s' 138 key2 = n2k k' (B.reverse $ B.map complement s') 139 in (min key1 key2,i) : c_keys'' (key1,key2) (i+(B.length . head) ss) (tail ss) 140 141 c_keys'' (c1,c2) i [] = [] 142 c_keys'' (c1,c2) i (s:ss) 143 | isN $ B.head s = c_keys' (i + (sum . map B.length) (s:take k' ss)) ss 144 | otherwise = let s1 = B.head s 145 n1 = (c1 `mod` 4^(k-1))*4+val s1 146 n2 = c2 `div` 4+4^(k-1)*(val . complement) s1 147 in (min n1 n2,i) : c_keys'' (n1,n2) (i+B.length s) ss 148 149 in HF { hash = c_key 150 , hashes = c_keys 151 , ksort = Data.List.sort 152 } 153 154 \end{code} 155 156 \section{Gapped words} 157 158 Shape uses a gapped shape specified via a string 159 Note that this will be challenging to get symmetric, since 160 an 'N' character may appear in the mask in only one direction. 161 162 \begin{code} 163 164 type Shape = String 165 gapped :: Integral k => Shape -> HashF k 166 gapped = error "gapped" 167 168 \end{code} 169 170 \subsection{Key arithmetic} 171 172 A hash is an integer representing packed k-words. 173 174 This is currently limited to the nucleotide alphabet, but could easily(?) 175 be extended to arbitrary alphabets. Ideally, it should also handle 176 wildcards (i.e. inserting a position with multiple hashes) 177 178 \begin{code} 179 180 isN :: Char -> Bool 181 isN = not . flip elem "ACGT" . toUpper 182 183 -- conversion between integers (hashes) and strings 184 -- n2k ignores contents beyond k chars 185 n2k :: Integral k => Int -> SeqData -> k 186 n2k k = n2i' 0 . B.take (fromIntegral k) 187 188 n2i' :: (Num a) => a -> SeqData -> a 189 n2i' acc gs = if B.null gs then acc else n2i' (4*acc + val (B.head gs)) (B.tail gs) 190 191 -- inefficient, but not used much in critical code anyway 192 k2n :: Integral k => Int -> k -> SeqData 193 k2n k = fromStr . k2n' k 194 195 k2n' k i = if k==1 then [unval i] 196 else let (q,r) = i `divMod` (4^(k-1)) in unval q : k2n' (k-1) r 197 198 {- 199 shiftRight, shiftLeft :: Integral k => Int -> Int -> k -> String -> k 200 shiftRight k l pk str = (pk `mod` 4^l)*4^(k-l) + n2k 0 str 201 shiftLeft = error "Keys.shiftLeft not implemented" 202 -} 203 204 val :: (Num t) => Char -> t 205 val g = case toUpper g of {'A' -> 0; 'C' -> 1; 'G' -> 2; 'T' -> 3; 206 _ -> error ("val: illegal character" ++ show g)} 207 208 unval :: (Num a) => a -> Char 209 unval i = case i of {0 -> 'A'; 1 -> 'C'; 2 -> 'G'; 3 -> 'T'; 210 _ -> error ("unval: illegal value "++show i)} 211 212 complement :: Char -> Char 213 complement g = case g of {'A' -> 'T'; 'T' -> 'A'; 'a' -> 't'; 't' -> 'A'; 214 'C' -> 'G'; 'G' -> 'C'; 'c' -> 'g'; 'g' -> 'c'; 215 _ -> error ("Can't complement "++show g)} 216 217 \end{code}