1  |
    2    This module contains a collection of functions for calculate hash values from nucleotide words.
    3 
    4    (Actually, it mostly packs nucleotide words from into integers, so it's a one-to-one relationship.)
    5    It is intended to be used for indexing large sequence collections.
    6 
    7 \begin{code}
    8 module Bio.Sequence.HashWord where
    9 
   10 import Bio.Sequence.SeqData
   11 import Data.List (sort)
   12 import Data.Char (toUpper)
   13 import Data.Int
   14 
   15 import qualified Data.ByteString.Lazy.Char8 as B
   16 
   17 -- | This is a struct for containing a set of hashing functions
   18 data HashF k = HF { hash   :: SeqData -> Offset -> Maybe k -- ^ calculates the hash at a given offset in the sequence
   19                  , hashes  :: SeqData -> [(k,Offset)]      -- ^ calculate all hashes from a sequence, and their indices
   20                  , ksort :: [k] -> [k]                     -- ^ for sorting hashes
   21                  }
   22 
   23 -- | Adds a default "hashes" function to a @HashF@, when "hash" is defined.
   24 genkeys :: HashF k -> HashF k
   25 genkeys kf = kf { hashes = gkeys }
   26     where gkeys s = mkkeys (hash kf $ s)  [0..B.length s-1]
   27           mkkeys f (p:ps) = case f p of Nothing -> mkkeys f ps
   28                                         Just k -> (k,p) : mkkeys f ps
   29           mkkeys _ [] = []
   30 
   31 \end{code}
   32 
   33 \subsection{Generating hashes}
   34 
   35 Contigous hashes.
   36 
   37 \begin{code}
   38 
   39 -- | Contigous constructs an int\/eger from a contigous k-word.
   40 contigous :: Integral k => Int -> HashF k
   41 contigous k' = let 
   42    k = fromIntegral k'
   43 
   44    c_key s i | k+i > B.length s = Nothing 
   45              | otherwise = let s' = B.take k $ B.drop i s 
   46                            in if B.find isN s' == Nothing then Just (n2k k' s')
   47                               else Nothing
   48    c_keys s = c_keys' 0 s
   49    c_keys' c s | k > B.length s = []
   50                | otherwise = case B.findIndex isN w0 of 
   51                                Nothing -> let cur = n2k (fromIntegral k) s
   52                                           in (cur,c) : c_keys'' cur c (B.drop k s)
   53                                Just i  -> c_keys' (c+i+1) (B.drop (i+1) s)
   54             where w0 = B.take k s
   55 
   56    c_keys'' cur p s | B.null s = []
   57                     | isN (B.head s) = c_keys' (p+k+1) (B.tail s)
   58                     | otherwise = let new = (cur `mod` 4^(k-1))*4+val (B.head s)
   59                                   in (new,p+1) : c_keys'' new (p+1) (B.tail s)
   60 
   61               in HF { hash = c_key
   62                     , hashes  = c_keys
   63                     , ksort = Data.List.sort
   64                     }
   65 \end{code}
   66 
   67 \begin{code}
   68 
   69 -- | Like 'contigous', but returns the same hash for a word and its reverse complement.
   70 {-# SPECIALIZE rcontig :: Int -> HashF Int #-}
   71 rcontig :: Integral k => Int -> HashF k
   72 rcontig k' = let
   73    k = fromIntegral k'
   74 
   75    c_key s i | k+i > B.length s = Nothing 
   76              | B.find isN s' /= Nothing = Nothing
   77              | otherwise = Just $ min (n2k k' s') (n2k k' rs')
   78              where s' = B.take k $ B.drop i s
   79                    rs' = B.reverse $ B.map complement s'
   80 
   81    c_keys s = c_keys' 0 s
   82    c_keys' c s | k > B.length s = []
   83                | otherwise = case B.findIndex isN w0 of 
   84                                Nothing -> let c1 = n2k (fromIntegral k) w0
   85                                               c2 = n2k (fromIntegral k) w0r
   86                                           in (min c1 c2,c) : c_keys'' (c1,c2) c (B.drop k s)
   87                                Just i  -> c_keys' (c+i+1) (B.drop (i+1) s)
   88             where w0  = B.take k s
   89                   w0r = B.reverse $ B.map complement w0
   90 
   91    c_keys'' (c1,c2) p s | B.null s = []
   92                         | isN (B.head s) = c_keys' (p+k+1) (B.tail s)
   93                         | otherwise = let n1 = (c1 `mod` 4^(k-1))*4+val (B.head s)
   94                                           n2 = c2 `div` 4+4^(k-1)*(val . complement) (B.head s)
   95                                       in (min n1 n2,p+1) : c_keys'' (n1,n2) (p+1) (B.tail s)
   96 
   97               in HF { hash = c_key
   98                     , hashes  = c_keys
   99                     , ksort = Data.List.sort
  100                     }
  101 
  102 \end{code}
  103 
  104 \section{454-hashes}
  105 
  106 454 sequencing results in relatively few substitution errors, but has problems
  107 with mulitplicity of mononucleotides.  One option is to ignore multiplicity, and
  108 simply hash nucleotide changes.
  109 
  110 Note that genkeys won't work on this one.
  111 
  112 \begin{code}
  113 
  114 -- what about Ns?
  115 compact :: SeqData -> [SeqData]
  116 compact bs | B.null bs = []
  117            | otherwise = let (h,t) = B.break (/=B.head bs) bs in h : compact t
  118 
  119 -- | Like @rcontig@, but ignoring monomers (i.e. arbitrarily long runs of a single nucelotide
  120 --   are treated the same a single nucleotide.
  121 rcpacked :: Integral k => Int -> HashF k
  122 rcpacked k' = let
  123     k = fromIntegral k'
  124 
  125     c_key s i | k > B.length s' = Nothing
  126               | B.any isN s'    = Nothing
  127               | otherwise = Just $ min (n2k k' s') (n2k k' rs')
  128               where s1 = take k' $ compact $ B.drop i s
  129                     s' = fromStr $ map B.head s1
  130                     rs' = B.reverse $ B.map complement s'
  131 
  132     c_keys = c_keys' 0 . compact -- :: Integral k => SeqData -> [(k,Offset)]
  133 --    c_keys' :: Offset -> [SeqData] -> [(k,Offset)]
  134     c_keys' i ss | k' > length ss = []
  135                  | any isN $ map B.head $ take k' ss = c_keys' (i+(B.length . head) ss) (tail ss)
  136                  | otherwise = let s' = fromStr $ map B.head $ take k' ss
  137                                    key1 = n2k k' s'
  138                                    key2 = n2k k' (B.reverse $ B.map complement s')
  139                                in  (min key1 key2,i) : c_keys'' (key1,key2) (i+(B.length . head) ss) (tail ss)
  140 
  141     c_keys'' (c1,c2) i [] = []
  142     c_keys'' (c1,c2) i (s:ss) 
  143         | isN $ B.head s = c_keys' (i + (sum . map B.length) (s:take k' ss)) ss
  144         | otherwise = let s1 = B.head s
  145                           n1 = (c1 `mod` 4^(k-1))*4+val s1
  146                           n2 = c2 `div` 4+4^(k-1)*(val . complement) s1
  147                       in (min n1 n2,i) : c_keys'' (n1,n2) (i+B.length s) ss
  148 
  149              in HF { hash = c_key 
  150                    , hashes = c_keys
  151                    , ksort = Data.List.sort
  152                    }
  153 
  154 \end{code}
  155 
  156 \section{Gapped words}
  157 
  158 Shape uses a gapped shape specified via a string
  159 Note that this will be challenging to get symmetric, since
  160 an 'N' character may appear in the mask in only one direction.
  161 
  162 \begin{code}
  163 
  164 type Shape = String
  165 gapped :: Integral k => Shape -> HashF k
  166 gapped = error "gapped"
  167 
  168 \end{code}
  169 
  170 \subsection{Key arithmetic}
  171 
  172 A hash is an integer representing packed k-words.
  173 
  174 This is currently limited to the nucleotide alphabet, but could easily(?)
  175 be extended to arbitrary alphabets.  Ideally, it should also handle
  176 wildcards (i.e. inserting a position with multiple hashes)
  177 
  178 \begin{code}
  179 
  180 isN :: Char -> Bool
  181 isN = not . flip elem "ACGT" . toUpper
  182 
  183 -- conversion between integers (hashes) and strings
  184 -- n2k ignores contents beyond k chars
  185 n2k :: Integral k => Int -> SeqData -> k
  186 n2k k = n2i' 0 . B.take (fromIntegral k)
  187 
  188 n2i' :: (Num a) => a -> SeqData -> a
  189 n2i' acc gs = if B.null gs then acc else n2i' (4*acc + val (B.head gs)) (B.tail gs)
  190 
  191 -- inefficient, but not used much in critical code anyway
  192 k2n :: Integral k => Int -> k -> SeqData
  193 k2n k = fromStr . k2n' k
  194 
  195 k2n' k i = if k==1 then [unval i] 
  196            else let (q,r) = i `divMod` (4^(k-1)) in unval q : k2n' (k-1) r
  197 
  198 {-
  199 shiftRight, shiftLeft :: Integral k => Int -> Int -> k -> String -> k
  200 shiftRight k l pk str = (pk `mod` 4^l)*4^(k-l) + n2k 0 str
  201 shiftLeft = error "Keys.shiftLeft not implemented"
  202 -}
  203 
  204 val :: (Num t) => Char -> t
  205 val g = case toUpper g of {'A' -> 0; 'C' -> 1; 'G' -> 2; 'T' -> 3; 
  206                            _ -> error ("val: illegal character" ++ show g)}
  207 
  208 unval :: (Num a) => a -> Char
  209 unval i = case i of {0 -> 'A'; 1 -> 'C'; 2 -> 'G'; 3 -> 'T'; 
  210                           _ -> error ("unval: illegal value "++show i)}
  211 
  212 complement :: Char -> Char
  213 complement g = case g of {'A' -> 'T'; 'T' -> 'A'; 'a' -> 't'; 't' -> 'A';
  214                           'C' -> 'G'; 'G' -> 'C'; 'c' -> 'g'; 'g' -> 'c'; 
  215                           _ -> error ("Can't complement "++show g)}
  216 
  217 \end{code}