
module BloomFilter (BloomFilter, mkBloomFilter, matches, matchcount
                   , k, p, skipAmount, genfilter) where

import Bio.Sequence
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.ByteString.Lazy as BW

-- use real bloom filters
import qualified Data.BloomFilter as BF
import qualified Data.BloomFilter.Hash as BFH
import qualified Data.BloomFilter.Easy as BFE

import System.IO

k, skipAmount :: Num a => a
k = 30 -- word size
skipAmount = 6

p :: Double
p = 0.005

skip :: [a] -> [a]
skip [] = []
skip (x:xs) = x : skip (drop (skipAmount-1) xs)

toUpper c | c >= 97 && c <= 122 = c - 32
          | otherwise           = c

swords :: Sequence -> [SeqData]
swords s = take (fromIntegral (seqlength s)+1-k) . map (B.take k) . B.tails . BW.map toUpper $ seqdata s -- extract words

type BloomFilter = BFE.Bloom SeqData
-- fromList = BFE.easyList p . skip

genfilter :: FilePath -> IO BloomFilter
genfilter f = do
  h <- openFile f ReadMode
  size <- hFileSize h
  hClose h
  return . mkBloomFilter size =<< readFasta f

-- rip from Data.BloomFilter.Easy:
fromList :: Integer -> [SeqData] -> BloomFilter
fromList capacity = BF.fromListB (BFH.cheapHashes numHashes) numBits . skip 
    where (numBits, numHashes)
              | capacity > 0 = BFE.suggestSizing (fromIntegral capacity `div` skipAmount) p
              | otherwise    = (1, 1)

member :: SeqData -> BloomFilter -> Bool
member = BFE.elemB

-- | Generate a filter for a set of sequences.
mkBloomFilter :: Integer -> [Sequence] -> BloomFilter
mkBloomFilter size = fromList size . concatMap swords

-- | Count the number of words from the sequence that are found in the bloomfilter.
--   NB! use both directions!
matches :: BloomFilter -> Sequence -> [Bool]
matches bf s = let fwd = map (`member` bf) (swords s)
                   rev = map (`member` bf) (swords (revcompl s))
               in zipWith (||) fwd (reverse rev)

matchcount :: BloomFilter -> Sequence -> Int
matchcount bf s = fromIntegral $ count 0 $ matches bf $ s
    where 
      -- don't count overlapping hits (a bit crude, but works)
      count :: Int -> [Bool] -> Int
      count a (True:xs) = a `seq` count (a+1) (drop (skipAmount-1) xs)
      count a (False:xs) = count a xs
      count a [] = a
