1 {- |
    2    This module implements the 2bit format for sequences.
    3 
    4    Based on: <http://genome.ucsc.edu/FAQ/FAQformat#format7>
    5    Note! the description is not accurate, it is missing a reserved word
    6          in each sequence record.
    7 
    8    There are also other, completely different ideas of the 2bit format, e.g.
    9       <http://jcomeau.freeshell.org/www/genome/2bitformat.html>
   10 -}
   11 
   12 
   13 module Bio.Sequence.TwoBit
   14    ( decode2Bit, read2Bit, hRead2Bit
   15    ) where
   16 
   17 import Bio.Sequence.SeqData
   18 import qualified Data.ByteString.Lazy as BB
   19 import qualified Data.ByteString.Lazy.Char8 as B
   20 
   21 import System.IO
   22 import Control.Monad
   23 
   24 import Data.Char
   25 import Data.Binary
   26 import Data.Int
   27 import Data.List
   28 import Data.Bits
   29 
   30 import Test.QuickCheck hiding (check)    -- QC 1.0
   31 -- import Test.QuickCheck hiding ((.&.)) -- QC 2.0
   32 
   33 type ByteString = B.ByteString
   34 
   35 -- constants
   36 default_magic, default_version :: Word32
   37 default_magic   = 0x1A412743
   38 default_version = 0
   39 
   40 -- binary extras
   41 check :: Monad m => (a -> Bool) ->  a -> m a
   42 check p x = if (p x) then return x else fail "check failed"
   43 
   44 bswap :: Integral a => Int -> a -> a
   45 bswap n x = let s = bytes x in unbytes . reverse $ s ++ replicate (n-length s) 0
   46 
   47 bytes :: Integral a => a -> [Word8]
   48 bytes = Data.List.unfoldr (\w -> let (q,r) = quotRem w 256
   49                                  in if q == 0 && r == 0 then Nothing
   50                                     else Just (fromIntegral r,q))
   51 unbytes :: Integral a => [Word8] -> a
   52 unbytes = Data.List.foldr (\x y -> y*256+x) 0 . map fromIntegral
   53 
   54 instance Arbitrary Word8 where
   55     arbitrary = choose (0,255::Int) >>= return . fromIntegral
   56 
   57 prop_bswap :: Word8 -> Word8 -> Word8 -> Word8 -> Bool
   58 prop_bswap a1 a2 a3 a4 = (bswap 4 . decode . BB.pack) [a1,a2,a3,a4] == ((decode . BB.pack) [a4,a3,a2,a1] :: Word32)
   59 
   60 -- basic data types
   61 data Header = Header { swap :: Bool,  version, count, reserved :: Word32 }
   62 instance Show Header where show (Header _ v c r) = "H "++show (v,c,r)
   63 
   64 instance Binary Header where
   65     get = do
   66       m <- get
   67       v <- get >>= check (==default_version)
   68       c <- get
   69       r <- get
   70       let s = if m==default_magic then id else if m==bswap 4 default_magic then bswap 4
   71                                                else error "2bit decode: incorrect magic number"
   72       return (Header (m/=default_magic) (s v) (s c) (s r))
   73     put (Header m v c r) = do
   74       put default_magic
   75       put default_version
   76       put c
   77       put (0 :: Word32)
   78 
   79 data Entry  = Entry  { name :: ByteString, offset :: Word32 }
   80               deriving Show
   81 
   82 swapEntry e = e { offset = bswap 4 (offset e) }
   83 
   84 instance Binary Entry where
   85     get = do
   86       len <- getWord8
   87       name <- replicateM (fromIntegral len) getWord8
   88       offset <- get
   89       return (Entry (BB.pack name) offset)
   90     put (Entry bs offset) = do
   91       let l = fromIntegral $ B.length bs :: Word8
   92       put l
   93       mapM_ put $ BB.unpack bs
   94       put offset
   95 
   96 data Entries = Entries Header [Entry]
   97 instance Show Entries where show (Entries h es) = unlines (show h : map show es)
   98 
   99 instance Binary Entries where
  100     get = do
  101       h <- get
  102       es <- replicateM (fromIntegral $ count h) get
  103       return (Entries h $ if swap h then map swapEntry es else es)
  104 
  105 {-
  106  dnaSize - number of bases of DNA in the sequence
  107  nBlockCount - the number of blocks of Ns in the file (representing unknown sequence)
  108  nBlockStarts - the starting position for each block of Ns
  109  nBlockSizes - the size of each block of Ns
  110  maskBlockCount - the number of masked (lower-case) blocks
  111  maskBlockStarts - the starting position for each masked block
  112  maskBlockSizes - the size of each masked block
  113  packedDna - the DNA packed to two bits per base
  114 -}
  115 
  116 data SR = SR { dnaSize, nBlockCount :: Word32
  117                          , nBlockStarts, nBlockSizes :: [Word32]
  118                          , maskBlockCount :: Word32
  119                          , maskBlockStarts, maskBlockSizes :: [Word32]
  120                          , packedDna :: [Word8]
  121                          , reserved2 :: Word32 }
  122         deriving Show
  123 
  124 -- big- and little-endian variants (what a mess)
  125 newtype SRBE = SRBE SR deriving Show
  126 newtype SRLE = SRLE SR deriving Show
  127 
  128 instance Binary SRBE where
  129     get = undefined
  130 
  131 instance Binary SRLE where
  132     get = do
  133       dz <- get :: Get Word32
  134       nc <- get :: Get Word32
  135       let n = fromIntegral $ bswap 4 nc
  136       nbs <- replicateM n get
  137       nbsz <- replicateM n get
  138       mc <- get :: Get Word32
  139       let m = fromIntegral $ bswap 4 mc
  140       mbs <- replicateM m get
  141       mbsz <- replicateM m get
  142       _reserved <- get :: Get Word32 -- !!!! oops?
  143       let d = fromIntegral $ bswap 4 dz
  144       pdna <- replicateM ((d+3) `div` 4)  get
  145       return (SRLE $ SR (bswap 4 dz) (bswap 4 nc)
  146               (map (bswap 4) nbs) (map (bswap 4) nbsz)
  147               (bswap 4 mc) (map (bswap 4) mbs) (map (bswap 4) mbsz) pdna (bswap 4 _reserved))
  148     -- should this happen?  Why not just write default format?
  149     put (SRLE sr) = do
  150       put (bswap 4 $ dnaSize sr)
  151       put (bswap 4 $ nBlockCount sr)
  152       mapM_ (put . bswap 4) (nBlockStarts sr)
  153       mapM_ (put . bswap 4) (nBlockSizes sr)
  154       put (bswap 4 $ maskBlockCount sr)
  155       mapM_ (put . bswap 4) (maskBlockStarts sr)
  156       mapM_ (put . bswap 4) (maskBlockSizes sr)
  157       put (0::Word32)
  158       mapM_ put (packedDna sr)
  159 
  160 -- used to convert to/from sequence data in the Sequence data structure
  161 fromSR :: SR -> ByteString
  162 fromSR sr = B.unfoldr go (0,low,ns,take (fromIntegral $ dnaSize sr) dna)
  163     where
  164     low = combine (maskBlockStarts sr) (maskBlockSizes sr)
  165     ns  = combine (nBlockStarts sr) (nBlockSizes sr)
  166     combine starts lengths = concatMap (\(p,l) -> [p..p+l-1]) $ zip starts lengths
  167 
  168     dna = decodeDNA $ packedDna sr
  169     decodeDNA = concatMap (\x -> [shiftR (x .&. 0xC0) 6, shiftR (x .&. 0x30) 4, shiftR (x .&. 0x0C) 2, x .&. 0x03])
  170     dec1 x = case x of 0 -> 'T'; 1 -> 'C'; 2 -> 'A'; 3 -> 'G'; _ -> error ("can't decode value "++show x)
  171 
  172     go (_,_,_,[]) = Nothing
  173     go (pos,(l:ls),(n:ns),(d:ds))
  174         | pos == l && pos == n = Just ('n',(pos+1,ls,ns,ds))
  175         | pos == l             = Just (toLower (dec1 d),(pos+1,ls,n:ns,ds))
  176         |             pos == n = Just ('N',(pos+1,l:ls,ns,ds))
  177         | otherwise            = Just (dec1 d, (pos+1,l:ls,n:ns,ds))
  178     go (pos,[],n:ns,d:ds)
  179         |             pos == n = Just ('N',(pos+1,[],ns,ds))
  180         | otherwise            = Just (dec1 d, (pos+1,[],n:ns,ds))
  181     go (pos,l:ls,[],d:ds)
  182         | pos == l             = Just (toLower (dec1 d),(pos+1,ls,[],ds))
  183         | otherwise            = Just (dec1 d, (pos+1,l:ls,[],ds))
  184     go (pos,[],[],d:ds)        = Just (dec1 d, (pos+1,[],[],ds))
  185 --    go x = error (show x)
  186 
  187 -- | Parse a (lazy) ByteString as sequences in the 2bit format.
  188 decode2Bit :: B.ByteString -> [Sequence]
  189 decode2Bit cs = let (Entries h es) = decode cs :: Entries
  190                     ms = map (fromIntegral . offset) es
  191                     (c:chunks) = zipWith (-) ms (0:ms)
  192                     ss = splits chunks $ B.drop c cs
  193                in map ($ Nothing) $ zipWith Seq (map name es)
  194                       $ map fromSR $ case swap h of
  195                                        True -> map (unSRLE.decode) ss
  196                                        False -> map (unSRBE.decode) ss
  197 
  198 splits [] cs = [cs]
  199 splits (e:es) cs = let (this,rest) = B.splitAt e cs
  200                    in this : splits es rest
  201 
  202 unSRBE (SRBE x) = x
  203 unSRLE (SRLE x) = x
  204 
  205 toSR :: ByteString -> SR
  206 toSR bs = undefined
  207 
  208 -- | Extract sequences from a file in 2bit format.
  209 read2Bit  :: FilePath -> IO [Sequence]
  210 read2Bit f = B.readFile f >>= return . decode2Bit
  211 
  212 -- | Extract sequences in the 2bit format from a handle.
  213 hRead2Bit :: Handle   -> IO [Sequence]
  214 hRead2Bit h = B.hGetContents h >>= return . decode2Bit
  215 
  216 -- | Write sequences to file in the 2bit format.
  217 write2Bit  :: FilePath -> [Sequence] -> IO ()
  218 write2Bit = undefined
  219 
  220 -- | Write sequences to a handle in the 2bit format.
  221 hWrite2Bit :: Handle   -> [Sequence] -> IO ()
  222 hWrite2Bit = undefined