1 {- | 2 This module implements the 2bit format for sequences. 3 4 Based on: <http://genome.ucsc.edu/FAQ/FAQformat#format7> 5 Note! the description is not accurate, it is missing a reserved word 6 in each sequence record. 7 8 There are also other, completely different ideas of the 2bit format, e.g. 9 <http://jcomeau.freeshell.org/www/genome/2bitformat.html> 10 -} 11 12 13 module Bio.Sequence.TwoBit 14 ( decode2Bit, read2Bit, hRead2Bit 15 ) where 16 17 import Bio.Sequence.SeqData 18 import qualified Data.ByteString.Lazy as BB 19 import qualified Data.ByteString.Lazy.Char8 as B 20 21 import System.IO 22 import Control.Monad 23 24 import Data.Char 25 import Data.Binary 26 import Data.Int 27 import Data.List 28 import Data.Bits 29 30 import Test.QuickCheck hiding (check) -- QC 1.0 31 -- import Test.QuickCheck hiding ((.&.)) -- QC 2.0 32 33 type ByteString = B.ByteString 34 35 -- constants 36 default_magic, default_version :: Word32 37 default_magic = 0x1A412743 38 default_version = 0 39 40 -- binary extras 41 check :: Monad m => (a -> Bool) -> a -> m a 42 check p x = if (p x) then return x else fail "check failed" 43 44 bswap :: Integral a => Int -> a -> a 45 bswap n x = let s = bytes x in unbytes . reverse $ s ++ replicate (n-length s) 0 46 47 bytes :: Integral a => a -> [Word8] 48 bytes = Data.List.unfoldr (\w -> let (q,r) = quotRem w 256 49 in if q == 0 && r == 0 then Nothing 50 else Just (fromIntegral r,q)) 51 unbytes :: Integral a => [Word8] -> a 52 unbytes = Data.List.foldr (\x y -> y*256+x) 0 . map fromIntegral 53 54 instance Arbitrary Word8 where 55 arbitrary = choose (0,255::Int) >>= return . fromIntegral 56 57 prop_bswap :: Word8 -> Word8 -> Word8 -> Word8 -> Bool 58 prop_bswap a1 a2 a3 a4 = (bswap 4 . decode . BB.pack) [a1,a2,a3,a4] == ((decode . BB.pack) [a4,a3,a2,a1] :: Word32) 59 60 -- basic data types 61 data Header = Header { swap :: Bool, version, count, reserved :: Word32 } 62 instance Show Header where show (Header _ v c r) = "H "++show (v,c,r) 63 64 instance Binary Header where 65 get = do 66 m <- get 67 v <- get >>= check (==default_version) 68 c <- get 69 r <- get 70 let s = if m==default_magic then id else if m==bswap 4 default_magic then bswap 4 71 else error "2bit decode: incorrect magic number" 72 return (Header (m/=default_magic) (s v) (s c) (s r)) 73 put (Header m v c r) = do 74 put default_magic 75 put default_version 76 put c 77 put (0 :: Word32) 78 79 data Entry = Entry { name :: ByteString, offset :: Word32 } 80 deriving Show 81 82 swapEntry e = e { offset = bswap 4 (offset e) } 83 84 instance Binary Entry where 85 get = do 86 len <- getWord8 87 name <- replicateM (fromIntegral len) getWord8 88 offset <- get 89 return (Entry (BB.pack name) offset) 90 put (Entry bs offset) = do 91 let l = fromIntegral $ B.length bs :: Word8 92 put l 93 mapM_ put $ BB.unpack bs 94 put offset 95 96 data Entries = Entries Header [Entry] 97 instance Show Entries where show (Entries h es) = unlines (show h : map show es) 98 99 instance Binary Entries where 100 get = do 101 h <- get 102 es <- replicateM (fromIntegral $ count h) get 103 return (Entries h $ if swap h then map swapEntry es else es) 104 105 {- 106 dnaSize - number of bases of DNA in the sequence 107 nBlockCount - the number of blocks of Ns in the file (representing unknown sequence) 108 nBlockStarts - the starting position for each block of Ns 109 nBlockSizes - the size of each block of Ns 110 maskBlockCount - the number of masked (lower-case) blocks 111 maskBlockStarts - the starting position for each masked block 112 maskBlockSizes - the size of each masked block 113 packedDna - the DNA packed to two bits per base 114 -} 115 116 data SR = SR { dnaSize, nBlockCount :: Word32 117 , nBlockStarts, nBlockSizes :: [Word32] 118 , maskBlockCount :: Word32 119 , maskBlockStarts, maskBlockSizes :: [Word32] 120 , packedDna :: [Word8] 121 , reserved2 :: Word32 } 122 deriving Show 123 124 -- big- and little-endian variants (what a mess) 125 newtype SRBE = SRBE SR deriving Show 126 newtype SRLE = SRLE SR deriving Show 127 128 instance Binary SRBE where 129 get = undefined 130 131 instance Binary SRLE where 132 get = do 133 dz <- get :: Get Word32 134 nc <- get :: Get Word32 135 let n = fromIntegral $ bswap 4 nc 136 nbs <- replicateM n get 137 nbsz <- replicateM n get 138 mc <- get :: Get Word32 139 let m = fromIntegral $ bswap 4 mc 140 mbs <- replicateM m get 141 mbsz <- replicateM m get 142 _reserved <- get :: Get Word32 -- !!!! oops? 143 let d = fromIntegral $ bswap 4 dz 144 pdna <- replicateM ((d+3) `div` 4) get 145 return (SRLE $ SR (bswap 4 dz) (bswap 4 nc) 146 (map (bswap 4) nbs) (map (bswap 4) nbsz) 147 (bswap 4 mc) (map (bswap 4) mbs) (map (bswap 4) mbsz) pdna (bswap 4 _reserved)) 148 -- should this happen? Why not just write default format? 149 put (SRLE sr) = do 150 put (bswap 4 $ dnaSize sr) 151 put (bswap 4 $ nBlockCount sr) 152 mapM_ (put . bswap 4) (nBlockStarts sr) 153 mapM_ (put . bswap 4) (nBlockSizes sr) 154 put (bswap 4 $ maskBlockCount sr) 155 mapM_ (put . bswap 4) (maskBlockStarts sr) 156 mapM_ (put . bswap 4) (maskBlockSizes sr) 157 put (0::Word32) 158 mapM_ put (packedDna sr) 159 160 -- used to convert to/from sequence data in the Sequence data structure 161 fromSR :: SR -> ByteString 162 fromSR sr = B.unfoldr go (0,low,ns,take (fromIntegral $ dnaSize sr) dna) 163 where 164 low = combine (maskBlockStarts sr) (maskBlockSizes sr) 165 ns = combine (nBlockStarts sr) (nBlockSizes sr) 166 combine starts lengths = concatMap (\(p,l) -> [p..p+l-1]) $ zip starts lengths 167 168 dna = decodeDNA $ packedDna sr 169 decodeDNA = concatMap (\x -> [shiftR (x .&. 0xC0) 6, shiftR (x .&. 0x30) 4, shiftR (x .&. 0x0C) 2, x .&. 0x03]) 170 dec1 x = case x of 0 -> 'T'; 1 -> 'C'; 2 -> 'A'; 3 -> 'G'; _ -> error ("can't decode value "++show x) 171 172 go (_,_,_,[]) = Nothing 173 go (pos,(l:ls),(n:ns),(d:ds)) 174 | pos == l && pos == n = Just ('n',(pos+1,ls,ns,ds)) 175 | pos == l = Just (toLower (dec1 d),(pos+1,ls,n:ns,ds)) 176 | pos == n = Just ('N',(pos+1,l:ls,ns,ds)) 177 | otherwise = Just (dec1 d, (pos+1,l:ls,n:ns,ds)) 178 go (pos,[],n:ns,d:ds) 179 | pos == n = Just ('N',(pos+1,[],ns,ds)) 180 | otherwise = Just (dec1 d, (pos+1,[],n:ns,ds)) 181 go (pos,l:ls,[],d:ds) 182 | pos == l = Just (toLower (dec1 d),(pos+1,ls,[],ds)) 183 | otherwise = Just (dec1 d, (pos+1,l:ls,[],ds)) 184 go (pos,[],[],d:ds) = Just (dec1 d, (pos+1,[],[],ds)) 185 -- go x = error (show x) 186 187 -- | Parse a (lazy) ByteString as sequences in the 2bit format. 188 decode2Bit :: B.ByteString -> [Sequence] 189 decode2Bit cs = let (Entries h es) = decode cs :: Entries 190 ms = map (fromIntegral . offset) es 191 (c:chunks) = zipWith (-) ms (0:ms) 192 ss = splits chunks $ B.drop c cs 193 in map ($ Nothing) $ zipWith Seq (map name es) 194 $ map fromSR $ case swap h of 195 True -> map (unSRLE.decode) ss 196 False -> map (unSRBE.decode) ss 197 198 splits [] cs = [cs] 199 splits (e:es) cs = let (this,rest) = B.splitAt e cs 200 in this : splits es rest 201 202 unSRBE (SRBE x) = x 203 unSRLE (SRLE x) = x 204 205 toSR :: ByteString -> SR 206 toSR bs = undefined 207 208 -- | Extract sequences from a file in 2bit format. 209 read2Bit :: FilePath -> IO [Sequence] 210 read2Bit f = B.readFile f >>= return . decode2Bit 211 212 -- | Extract sequences in the 2bit format from a handle. 213 hRead2Bit :: Handle -> IO [Sequence] 214 hRead2Bit h = B.hGetContents h >>= return . decode2Bit 215 216 -- | Write sequences to file in the 2bit format. 217 write2Bit :: FilePath -> [Sequence] -> IO () 218 write2Bit = undefined 219 220 -- | Write sequences to a handle in the 2bit format. 221 hWrite2Bit :: Handle -> [Sequence] -> IO () 222 hWrite2Bit = undefined