module Bio.Sequence.TwoBit
( decode2Bit,
read2Bit,
hRead2Bit,
encode2Bit,
write2Bit,
hWrite2Bit
) where
import Bio.Sequence.SeqData
import qualified Data.ByteString.Lazy as BB
import qualified Data.ByteString.Lazy.Char8 as B
import System.IO
import Control.Monad
import Data.Char
import Data.Binary
import Data.Int
import Data.List
import Data.Bits
import Test.QuickCheck hiding ((.&.))
default_magic, default_version :: Word32
default_magic = 0x1A412743
default_version = 0
check :: Monad m => (a -> Bool) -> a -> m a
check p x = if (p x) then return x else fail "check failed"
bswap :: Integral a => Int -> a -> a
bswap n x = let s = bytes x in unbytes . reverse $ s ++ replicate (nlength s) 0
bytes :: Integral a => a -> [Word8]
bytes = Data.List.unfoldr (\w -> let (q,r) = quotRem w 256
in if q == 0 && r == 0 then Nothing
else Just (fromIntegral r,q))
unbytes :: Integral a => [Word8] -> a
unbytes = Data.List.foldr (\x y -> y*256+x) 0 . map fromIntegral
data Header = Header { swap :: Bool, version, count, reserved :: Word32 }
instance Show Header where show (Header _ v c r) = "H "++show (v,c,r)
instance Binary Header where
get = do
m <- get
v <- get >>= check (==default_version)
c <- get
r <- get
let s = if m == default_magic
then id
else if m == bswap 4 default_magic
then bswap 4
else error "2bit decode: incorrect magic number"
return (Header (m /= default_magic) (s v) (s c) (s r))
put (Header m v c r) = do
put default_magic
put default_version
put c
put (0 :: Word32)
data Entry = Entry { name :: B.ByteString, offset :: Word32 }
deriving Show
swapEntry :: Entry -> Entry
swapEntry entry = entry { offset = bswap 4 (offset entry) }
instance Binary Entry where
get = do
len <- getWord8
name <- replicateM (fromIntegral len) getWord8
offset <- get
return (Entry (BB.pack name) offset)
put (Entry byteString offset) = do
let len = fromIntegral $ B.length byteString :: Word8
put len
mapM_ put $ BB.unpack byteString
put offset
data Entries = Entries Header [Entry]
instance Show Entries
where show (Entries h es) = unlines (show h : map show es)
instance Binary Entries where
get = do
h <- get
es <- replicateM (fromIntegral $ count h) get
return (Entries h $ if swap h then map swapEntry es else es)
put (Entries h es) = do
put h
put es
data SR = SR { dnaSize :: Word32,
nBlockCount :: Word32,
nBlockStarts :: [Word32],
nBlockSizes :: [Word32],
maskBlockCount :: Word32,
maskBlockStarts :: [Word32],
maskBlockSizes :: [Word32],
packedDna :: [Word8],
reserved2 :: Word32 }
deriving Show
newtype SRBE = SRBE SR deriving Show
newtype SRLE = SRLE SR deriving Show
instance Binary SRBE where
get = do
dz <- get :: Get Word32
nc <- get :: Get Word32
let n = fromIntegral nc
nbs <- replicateM n get
nbsz <- replicateM n get
mc <- get :: Get Word32
let m = fromIntegral mc
mbs <- replicateM m get
mbsz <- replicateM m get
_reserved <- get :: Get Word32
let d = fromIntegral dz
pdna <- replicateM ((d+3) `div` 4) get
return (SRBE $ SR dz nc nbs nbsz
mc mbs mbsz pdna _reserved)
put (SRBE sr) = do
put $ dnaSize sr
put $ nBlockCount sr
mapM_ put (nBlockStarts sr)
mapM_ put (nBlockSizes sr)
put $ maskBlockCount sr
mapM_ put (maskBlockStarts sr)
mapM_ put (maskBlockSizes sr)
put (0::Word32)
mapM_ put (packedDna sr)
instance Binary SRLE where
get = do
dz <- get :: Get Word32
nc <- get :: Get Word32
let n = fromIntegral $ bswap 4 nc
nbs <- replicateM n get
nbsz <- replicateM n get
mc <- get :: Get Word32
let m = fromIntegral $ bswap 4 mc
mbs <- replicateM m get
mbsz <- replicateM m get
_reserved <- get :: Get Word32
let d = fromIntegral $ bswap 4 dz
pdna <- replicateM ((d+3) `div` 4) get
return (SRLE $ SR (bswap 4 dz) (bswap 4 nc)
(map (bswap 4) nbs) (map (bswap 4) nbsz)
(bswap 4 mc) (map (bswap 4) mbs) (map (bswap 4) mbsz) pdna (bswap 4 _reserved))
put (SRLE sr) = do
put (bswap 4 $ dnaSize sr)
put (bswap 4 $ nBlockCount sr)
mapM_ (put . bswap 4) (nBlockStarts sr)
mapM_ (put . bswap 4) (nBlockSizes sr)
put (bswap 4 $ maskBlockCount sr)
mapM_ (put . bswap 4) (maskBlockStarts sr)
mapM_ (put . bswap 4) (maskBlockSizes sr)
put (0::Word32)
mapM_ put (packedDna sr)
fromSR :: SR -> B.ByteString
fromSR sr = B.unfoldr go (0,low,ns,take (fromIntegral $ dnaSize sr) dna)
where
low = combine (maskBlockStarts sr) (maskBlockSizes sr)
ns = combine (nBlockStarts sr) (nBlockSizes sr)
combine :: (Num t, Enum t) => [t] -> [t] -> [t]
combine starts lengths = concatMap (\(p,l) -> [p..p+l1]) $ zip starts lengths
dna = decodeDNA $ packedDna sr
decodeDNA = concatMap (\x -> [shiftR (x .&. 0xC0) 6, shiftR (x .&. 0x30) 4, shiftR (x .&. 0x0C) 2, x .&. 0x03])
dec1 :: (Num t) => t -> Char
dec1 x = case x of
0 -> 'T';
1 -> 'C';
2 -> 'A';
3 -> 'G';
_ -> error ("can't decode value "++show x)
go :: (Num a, Num t) => (a, [a], [a], [t]) -> Maybe (Char, (a, [a], [a], [t]))
go (_,_,_,[]) = Nothing
go (pos,(l:ls),(n:ns),(d:ds))
| pos == l && pos == n = Just ('n',(pos+1,ls,ns,ds))
| pos == l = Just (toLower (dec1 d),(pos+1,ls,n:ns,ds))
| pos == n = Just ('N',(pos+1,l:ls,ns,ds))
| otherwise = Just (dec1 d, (pos+1,l:ls,n:ns,ds))
go (pos,[],n:ns,d:ds)
| pos == n = Just ('N',(pos+1,[],ns,ds))
| otherwise = Just (dec1 d, (pos+1,[],n:ns,ds))
go (pos,l:ls,[],d:ds)
| pos == l = Just (toLower (dec1 d),(pos+1,ls,[],ds))
| otherwise = Just (dec1 d, (pos+1,l:ls,[],ds))
go (pos,[],[],d:ds) = Just (dec1 d, (pos+1,[],[],ds))
toSR :: B.ByteString -> SR
toSR bs = undefined
splits :: [Int64] -> B.ByteString -> [B.ByteString]
splits [] cs = [cs]
splits (e:es) cs = let (this,rest) = B.splitAt e cs
in this : splits es rest
decode2Bit :: B.ByteString -> [Sequence Unknown]
decode2Bit cs = let
(Entries h es) = decode cs :: Entries
ms = map (fromIntegral . offset) es
(c:chunks) = zipWith () ms (0:ms)
ss = splits chunks $ B.drop c cs
in map ($ Nothing) $ zipWith Seq (map name es) $ map fromSR $ case swap h of
True -> map (unSRLE.decode) ss
False -> map (unSRBE.decode) ss
type SequenceLabel = SeqData
type SequenceSize = Offset
type SequenceData = SeqData
type TwoBitData = (SequenceLabel, SequenceSize, SequenceData)
encode2Bit :: [Sequence a] -> B.ByteString
encode2Bit ss = let
buildHeader :: [Sequence a] -> Header
buildHeader ss = Header {swap = True,
version = default_version,
count = sequenceListLength ss,
reserved = 0}
buildEntries :: [TwoBitData] -> Offset -> [Entry]
buildEntries [] _ = []
buildEntries ((label, length, _):xs) currentOffset = Entry {name=label,
offset=fromIntegral $ currentOffset} : buildEntries xs (currentOffset+length)
buildSR :: TwoBitData -> SR
buildSR (_, size, dnaData) = SR {dnaSize = (fromIntegral size),
nBlockCount = 0,
nBlockStarts = [],
nBlockSizes = [],
maskBlockCount = 0,
maskBlockStarts = [],
maskBlockSizes = [],
packedDna = encodeDNA $ splitWord8 $ explode dnaData,
reserved2 = 0}
sequenceListLength :: [Sequence a] -> Word32
sequenceListLength [] = 0
sequenceListLength (s:ss) = 1 + sequenceListLength ss
sequenceListExtract :: [Sequence a] -> [TwoBitData]
sequenceListExtract ss = map (\seq -> (seqlabel seq, seqlength seq, seqdata seq)) ss
enc1 :: (Num t) => Char -> t
enc1 c = case c of
'T' -> 0;
'C' -> 1;
'A' -> 2;
'G' -> 3;
explode :: SeqData -> [Word8]
explode seq = map enc1 $ B.unpack seq
splitWord8 :: [Word8] -> [[Word8]]
splitWord8 [] = []
splitWord8 cs = let (this,rest) = splitAt 4 cs
in this : splitWord8 rest
encodeDNA :: [[Word8]] -> [Word8]
encodeDNA [] = []
encodeDNA (w:ws) = (pack2Bit $ getQuad) : encodeDNA ws
where
len = length w
pack2Bit :: (Word8, Word8, Word8, Word8) -> Word8
pack2Bit (d1, d2, d3, d4) = shiftL (d1) 6 .|. shiftL (d2) 4 .|. shiftL (d3) 2 .|. d4
getQuad :: (Word8, Word8, Word8, Word8)
getQuad = case len of
1 -> (w !! 0, 0, 0, 0);
2 -> (w !! 0, w !! 1, 0, 0);
3 -> (w !! 0, w !! 1, w !! 2, 0);
4 -> (w !! 0, w !! 1, w !! 2, w !! 3);
in
B.append (encode (buildHeader ss))
(B.append
(B.concat (map encode (buildEntries (sequenceListExtract ss) 56)))
(B.concat (map (encode . SRBE) (map buildSR (sequenceListExtract ss)))))
unSRBE :: SRBE -> SR
unSRBE (SRBE x) = x
unSRLE :: SRLE -> SR
unSRLE (SRLE x) = x
read2Bit :: FilePath -> IO [Sequence Unknown]
read2Bit f = B.readFile f >>= return . decode2Bit
hRead2Bit :: Handle -> IO [Sequence Unknown]
hRead2Bit h = B.hGetContents h >>= return . decode2Bit
write2Bit :: FilePath -> [Sequence a] -> IO ()
write2Bit f seq = do
let byteString = encode2Bit seq
B.writeFile f byteString
return ()
hWrite2Bit :: Handle -> [Sequence a] -> IO ()
hWrite2Bit h seq = do
let byteString = encode2Bit seq
B.hPut h byteString
return ()