module Bio.Bam.Index (
BamIndex(..),
withIndexedBam,
readBamIndex,
readBaiIndex,
readTabix,
IndexFormatError(..),
Region(..),
Subsequence(..),
streamBamRefseq,
streamBamRegions,
streamBamSubseq,
streamBamUnaligned
) where
import Bio.Bam.Header
import Bio.Bam.Reader
import Bio.Bam.Rec
import Bio.Bam.Regions ( Region(..), Subsequence(..) )
import Bio.Prelude
import Bio.Streaming
import Bio.Streaming.Bgzf ( bgunzip )
import System.Directory ( doesFileExist )
import qualified Bio.Bam.Regions as R
import qualified Bio.Streaming.Bytes as S
import qualified Bio.Streaming.Parse as P
import qualified Data.IntMap.Strict as M
import qualified Data.ByteString as B
import qualified Data.Vector as V
import qualified Data.Vector.Mutable as W
import qualified Data.Vector.Unboxed as U
import qualified Data.Vector.Unboxed.Mutable as N
import qualified Data.Vector.Algorithms.Intro as N
import qualified Streaming.Prelude as Q
data BamIndex a = BamIndex {
minshift :: {-# UNPACK #-} !Int,
depth :: {-# UNPACK #-} !Int,
unaln_off :: {-# UNPACK #-} !Int64,
extensions :: a,
refseq_bins :: {-# UNPACK #-} !(V.Vector Bins),
refseq_ckpoints :: {-# UNPACK #-} !(V.Vector Ckpoints) }
deriving Show
type Bins = IntMap Segments
type Segments = U.Vector (Int64,Int64)
type Ckpoints = IntMap Int64
data Segment = Segment {-# UNPACK #-} !Int64 {-# UNPACK #-} !Int64 {-# UNPACK #-} !Int deriving Show
segmentLists :: BamIndex a -> Refseq -> R.Subsequence -> [[Segment]]
segmentLists bi@BamIndex{..} (Refseq ref) (R.Subsequence imap)
| Just bins <- refseq_bins V.!? fromIntegral ref,
Just cpts <- refseq_ckpoints V.!? fromIntegral ref
= [ rgnToSegments bi beg end bins cpts | (beg,end) <- M.toList imap ]
segmentLists _ _ _ = []
rgnToSegments :: BamIndex a -> Int -> Int -> Bins -> Ckpoints -> [Segment]
rgnToSegments bi@BamIndex{..} beg end bins cpts =
[ Segment boff' eoff end
| bin <- binList bi beg end
, (boff,eoff) <- maybe [] U.toList $ M.lookup bin bins
, let boff' = max boff cpt
, boff' < eoff ]
where
!cpt = maybe 0 snd $ M.lookupLE beg cpts
binList :: BamIndex a -> Int -> Int -> [Int]
binList BamIndex{..} beg end = binlist' 0 (minshift + 3*depth) 0
where
binlist' l s t = if l > depth then [] else [b..e] ++ go
where
b = t + beg `shiftR` s
e = t + (end-1) `shiftR` s
go = binlist' (l+1) (s-3) (t + 1 `shiftL` (3*l))
infix 4 ~~
(~~) :: [Segment] -> [Segment] -> [Segment]
Segment a b e : xs ~~ Segment u v f : ys
| b < u = Segment a b e : (xs ~~ Segment u v f : ys)
| a < u && b < v = Segment a v (max e f) : (xs ~~ ys)
| b < v = Segment u v (max e f) : (xs ~~ ys)
| v < a = Segment u v f : (xs ~~ Segment a b e : ys)
| u < a = Segment u b (max e f) : (xs ~~ ys)
| otherwise = Segment a b (max e f) : (xs ~~ ys)
[] ~~ ys = ys
xs ~~ [] = xs
data IndexFormatError = IndexFormatError Bytes deriving (Typeable, Show)
instance Exception IndexFormatError where
displayException (IndexFormatError m) = "index signature " ++ show m ++ " not recognized"
readBamIndex :: FilePath -> IO (BamIndex ())
readBamIndex fp1 | any (`isSuffixOf` fp1) exts = streamFile fp1 readBaiIndex
| otherwise = tryAll exts
where
exts = words ".bai .bai.gz .csi .csi.gz"
fp2 = reverse $ case dropWhile (/='.') f of [] -> f++d ; _:b -> b++d
(f,d) = break (=='/') $ reverse fp1
tryAll [ ] = streamFile fp1 readBaiIndex
tryAll (e:es) = do x1 <- liftIO $ doesFileExist (fp1 ++ e)
x2 <- liftIO $ doesFileExist (fp2 ++ e)
case () of
_ | x1 -> streamFile (fp1 ++ e) readBaiIndex
| x2 -> streamFile (fp2 ++ e) readBaiIndex
_ -> tryAll es
readBaiIndex :: MonadIO m => ByteStream m r -> m (BamIndex ())
readBaiIndex = either (const . liftIO $ throwM P.EofException) (return . fst) <=<
P.parseIO (const $ P.getString 4 >>= switch) . S.gunzip
where
switch "BAI\1" = do nref <- fromIntegral `liftM` P.getWord32
getIndexArrays nref 14 5 (const return) getIntervals
switch "CSI\1" = do minshift <- fromIntegral `liftM` P.getWord32
depth <- fromIntegral `liftM` P.getWord32
P.getWord32 >>= P.drop . fromIntegral
nref <- fromIntegral `liftM` P.getWord32
getIndexArrays nref minshift depth (addOneCheckpoint minshift depth) return
switch magic = throwM $ IndexFormatError magic
addOneCheckpoint minshift depth bin cp = do
loffset <- fromIntegral `liftM` P.getWord64
let key = llim (fromIntegral bin) (3*depth) minshift
return $! M.insertWith min key loffset cp
llim bin dp sf | dp == 0 = 0
| bin >= ix = (bin - ix) `shiftL` sf
| otherwise = llim bin (dp-3) (sf+3)
where ix = (1 `shiftL` dp - 1) `div` 7
withIndexedBam :: (MonadIO m, MonadLog m, MonadMask m) => FilePath -> (BamMeta -> BamIndex () -> Handle -> m r) -> m r
withIndexedBam f k = do
idx <- liftIO $ readBamIndex f
bracket (liftIO $ openBinaryFile f ReadMode) (liftIO . hClose) $ \hdl -> do
(hdr,_) <- decodeBam $ streamHandle hdl
k hdr idx hdl
type TabIndex = BamIndex TabMeta
data TabMeta = TabMeta { format :: TabFormat
, col_seq :: Int
, col_beg :: Int
, col_end :: Int
, comment_char :: Char
, skip_lines :: Int
, names :: V.Vector Bytes }
deriving Show
data TabFormat = Generic | SamFormat | VcfFormat | ZeroBased deriving Show
readTabix :: MonadIO m => ByteStream m r -> m TabIndex
readTabix = either (const . liftIO $ throwM P.EofException) (return . fst) <=<
P.parseIO (const $ P.getString 4 >>= switch) . S.gunzip
where
switch "TBI\1" = do nref <- fromIntegral `liftM` P.getWord32
format <- liftM toFormat P.getWord32
col_seq <- liftM fromIntegral P.getWord32
col_beg <- liftM fromIntegral P.getWord32
col_end <- liftM fromIntegral P.getWord32
comment_char <- liftM (chr . fromIntegral) P.getWord32
skip_lines <- liftM fromIntegral P.getWord32
names <- liftM (V.fromList . B.split 0) . P.getString . fromIntegral =<< P.getWord32
ix <- getIndexArrays nref 14 5 (const return) getIntervals
fin <- P.isFinished
if fin then return $! ix { extensions = TabMeta{..} }
else do unaln <- fromIntegral `liftM` P.getWord64
return $! ix { unaln_off = unaln, extensions = TabMeta{..} }
switch magic = throwM $ IndexFormatError magic
toFormat 1 = SamFormat
toFormat 2 = VcfFormat
toFormat x = if testBit x 16 then ZeroBased else Generic
getIntervals :: Monad m => (IntMap Int64, Int64) -> P.Parser r m (IntMap Int64, Int64)
getIntervals (cp,mx0) = do
nintv <- fromIntegral `liftM` P.getWord32
reduceM 0 nintv (cp,mx0) $ \(!im,!mx) int -> do
oo <- fromIntegral `liftM` P.getWord64
return (if oo == 0 then im else M.insert (int * 0x4000) oo im, max mx oo)
getIndexArrays :: MonadIO m => Int -> Int -> Int
-> (Word32 -> Ckpoints -> P.Parser r m Ckpoints)
-> ((Ckpoints, Int64) -> P.Parser r m (Ckpoints, Int64))
-> P.Parser r m (BamIndex ())
getIndexArrays nref minshift depth addOneCheckpoint addManyCheckpoints
| nref < 1 = return $ BamIndex minshift depth 0 () V.empty V.empty
| otherwise = do
rbins <- liftIO $ W.new nref
rckpts <- liftIO $ W.new nref
mxR <- reduceM 0 nref 0 $ \mx0 r -> do
nbins <- P.getWord32
(!bins,!cpts,!mx1) <- reduceM 0 nbins (M.empty,M.empty,mx0) $ \(!im,!cp,!mx) _ -> do
bin <- P.getWord32
cp' <- addOneCheckpoint bin cp
segsarr <- getSegmentArray
let !mx' = if U.null segsarr then mx else max mx (snd (U.last segsarr))
return (M.insert (fromIntegral bin) segsarr im, cp', mx')
(!cpts',!mx2) <- addManyCheckpoints (cpts,mx1)
liftIO $ W.write rbins r bins >> W.write rckpts r cpts'
return mx2
liftM2 (BamIndex minshift depth mxR ()) (liftIO $ V.unsafeFreeze rbins) (liftIO $ V.unsafeFreeze rckpts)
getSegmentArray :: MonadIO m => P.Parser r m Segments
getSegmentArray = do
nsegs <- fromIntegral `liftM` P.getWord32
segsarr <- liftIO $ N.new nsegs
loopM 0 nsegs $ \i -> do beg <- fromIntegral `liftM` P.getWord64
end <- fromIntegral `liftM` P.getWord64
liftIO $ N.write segsarr i (beg,end)
liftIO $ N.sort segsarr >> U.unsafeFreeze segsarr
{-# INLINE reduceM #-}
reduceM :: (Monad m, Enum ix, Eq ix) => ix -> ix -> a -> (a -> ix -> m a) -> m a
reduceM beg end acc cons = if beg /= end then cons acc beg >>= \n -> reduceM (succ beg) end n cons else return acc
{-# INLINE loopM #-}
loopM :: (Monad m, Enum ix, Eq ix) => ix -> ix -> (ix -> m ()) -> m ()
loopM beg end k = if beg /= end then k beg >> loopM (succ beg) end k else return ()
streamBgzf :: MonadIO m => Handle -> Int64 -> Maybe Int64 -> ByteStream m ()
streamBgzf hdl off eoff =
S.drop (off .&. 0xffff) . bgunzip $ do
when (off /= 0) (liftIO $ hSeek hdl AbsoluteSeek $ fromIntegral $ shiftR off 16)
maybe id S.trim eoff $ streamHandle hdl
{-# INLINE streamBgzf #-}
streamBamRefseq :: (MonadIO m, MonadLog m) => BamIndex b -> Handle -> Refseq -> Stream (Of BamRaw) m ()
streamBamRefseq BamIndex{..} hdl (Refseq r)
| Just ckpts <- refseq_ckpoints V.!? fromIntegral r
, Just (voff, _) <- M.minView ckpts
, voff /= 0 = void $
Q.takeWhile ((Refseq r ==) . b_rname . unpackBam) $
Q.unfoldr (P.parseLog Error getBamRaw) $
streamBgzf hdl voff Nothing
| otherwise = pure ()
streamBamUnaligned :: MonadIO m => BamIndex b -> Handle -> Stream (Of BamRaw) m ()
streamBamUnaligned BamIndex{..} hdl =
Q.filter (not . isValidRefseq . b_rname . unpackBam) $
Q.unfoldr (P.parseIO getBamRaw) $
streamBgzf hdl unaln_off Nothing
streamBamSegment :: MonadIO m
=> Handle -> Segment -> Stream (Of BamRaw) m ()
-> Stream (Of BamRaw) m (Stream (Of BamRaw) m ())
streamBamSegment hdl (Segment beg end mpos) =
lift . Q.uncons >=> \case
Just (br,brs) | near (virt_offset br)
-> Q.span in_seg $ Q.cons br brs
_ -> Q.span in_seg $ Q.unfoldr (P.parseIO getBamRaw) $ streamBgzf hdl beg (Just end)
where
near o = beg <= fromIntegral o && beg + 0x800000000 > fromIntegral o
in_seg br = virt_offset br <= end && b_pos (unpackBam br) <= mpos
streamBamSubseq :: MonadIO m
=> BamIndex b -> Handle -> Refseq -> R.Subsequence -> Stream (Of BamRaw) m ()
-> Stream (Of BamRaw) m (Stream (Of BamRaw) m ())
streamBamSubseq bi hdl ref subs str = Q.filter olap $ foldM (flip $ streamBamSegment hdl) str segs
where
segs = foldr (~~) [] $ segmentLists bi ref subs
olap br = case unpackBam br of
BamRec{..} -> b_rname == ref && R.overlaps b_pos (b_pos + alignedLength b_cigar) subs
streamBamRegions :: MonadIO m => BamIndex b -> Handle -> [R.Region] -> Stream (Of BamRaw) m ()
streamBamRegions bi hdl = void . foldM (\s (r,is) -> streamBamSubseq bi hdl r is s) (pure ()) . R.toList . R.fromList