module Bio.Bam.Reader (
Block(..),
decompressBgzfBlocks,
decompressBgzf,
compressBgzf,
decodeBam,
getBamRaw,
decodeAnyBam,
decodeAnyBamFile,
BamrawEnumeratee,
BamEnumeratee,
isBamOrSam,
isBam,
isPlainBam,
isGzipBam,
isBgzfBam,
decodeSam,
decodeSam',
decodeAnyBamOrSam,
decodeAnyBamOrSamFile,
concatInputs,
concatDefaultInputs,
mergeInputs,
mergeDefaultInputs,
combineCoordinates,
combineNames,
) where
import Bio.Base
import Bio.Bam.Header
import Bio.Bam.Rec
import Bio.Iteratee
import Bio.Iteratee.Bgzf
import Bio.Iteratee.ZLib hiding ( CompressionLevel )
import Control.Applicative
import Control.Arrow ( (&&&) )
import Control.Monad
import Data.Attoparsec.ByteString ( anyWord8 )
import Data.Char ( digitToInt )
import Data.Sequence ( (|>) )
import Data.String ( fromString )
import System.Environment ( getArgs )
import qualified Data.Attoparsec.ByteString.Char8 as P
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as S
import qualified Data.Foldable as F
import qualified Data.HashMap.Strict as M
import qualified Data.Sequence as Z
import qualified Data.Vector.Generic as V
import qualified Data.Vector.Storable as VS
import qualified Data.Vector.Unboxed as U
type ByteString = B.ByteString
type BamrawEnumeratee m b = Enumeratee' BamMeta S.ByteString [BamRaw] m b
type BamEnumeratee m b = Enumeratee' BamMeta ByteString [BamRec] m b
isBamOrSam :: MonadIO m => Iteratee ByteString m (BamEnumeratee m a)
isBamOrSam = maybe decodeSam wrap `liftM` isBam
where
wrap enee it' = enee (\hdr -> mapStream unpackBam (it' hdr)) >>= lift . run
decodeAnyBamOrSam :: MonadIO m => BamEnumeratee m a
decodeAnyBamOrSam it = isBamOrSam >>= \k -> k it
decodeAnyBamOrSamFile :: (MonadIO m, MonadMask m)
=> FilePath -> (BamMeta -> Iteratee [BamRec] m a) -> m (Iteratee [BamRec] m a)
decodeAnyBamOrSamFile fn k = enumFileRandom defaultBufSize fn (decodeAnyBamOrSam k) >>= run
decodeSam :: Monad m => (BamMeta -> Iteratee [BamRec] m a) -> Iteratee ByteString m (Iteratee [BamRec] m a)
decodeSam inner = joinI $ enumLinesBS $ do
let pHeaderLine acc str = case P.parseOnly parseBamMetaLine str of Right f -> return $ f : acc
Left e -> fail $ e ++ ", " ++ show str
meta <- liftM (foldr ($) mempty . reverse) (joinI $ breakE (not . S.isPrefixOf "@") $ foldStreamM pHeaderLine [])
decodeSamLoop (meta_refs meta) (inner meta)
decodeSamLoop :: Monad m => Refs -> Enumeratee [ByteString] [BamRec] m a
decodeSamLoop refs inner = convStream (liftI parse_record) inner
where !refs' = M.fromList $ zip [ nm | BamSQ { sq_name = nm } <- F.toList refs ] [toEnum 0..]
ref x = M.lookupDefault invalidRefseq x refs'
parse_record (EOF x) = icont parse_record x
parse_record (Chunk []) = liftI parse_record
parse_record (Chunk (l:ls)) | "@" `S.isPrefixOf` l = parse_record (Chunk ls)
parse_record (Chunk (l:ls)) = case P.parseOnly (parseSamRec ref) l of
Right r -> idone [r] (Chunk ls)
Left err -> icont parse_record (Just $ iterStrExc $ err ++ ", " ++ show l)
decodeSam' :: Monad m => Refs -> Enumeratee ByteString [BamRec] m a
decodeSam' refs inner = joinI $ enumLinesBS $ decodeSamLoop refs inner
parseSamRec :: (ByteString -> Refseq) -> P.Parser BamRec
parseSamRec ref = mkBamRec
<$> word <*> num <*> (ref <$> word) <*> (subtract 1 <$> num)
<*> (Q <$> num') <*> (VS.fromList <$> cigar) <*> rnext <*> (subtract 1 <$> num)
<*> snum <*> sequ <*> quals <*> exts <*> pure 0
where
sep = P.endOfInput <|> () <$ P.char '\t'
word = P.takeTill ((==) '\t') <* sep
num = P.decimal <* sep
num' = P.decimal <* sep
snum = P.signed P.decimal <* sep
rnext = id <$ P.char '=' <* sep <|> const . ref <$> word
sequ =
(V.empty <$ P.char '*' <|>
V.fromList . map toNucleotides . S.unpack <$> P.takeWhile is_nuc) <* sep
quals = defaultQs <$ P.char '*' <* sep <|> bsToVec <$> word
where
defaultQs sq = VS.replicate (V.length sq) (Q 0xff)
bsToVec qs _ = VS.fromList . map (Q . subtract 33) $ B.unpack qs
cigar = [] <$ P.char '*' <* sep <|>
P.manyTill (flip (:*) <$> P.decimal <*> cigop) sep
cigop = P.choice $ zipWith (\c r -> r <$ P.char c) "MIDNSHP" [Mat,Ins,Del,Nop,SMa,HMa,Pad]
exts = ext `P.sepBy` sep
ext = (\a b v -> (fromString [a,b],v)) <$> P.anyChar <*> P.anyChar <*> (P.char ':' *> value)
value = P.char 'A' *> P.char ':' *> (Char <$> anyWord8) <|>
P.char 'i' *> P.char ':' *> (Int <$> P.signed P.decimal) <|>
P.char 'Z' *> P.char ':' *> (Text <$> P.takeTill ((==) '\t')) <|>
P.char 'H' *> P.char ':' *> (Bin <$> hexarray) <|>
P.char 'f' *> P.char ':' *> (Float . realToFrac <$> P.double) <|>
P.char 'B' *> P.char ':' *> (
P.satisfy (P.inClass "cCsSiI") *> (intArr <$> many (P.char ',' *> P.signed P.decimal)) <|>
P.char 'f' *> (floatArr <$> many (P.char ',' *> P.double)))
intArr is = IntArr $ U.fromList is
floatArr fs = FloatArr $ U.fromList $ map realToFrac fs
hexarray = B.pack . repack . S.unpack <$> P.takeWhile (P.inClass "0-9A-Fa-f")
repack (a:b:cs) = fromIntegral (digitToInt a * 16 + digitToInt b) : repack cs ; repack _ = []
is_nuc = P.inClass "acgtswkmrybdhvnACGTSWKMRYBDHVN"
mkBamRec nm fl rn po mq cg rn' mp is sq qs' =
BamRec nm fl rn po mq cg (rn' rn) mp is sq (qs' sq)
isBam, isEmptyBam, isPlainBam, isBgzfBam, isGzipBam :: MonadIO m
=> Iteratee S.ByteString m (Maybe (BamrawEnumeratee m a))
isBam = firstOf [ isEmptyBam, isPlainBam, isBgzfBam, isGzipBam ]
where
firstOf [] = return Nothing
firstOf (k:ks) = iLookAhead k >>= maybe (firstOf ks) (return . Just)
isEmptyBam = (\e -> if e then Just (\k -> return $ k mempty) else Nothing) `liftM` isFinished
isPlainBam = (\n -> if n == 4 then Just (joinI . decompressPlain . decodeBam) else Nothing) `liftM` heads "BAM\SOH"
isBgzfBam = do b <- isBgzf
k <- if b then joinI $ enumInflate GZip defaultDecompressParams isPlainBam else return Nothing
return $ (\_ -> (joinI . decompressBgzfBlocks . decodeBam)) `fmap` k
isGzipBam = do b <- isGzip
k <- if b then joinI $ enumInflate GZip defaultDecompressParams isPlainBam else return Nothing
return $ ((joinI . enumInflate GZip defaultDecompressParams) .) `fmap` k
decodeAnyBam :: MonadIO m => BamrawEnumeratee m a
decodeAnyBam it = do mk <- isBam ; case mk of Just k -> k it
Nothing -> fail "this isn't BAM."
decodeAnyBamFile :: (MonadIO m, MonadMask m) => FilePath -> (BamMeta -> Iteratee [BamRaw] m a) -> m (Iteratee [BamRaw] m a)
decodeAnyBamFile fn k = enumFileRandom defaultBufSize fn (decodeAnyBam k) >>= run
concatDefaultInputs :: (MonadIO m, MonadMask m) => Enumerator' BamMeta [BamRaw] m a
concatDefaultInputs it0 = liftIO getArgs >>= \fs -> concatInputs fs it0
concatInputs :: (MonadIO m, MonadMask m) => [FilePath] -> Enumerator' BamMeta [BamRaw] m a
concatInputs [ ] = \k -> enumHandle defaultBufSize stdin (decodeAnyBam k) >>= run
concatInputs (fp0:fps0) = \k -> enum1 fp0 k >>= go fps0
where
enum1 "-" k1 = enumHandle defaultBufSize stdin (decodeAnyBam k1) >>= run
enum1 fp k1 = enumFile defaultBufSize fp (decodeAnyBam k1) >>= run
go [ ] = return
go (fp1:fps) = enum1 fp1 . const >=> go fps
mergeDefaultInputs :: (MonadIO m, MonadMask m)
=> (BamMeta -> Enumeratee [BamRaw] [BamRaw] (Iteratee [BamRaw] m) a)
-> Enumerator' BamMeta [BamRaw] m a
mergeDefaultInputs (?) it0 = liftIO getArgs >>= \fs -> mergeInputs (?) fs it0
mergeInputs :: (MonadIO m, MonadMask m)
=> (BamMeta -> Enumeratee [BamRaw] [BamRaw] (Iteratee [BamRaw] m) a)
-> [FilePath] -> Enumerator' BamMeta [BamRaw] m a
mergeInputs _ [ ] = \k -> enumHandle defaultBufSize stdin (decodeAnyBam k) >>= run
mergeInputs (?) (fp0:fps0) = go fp0 fps0
where
enum1 "-" k1 = enumHandle defaultBufSize stdin (decodeAnyBam k1) >>= run
enum1 fp k1 = enumFile defaultBufSize fp (decodeAnyBam k1) >>= run
go fp [ ] = enum1 fp
go fp (fp1:fps) = mergeEnums' (go fp1 fps) (enum1 fp) (?)
combineCoordinates :: Monad m => BamMeta -> Enumeratee [BamRaw] [BamRaw] (Iteratee [BamRaw] m) a
combineCoordinates _ = mergeSortStreams (?)
where u ? v = if (b_rname &&& b_pos) (unpackBam u) < (b_rname &&& b_pos) (unpackBam v) then Less else NotLess
combineNames :: Monad m => BamMeta -> Enumeratee [BamRaw] [BamRaw] (Iteratee [BamRaw] m) a
combineNames _ = mergeSortStreams (?)
where u ? v = case b_qname (unpackBam u) `compareNames` b_qname (unpackBam v) of LT -> Less ; _ -> NotLess
decodeBam :: Monad m => (BamMeta -> Iteratee [BamRaw] m a) -> Iteratee Block m (Iteratee [BamRaw] m a)
decodeBam inner = do meta <- liftBlock get_bam_header
refs <- liftBlock get_ref_array
convStream getBamRaw $ inner $! merge meta refs
where
get_bam_header = do magic <- heads "BAM\SOH"
when (magic /= 4) $ do s <- iGetString 10
fail $ "BAM signature not found: " ++ show magic ++ " " ++ show s
hdr_len <- endianRead4 LSB
joinI $ takeStream (fromIntegral hdr_len) $ parserToIteratee parseBamMeta
get_ref_array = do nref <- endianRead4 LSB
foldM (\acc _ -> do
nm <- endianRead4 LSB >>= iGetString . fromIntegral
ln <- endianRead4 LSB
return $! acc |> BamSQ (S.init nm) (fromIntegral ln) []
) Z.empty $ [1..nref]
merge meta refs =
let tbl = M.fromList [ (sq_name sq, sq) | sq <- F.toList (meta_refs meta) ]
in meta { meta_refs = fmap (\s -> maybe s (merge' s) (M.lookup (sq_name s) tbl)) refs }
merge' l r | sq_length l == sq_length r = l { sq_other_shit = sq_other_shit l ++ sq_other_shit r }
| otherwise = l
getBamRaw :: Monad m => Iteratee Block m [BamRaw]
getBamRaw = do off <- getOffset
raw <- liftBlock $ do
bsize <- endianRead4 LSB
when (bsize < 32) $ fail "short BAM record"
iGetString (fromIntegral bsize)
return [bamRaw off raw]