{-# LANGUAGE UndecidableInstances #-}
module Bio.Bam.Header (
BamMeta(..),
parseBamMeta,
showBamMeta,
addPG,
BamKey(..),
BamHeader(..),
BamSQ(..),
BamSorting(..),
BamOtherShit,
Refseq(..),
invalidRefseq,
isValidRefseq,
invalidPos,
isValidPos,
unknownMapq,
isKnownMapq,
Refs(..),
getRef,
compareNames,
flagPaired,
flagProperlyPaired,
flagUnmapped,
flagMateUnmapped,
flagReversed,
flagMateReversed,
flagFirstMate,
flagSecondMate,
flagAuxillary,
flagSecondary,
flagFailsQC,
flagDuplicate,
flagSupplementary,
eflagTrimmed,
eflagMerged,
eflagAlternative,
eflagExactIndex,
distinctBin,
MdOp(..),
readMd,
showMd
) where
import Bio.Prelude hiding ( uncons )
import Bio.Util.Nub
import Control.Monad.Trans.RWS
import Data.ByteString ( uncons )
import Data.ByteString.Builder ( Builder, byteString, char7, intDec, word16LE )
import qualified Data.Attoparsec.ByteString.Char8 as P
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as S
import qualified Data.HashMap.Strict as H
import qualified Data.Vector as V
data BamMeta = BamMeta {
meta_hdr :: !BamHeader,
meta_refs :: !Refs,
meta_pgs :: [Fix BamPG],
meta_other_shit :: [(BamKey, BamOtherShit)],
meta_comment :: [Bytes]
} deriving ( Show, Generic )
newtype BamKey = BamKey Word16
deriving ( Eq, Ord, Hashable, Generic )
instance IsString BamKey where
{-# INLINE fromString #-}
fromString [a,b]
| ord a < 256 && ord b < 256
= BamKey . fromIntegral $ ord a .|. shiftL (ord b) 8
fromString s
= error $ "Not a legal BAM key: " ++ show s
instance Show BamKey where
show (BamKey a) = [ chr (fromIntegral a .&. 0xff), chr (shiftR (fromIntegral a) 8 .&. 0xff) ]
addPG :: MonadIO m => Maybe Version -> m (BamMeta -> BamMeta)
addPG vn = liftIO $ do
args <- getArgs
pn <- getProgName
let more = ("PN", S.pack pn) :
("CL", S.pack $ unwords args) :
maybe [] (\v -> [("VN",S.pack (showVersion v))]) vn
return $ \bm -> case meta_pgs bm of
[ ] -> bm { meta_pgs = Fix (BamPG (S.pack pn) Nothing more) : [ ] }
pg:pgs -> bm { meta_pgs = Fix (BamPG (S.pack pn) (Just pg) more) : pgs }
instance Semigroup BamMeta where (<>) = combineBamMeta
instance Monoid BamMeta where mempty = BamMeta mempty mempty mempty [] []
mappend = (<>)
combineBamMeta :: BamMeta -> BamMeta -> BamMeta
combineBamMeta a b = BamMeta
{ meta_hdr = meta_hdr a <> meta_hdr b
, meta_refs = meta_refs a `mappend` meta_refs b
, meta_pgs = meta_pgs a <> meta_pgs b
, meta_other_shit = nubHash $ meta_other_shit a ++ meta_other_shit b
, meta_comment = nubHash $ meta_comment a ++ meta_comment b }
data BamHeader = BamHeader {
hdr_version :: (Int, Int),
hdr_sorting :: BamSorting,
hdr_other_shit :: BamOtherShit
} deriving (Show, Eq)
instance Monoid BamHeader where
mempty = BamHeader (1,0) Unknown []
mappend = (<>)
instance Semigroup BamHeader where
a <> b = BamHeader { hdr_version = max (hdr_version a) (hdr_version b)
, hdr_sorting = hdr_sorting a <> hdr_sorting b
, hdr_other_shit = nubHashBy fst $ hdr_other_shit a ++ hdr_other_shit b }
data BamSQ = BamSQ {
sq_name :: Bytes,
sq_length :: Int,
sq_other_shit :: BamOtherShit
} deriving (Show, Eq, Generic)
instance Hashable BamSQ
data BamPG pp = BamPG {
pg_pref_name :: Bytes,
pg_prev_pg :: Maybe pp,
pg_other_shit :: BamOtherShit
} deriving (Show, Eq, Generic1)
newtype Fix f = Fix (f (Fix f))
instance Eq (f (Fix f)) => Eq (Fix f) where
Fix f == Fix g = f == g
instance Show (f (Fix f)) => Show (Fix f) where
showsPrec p (Fix f) = showsPrec p f
instance Hashable (Fix BamPG) where
hashWithSalt s (Fix (BamPG n Nothing o)) = hashWithSalt (hashWithSalt s n) o
hashWithSalt s (Fix (BamPG n (Just p) o)) = hashWithSalt (hashWithSalt (hashWithSalt s n) p) o
data BamSorting = Unknown
| Unsorted
| Grouped
| Queryname
| Coordinate
deriving (Show, Eq)
instance Semigroup BamSorting where
Unknown <> b = b
a <> Unknown = a
Grouped <> Grouped = Grouped
Grouped <> Queryname = Grouped
Queryname <> Grouped = Grouped
Queryname <> Queryname = Queryname
Coordinate <> Coordinate = Coordinate
_ <> _ = Unsorted
type BamOtherShit = [(BamKey, Bytes)]
parseBamMeta :: P.Parser BamMeta
parseBamMeta = fixupMeta . foldl' (flip ($)) emptyHeader
<$> many (parseBamMetaLine <* P.skipWhile (=='\t') <* P.char '\n') <* P.endOfInput
data PreBamMeta = PreBamMeta {
pmeta_hdr :: BamHeader,
pmeta_refs :: [BamSQ],
pmeta_pgs :: HashMap Bytes (BamPG Bytes),
pmeta_other_shit :: [(BamKey, BamOtherShit)],
pmeta_comment :: [Bytes] }
emptyHeader :: PreBamMeta
emptyHeader = PreBamMeta mempty [] H.empty [] []
fixupMeta :: PreBamMeta -> BamMeta
fixupMeta PreBamMeta{..} = BamMeta
{ meta_hdr = pmeta_hdr
, meta_refs = Refs . V.fromList . reverse $ pmeta_refs
, meta_pgs = snd $ evalRWS trace_pgs () pmeta_pgs
, meta_other_shit = reverse pmeta_other_shit
, meta_comment = reverse pmeta_comment }
where
trace_pgs :: RWS () [Fix BamPG] (HashMap Bytes (BamPG Bytes)) ()
trace_pgs = do
gg <- get
case foldl' (flip H.delete) gg
[ pp | p <- H.elems gg
, pp <- maybe [] pure (pg_prev_pg p) ] of
orphans
| H.null gg -> return ()
| H.null orphans -> trace_pg H.empty (head $ H.keys gg) >> trace_pgs
| otherwise -> mapM_ (trace_pg H.empty) (H.keys orphans) >> trace_pgs
trace_pg :: HashMap Bytes () -> Bytes -> RWS () [Fix BamPG] (HashMap Bytes x) (Maybe (Fix BamPG))
trace_pg closed name =
case H.lookup name pmeta_pgs of
_ | H.member name closed -> return Nothing
Nothing -> return Nothing
Just pg -> do
modify $ H.delete name
pp <- mapM (trace_pg (H.insert name () closed)) (pg_prev_pg pg)
let self = Fix $ pg { pg_prev_pg = join pp }
tell [ self ]
return $ Just self
parseBamMetaLine :: P.Parser (PreBamMeta -> PreBamMeta)
parseBamMetaLine = P.char '@' >> P.choice [hdLine, sqLine, pgLine, coLine, otherLine]
where
hdLine = P.string "HD\t" >>
(\fns meta -> meta { pmeta_hdr = foldr ($) (pmeta_hdr meta) fns })
<$> P.sepBy1 (P.choice [hdvn, hdso, hdother]) tabs
sqLine = do _ <- P.string "SQ\t"
fns <- P.sepBy1 (P.choice [sqnm, sqln, sqother]) tabs
let sq = foldr ($) (BamSQ "" (-1) []) fns
guard (not . B.null $ sq_name sq) P.<?> "SQ:NM field"
guard (sq_length sq >= 0) P.<?> "SQ:LN field"
pure $ \meta -> meta { pmeta_refs = sq : pmeta_refs meta }
pgLine = do _ <- P.string "PG\t"
fns <- P.sepBy1 (P.choice [pgid, pgpp, pgother]) tabs
let pg = foldr ($) (BamPG "" Nothing []) fns
guard (not . B.null $ pg_pref_name pg) P.<?> "PG:ID field"
pure $ \meta -> meta { pmeta_pgs = H.insert (pg_pref_name pg) pg (pmeta_pgs meta) }
hdvn = P.string "VN:" >>
(\a b hdr -> hdr { hdr_version = (a,b) })
<$> P.decimal <*> ((P.char '.' <|> P.char ':') >> P.decimal)
hdso = P.string "SO:" >>
(\s hdr -> hdr { hdr_sorting = s })
<$> P.choice [ Grouped <$ P.string "grouped"
, Queryname <$ P.string "queryname"
, Coordinate <$ P.string "coordinate"
, Unsorted <$ P.string "unsorted"
, Unknown <$ P.skipWhile (\c -> c/='\t' && c/='\n') ]
sqnm = P.string "SN:" >> (\s sq -> sq { sq_name = s }) <$> pall
sqln = P.string "LN:" >> (\i sq -> sq { sq_length = i }) <$> P.decimal
pgid = P.string "ID:" >> (\s pg -> pg { pg_pref_name = s }) <$> pall
pgpp = P.string "PP:" >> (\s pg -> pg { pg_prev_pg = Just s }) <$> pall
hdother = (\t hdr -> hdr { hdr_other_shit = t : hdr_other_shit hdr }) <$> tagother
sqother = (\t sq -> sq { sq_other_shit = t : sq_other_shit sq }) <$> tagother
pgother = (\t p -> p { pg_other_shit = t : pg_other_shit p }) <$> tagother
coLine = P.string "CO\t" >>
(\s meta -> s `seq` meta { pmeta_comment = s : pmeta_comment meta })
<$> P.takeWhile (/= 'n')
otherLine = (\k ts meta -> meta { pmeta_other_shit = (k,ts) : pmeta_other_shit meta })
<$> bamkey <*> (tabs >> P.sepBy1 tagother tabs)
tagother :: P.Parser (BamKey,Bytes)
tagother = (,) <$> bamkey <*> (P.char ':' >> pall)
tabs = P.char '\t' >> P.skipWhile (== '\t')
pall :: P.Parser Bytes
pall = P.takeWhile (\c -> c/='\t' && c/='\n')
bamkey :: P.Parser BamKey
bamkey = (\a b -> fromString [a,b]) <$> P.anyChar <*> P.anyChar
showBamMeta :: BamMeta -> Builder
showBamMeta (BamMeta h (Refs ss) pgs os cs) =
show_bam_meta_hdr h <>
foldMap show_bam_meta_seq ss <>
show_bam_pgs <>
foldMap show_bam_meta_other os <>
foldMap show_bam_meta_comment cs
where
show_bam_meta_hdr (BamHeader (major,minor) so os') =
"@HD\tVN:" <>
intDec major <> char7 '.' <> intDec minor <>
byteString (case so of Unsorted -> "\tSO:unsorted"
Grouped -> "\tSO:grouped"
Queryname -> "\tSO:queryname"
Coordinate -> "\tSO:coordinate"
Unknown -> mempty) <>
show_bam_others os'
show_bam_meta_seq (BamSQ nm ln ts) =
byteString "@SQ\tSN:" <> byteString nm <>
byteString "\tLN:" <> intDec ln <> show_bam_others ts
show_bam_meta_comment cm = byteString "@CO\t" <> byteString cm <> char7 '\n'
show_bam_meta_other (BamKey k,ts) =
char7 '@' <> word16LE k <> show_bam_others ts
show_bam_others ts =
foldMap show_bam_other ts <> char7 '\n'
show_bam_other (BamKey k,v) =
char7 '\t' <> word16LE k <> char7 ':' <> byteString v
show_bam_pgs = snd $ evalRWS (mapM_ show_bam_pg pgs) () (H.empty, H.empty)
show_bam_pg p@(Fix (BamPG pn pp po)) = do
ppid <- case pp of Nothing -> return Nothing
Just p' -> Just <$> show_bam_pg p'
gets (H.lookup p . fst) >>= \case
Just pid -> return pid
Nothing -> do
let pn' = case dropWhile isDigit . reverse $ S.unpack pn of
'-':xs -> reverse xs
_ -> S.unpack pn
pid <- gets $ \(_,hs) ->
head . filter (not . flip H.member hs) $
pn : S.pack pn' : [ S.pack $ pn' ++ '-' : (show i) | i <- [2::Int ..] ]
modify . first $ H.insert p pid
modify . second $ H.insert pid ()
tell $ byteString "@PG\tID:" <> byteString pid <>
maybe mempty (\x -> byteString "\tPP:" <> byteString x) ppid <>
show_bam_others po
return pid
newtype Refseq = Refseq { unRefseq :: Word32 } deriving (Eq, Ord, Ix, Bounded, Hashable)
instance Show Refseq where
showsPrec p (Refseq r) = showsPrec p r
instance Enum Refseq where
succ = Refseq . succ . unRefseq
pred = Refseq . pred . unRefseq
toEnum = Refseq . fromIntegral
fromEnum = fromIntegral . unRefseq
enumFrom = map Refseq . enumFrom . unRefseq
enumFromThen (Refseq a) (Refseq b) = map Refseq $ enumFromThen a b
enumFromTo (Refseq a) (Refseq b) = map Refseq $ enumFromTo a b
enumFromThenTo (Refseq a) (Refseq b) (Refseq c) = map Refseq $ enumFromThenTo a b c
isValidRefseq :: Refseq -> Bool
isValidRefseq = (/=) invalidRefseq
invalidRefseq :: Refseq
invalidRefseq = Refseq 0xffffffff
{-# INLINE invalidPos #-}
invalidPos :: Int
invalidPos = -1
{-# INLINE isValidPos #-}
isValidPos :: Int -> Bool
isValidPos = (/=) invalidPos
{-# INLINE unknownMapq #-}
unknownMapq :: Int
unknownMapq = 255
isKnownMapq :: Int -> Bool
isKnownMapq = (/=) unknownMapq
newtype Refs = Refs { unRefs :: V.Vector BamSQ } deriving Show
instance Monoid Refs where
mempty = Refs V.empty
mappend = (<>)
instance Semigroup Refs where
Refs a <> Refs b = Refs . V.fromList . nubHash $ V.toList a ++ V.toList b
getRef :: Refs -> Refseq -> BamSQ
getRef (Refs refs) (Refseq i) = fromMaybe (BamSQ "*" 0 []) $ refs V.!? fromIntegral i
flagPaired, flagProperlyPaired, flagUnmapped, flagMateUnmapped,
flagReversed, flagMateReversed, flagFirstMate, flagSecondMate,
flagAuxillary, flagSecondary, flagFailsQC, flagDuplicate,
flagSupplementary :: Int
flagPaired = 0x1
flagProperlyPaired = 0x2
flagUnmapped = 0x4
flagMateUnmapped = 0x8
flagReversed = 0x10
flagMateReversed = 0x20
flagFirstMate = 0x40
flagSecondMate = 0x80
flagAuxillary = 0x100
flagSecondary = 0x100
flagFailsQC = 0x200
flagDuplicate = 0x400
flagSupplementary = 0x800
eflagTrimmed, eflagMerged, eflagAlternative, eflagExactIndex :: Int
eflagTrimmed = 0x1
eflagMerged = 0x2
eflagAlternative = 0x4
eflagExactIndex = 0x8
compareNames :: Bytes -> Bytes -> Ordering
compareNames n m = case (uncons n, uncons m) of
( Nothing, Nothing ) -> EQ
( Just _, Nothing ) -> GT
( Nothing, Just _ ) -> LT
( Just (c,n'), Just (d,m') )
| is_digit c || is_digit d
, Just (u,n'') <- S.readInt n
, Just (v,m'') <- S.readInt m ->
case u `compare` v of
LT -> LT
GT -> GT
EQ -> n'' `compareNames` m''
| otherwise ->
case c `compare` d of
LT -> LT
GT -> GT
EQ -> n' `compareNames` m'
where
is_digit c = c2w '0' <= c && c <= c2w '9'
data MdOp = MdNum Int | MdRep Nucleotides | MdDel [Nucleotides] deriving Show
readMd :: Bytes -> Maybe [MdOp]
readMd s | S.null s = return []
| isDigit (S.head s) = do (n,t) <- S.readInt s
(MdNum n :) <$> readMd t
| S.head s == '^' = let (a,b) = S.break isDigit (S.tail s)
in (MdDel (map toNucleotides $ B.unpack a) :) <$> readMd b
| otherwise = (MdRep (toNucleotides $ B.head s) :) <$> readMd (S.tail s)
showMd :: [MdOp] -> Bytes
showMd = S.pack . flip s1 []
where
s1 (MdNum i : MdNum j : ms) = s1 (MdNum (i+j) : ms)
s1 (MdNum 0 : ms) = s1 ms
s1 (MdNum i : ms) = shows i . s1 ms
s1 (MdRep r : ms) = shows r . s1 ms
s1 (MdDel d1 : MdDel d2 : ms) = s1 (MdDel (d1++d2) : ms)
s1 (MdDel [] : ms) = s1 ms
s1 (MdDel ns : MdRep r : ms) = (:) '^' . shows ns . (:) '0' . shows r . s1 ms
s1 (MdDel ns : ms) = (:) '^' . shows ns . s1 ms
s1 [ ] = id
distinctBin :: Int -> Int -> Int
distinctBin beg len = mkbin 14 $ mkbin 17 $ mkbin 20 $ mkbin 23 $ mkbin 26 0
where end = beg + len - 1
mkbin n x = if beg `shiftR` n /= end `shiftR` n then x
else ((1 `shiftL` (29-n))-1) `div` 7 + (beg `shiftR` n)