module Bio.Bam.Header (
BamMeta(..),
parseBamMeta,
parseBamMetaLine,
showBamMeta,
addPG,
BamKey(..),
BamHeader(..),
BamSQ(..),
BamSorting(..),
BamOtherShit,
Refseq(..),
invalidRefseq,
isValidRefseq,
invalidPos,
isValidPos,
unknownMapq,
isKnownMapq,
Refs,
noRefs,
getRef,
compareNames,
flagPaired,
flagProperlyPaired,
flagUnmapped,
flagMateUnmapped,
flagReversed,
flagMateReversed,
flagFirstMate,
flagSecondMate,
flagAuxillary,
flagFailsQC,
flagDuplicate,
eflagTrimmed,
eflagMerged,
distinctBin,
MdOp(..),
readMd,
showMd
) where
import Bio.Base
import Control.Applicative
import Data.Bits ( shiftL, shiftR, (.&.), (.|.) )
import Data.Char ( isDigit, ord, chr )
import Data.ByteString.Builder
import Data.Ix
import Data.List ( (\\), foldl' )
import Data.Sequence ( (><), (|>) )
import Data.String
import Data.Version ( Version, showVersion )
import Data.Word ( Word16, Word32 )
import System.Environment ( getArgs, getProgName )
import qualified Data.Attoparsec.ByteString.Char8 as P
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as S
import qualified Data.Foldable as F
import qualified Data.Sequence as Z
data BamMeta = BamMeta {
meta_hdr :: !BamHeader,
meta_refs :: !Refs,
meta_other_shit :: [(BamKey, BamOtherShit)],
meta_comment :: [S.ByteString]
} deriving Show
newtype BamKey = BamKey Word16
deriving ( Eq, Ord )
instance IsString BamKey where
fromString [a,b]
| ord a < 256 && ord b < 256
= BamKey . fromIntegral $ ord a .|. shiftL (ord b) 8
fromString s
= error $ "Not a legal BAM key: " ++ show s
instance Show BamKey where
show (BamKey a) = [ chr (fromIntegral a .&. 0xff), chr (shiftR (fromIntegral a) 8 .&. 0xff) ]
addPG :: Maybe Version -> IO (BamMeta -> BamMeta)
addPG vn = do
args <- getArgs
pn <- getProgName
return $ go args pn
where
go args pn bm = bm { meta_other_shit = ("PG",pg_line) : meta_other_shit bm }
where
pg_line = concat [ [ ("ID", pg_id) ]
, [ ("PN", S.pack pn) ]
, [ ("CL", S.pack $ unwords args) ]
, maybe [] (\v -> [("VN",S.pack (showVersion v))]) vn
, map (\p -> ("PP",p)) (take 1 pg_pp)
, map (\p -> ("pp",p)) (drop 1 pg_pp) ]
pg_id : _ = filter (not . flip elem pg_ids) . map S.pack $
pn : [ pn ++ '-' : show i | i <- [(1::Int)..] ]
pg_ids = [ pgid | ("PG",fs) <- meta_other_shit bm, ("ID",pgid) <- fs ]
pg_pps = [ pgid | ("PG",fs) <- meta_other_shit bm, ("PP",pgid) <- fs ]
pg_pp = pg_ids \\ pg_pps
instance Monoid BamMeta where
mempty = BamMeta mempty noRefs [] []
a `mappend` b = BamMeta { meta_hdr = meta_hdr a `mappend` meta_hdr b
, meta_refs = meta_refs a >< meta_refs b
, meta_other_shit = meta_other_shit a ++ meta_other_shit b
, meta_comment = meta_comment a ++ meta_comment b }
data BamHeader = BamHeader {
hdr_version :: (Int, Int),
hdr_sorting :: !BamSorting,
hdr_other_shit :: BamOtherShit
} deriving Show
instance Monoid BamHeader where
mempty = BamHeader (1,0) Unknown []
a `mappend` b = BamHeader { hdr_version = hdr_version a `min` hdr_version b
, hdr_sorting = let u = hdr_sorting a ; v = hdr_sorting b in if u == v then u else Unknown
, hdr_other_shit = hdr_other_shit a ++ hdr_other_shit b }
data BamSQ = BamSQ {
sq_name :: Seqid,
sq_length :: Int,
sq_other_shit :: BamOtherShit
} deriving Show
bad_seq :: BamSQ
bad_seq = BamSQ (error "no SN field") (error "no LN field") []
data BamSorting = Unknown | Unsorted | Grouped | Queryname | Coordinate | GroupSorted
deriving (Show, Eq)
type BamOtherShit = [(BamKey, S.ByteString)]
parseBamMeta :: P.Parser BamMeta
parseBamMeta = fixup . foldl' (flip ($)) mempty <$> P.sepBy parseBamMetaLine (P.skipWhile (=='\t') >> P.char '\n')
where
fixup meta = meta { meta_other_shit = reverse (meta_other_shit meta)
, meta_comment = reverse (meta_comment meta) }
parseBamMetaLine :: P.Parser (BamMeta -> BamMeta)
parseBamMetaLine = P.char '@' >> P.choice [hdLine, sqLine, coLine, otherLine]
where
hdLine = P.string "HD\t" >>
(\fns meta -> let fixup hdr = hdr { hdr_other_shit = reverse (hdr_other_shit hdr) }
in meta { meta_hdr = fixup $! foldl' (flip ($)) (meta_hdr meta) fns })
<$> P.sepBy1 (P.choice [hdvn, hdso, hdother]) tabs
sqLine = P.string "SQ\t" >>
(\fns meta -> let fixup sq = sq { sq_other_shit = reverse (sq_other_shit sq) }
!s = fixup $ foldl' (flip ($)) bad_seq fns
in meta { meta_refs = meta_refs meta |> s })
<$> P.sepBy1 (P.choice [sqnm, sqln, sqother]) tabs
hdvn = P.string "VN:" >>
(\a b hdr -> hdr { hdr_version = (a,b) })
<$> P.decimal <*> ((P.char '.' <|> P.char ':') >> P.decimal)
hdso = P.string "SO:" >>
(\s hdr -> hdr { hdr_sorting = s })
<$> P.choice [ Grouped <$ P.string "grouped"
, Queryname <$ P.string "queryname"
, Coordinate <$ P.string "coordinate"
, GroupSorted <$ P.string "groupsort"
, Unsorted <$ P.string "unsorted"
, Unknown <$ P.skipWhile (\c -> c/='\t' && c/='\n') ]
sqnm = P.string "SN:" >> (\s sq -> sq { sq_name = s }) <$> pall
sqln = P.string "LN:" >> (\i sq -> sq { sq_length = i }) <$> P.decimal
hdother = (\t hdr -> t `seq` hdr { hdr_other_shit = t : hdr_other_shit hdr }) <$> tagother
sqother = (\t sq -> t `seq` sq { sq_other_shit = t : sq_other_shit sq }) <$> tagother
coLine = P.string "CO\t" >>
(\s meta -> s `seq` meta { meta_comment = s : meta_comment meta })
<$> P.takeWhile (/= 'n')
otherLine = (\k ts meta -> meta { meta_other_shit = (k,ts) : meta_other_shit meta })
<$> bamkey <*> (tabs >> P.sepBy1 tagother tabs)
tagother :: P.Parser (BamKey,S.ByteString)
tagother = (,) <$> bamkey <*> (P.char ':' >> pall)
tabs = P.char '\t' >> P.skipWhile (== '\t')
pall :: P.Parser S.ByteString
pall = P.takeWhile (\c -> c/='\t' && c/='\n')
bamkey :: P.Parser BamKey
bamkey = (\a b -> fromString [a,b]) <$> P.anyChar <*> P.anyChar
showBamMeta :: BamMeta -> Builder
showBamMeta (BamMeta h ss os cs) =
show_bam_meta_hdr h <>
F.foldMap show_bam_meta_seq ss <>
F.foldMap show_bam_meta_other os <>
F.foldMap show_bam_meta_comment cs
where
show_bam_meta_hdr (BamHeader (major,minor) so os') =
byteString "@HD\tVN:" <>
intDec major <> char7 '.' <> intDec minor <>
byteString (case so of Unknown -> B.empty
Unsorted -> "\tSO:unsorted"
Grouped -> "\tSO:grouped"
Queryname -> "\tSO:queryname"
Coordinate -> "\tSO:coordinate"
GroupSorted -> "\tSO:groupsort") <>
show_bam_others os'
show_bam_meta_seq (BamSQ _ _ []) = mempty
show_bam_meta_seq (BamSQ nm ln ts) =
byteString "@SQ\tSN:" <> byteString nm <>
byteString "\tLN:" <> intDec ln <> show_bam_others ts
show_bam_meta_comment cm = byteString "@CO\t" <> byteString cm <> char7 '\n'
show_bam_meta_other (BamKey k,ts) =
char7 '@' <> word16LE k <> show_bam_others ts
show_bam_others ts =
F.foldMap show_bam_other ts <> char7 '\n'
show_bam_other (BamKey k,v) =
char7 '\t' <> word16LE k <> char7 ':' <> byteString v
newtype Refseq = Refseq { unRefseq :: Word32 } deriving (Eq, Ord, Ix)
instance Show Refseq where
showsPrec p (Refseq r) = showsPrec p r
instance Enum Refseq where
succ = Refseq . succ . unRefseq
pred = Refseq . pred . unRefseq
toEnum = Refseq . fromIntegral
fromEnum = fromIntegral . unRefseq
enumFrom = map Refseq . enumFrom . unRefseq
enumFromThen (Refseq a) (Refseq b) = map Refseq $ enumFromThen a b
enumFromTo (Refseq a) (Refseq b) = map Refseq $ enumFromTo a b
enumFromThenTo (Refseq a) (Refseq b) (Refseq c) = map Refseq $ enumFromThenTo a b c
isValidRefseq :: Refseq -> Bool
isValidRefseq = (/=) invalidRefseq
invalidRefseq :: Refseq
invalidRefseq = Refseq 0xffffffff
invalidPos :: Int
invalidPos = 1
isValidPos :: Int -> Bool
isValidPos = (/=) invalidPos
unknownMapq :: Int
unknownMapq = 255
isKnownMapq :: Int -> Bool
isKnownMapq = (/=) unknownMapq
type Refs = Z.Seq BamSQ
noRefs :: Refs
noRefs = Z.empty
getRef :: Refs -> Refseq -> BamSQ
getRef refs (Refseq i)
| 0 <= i && fromIntegral i < Z.length refs = Z.index refs (fromIntegral i)
| otherwise = BamSQ "*" 0 []
flagPaired, flagProperlyPaired, flagUnmapped, flagMateUnmapped, flagReversed, flagMateReversed, flagFirstMate, flagSecondMate,
flagAuxillary, flagFailsQC, flagDuplicate :: Int
flagPaired = 0x1
flagProperlyPaired = 0x2
flagUnmapped = 0x4
flagMateUnmapped = 0x8
flagReversed = 0x10
flagMateReversed = 0x20
flagFirstMate = 0x40
flagSecondMate = 0x80
flagAuxillary = 0x100
flagFailsQC = 0x200
flagDuplicate = 0x400
eflagTrimmed, eflagMerged :: Int
eflagTrimmed = 0x1
eflagMerged = 0x2
compareNames :: Seqid -> Seqid -> Ordering
compareNames n m = case (B.uncons n, B.uncons m) of
( Nothing, Nothing ) -> EQ
( Just _, Nothing ) -> GT
( Nothing, Just _ ) -> LT
( Just (c,n'), Just (d,m') )
| is_digit c && is_digit d ->
let Just (u,n'') = S.readInt n
Just (v,m'') = S.readInt m
in case u `compare` v of
LT -> LT
GT -> GT
EQ -> n'' `compareNames` m''
| otherwise -> case c `compare` d of
LT -> LT
GT -> GT
EQ -> n' `compareNames` m'
where
is_digit c = 48 <= c && c < 58
data MdOp = MdNum Int | MdRep Nucleotides | MdDel [Nucleotides] deriving Show
readMd :: S.ByteString -> Maybe [MdOp]
readMd s | S.null s = return []
| isDigit (S.head s) = do (n,t) <- S.readInt s
(MdNum n :) <$> readMd t
| S.head s == '^' = let (a,b) = S.break isDigit (S.tail s)
in (MdDel (map toNucleotides $ S.unpack a) :) <$> readMd b
| otherwise = (MdRep (toNucleotides $ S.head s) :) <$> readMd (S.tail s)
showMd :: [MdOp] -> S.ByteString
showMd = S.pack . flip s1 []
where
s1 (MdNum i : MdNum j : ms) = s1 (MdNum (i+j) : ms)
s1 (MdNum 0 : ms) = s1 ms
s1 (MdNum i : ms) = shows i . s1 ms
s1 (MdRep r : ms) = shows r . s1 ms
s1 (MdDel d1 : MdDel d2 : ms) = s1 (MdDel (d1++d2) : ms)
s1 (MdDel [] : ms) = s1 ms
s1 (MdDel ns : MdRep r : ms) = (:) '^' . shows ns . (:) '0' . shows r . s1 ms
s1 (MdDel ns : ms) = (:) '^' . shows ns . s1 ms
s1 [ ] = id
distinctBin :: Int -> Int -> Int
distinctBin beg len = mkbin 14 $ mkbin 17 $ mkbin 20 $ mkbin 23 $ mkbin 26 0
where end = beg + len 1
mkbin n x = if beg `shiftR` n /= end `shiftR` n then x
else ((1 `shiftL` (29n))1) `div` 7 + (beg `shiftR` n)