{-# LANGUAGE OverloadedStrings, BangPatterns #-}
module Bio.Bam.Header (
        BamMeta(..),
        parseBamMeta,
        parseBamMetaLine,
        showBamMeta,
        addPG,

        BamKey(..),
        BamHeader(..),
        BamSQ(..),
        BamSorting(..),
        BamOtherShit,

        Refseq(..),
        invalidRefseq,
        isValidRefseq,
        invalidPos,
        isValidPos,
        unknownMapq,
        isKnownMapq,

        Refs,
        noRefs,
        getRef,

        compareNames,

        flagPaired,
        flagProperlyPaired,
        flagUnmapped,
        flagMateUnmapped,
        flagReversed,
        flagMateReversed,
        flagFirstMate,
        flagSecondMate,
        flagAuxillary,
        flagFailsQC,
        flagDuplicate,
        eflagTrimmed,
        eflagMerged,

        distinctBin,

        MdOp(..),
        readMd,
        showMd
    ) where

import Bio.Base
import Control.Applicative
import Data.Bits                    ( shiftL, shiftR, (.&.), (.|.) )
import Data.Char                    ( isDigit, ord, chr )
import Data.ByteString.Builder
import Data.Ix
import Data.List                    ( (\\), foldl' )
import Data.Sequence                ( (><), (|>) )
import Data.String
import Data.Version                 ( Version, showVersion )
import Data.Word                    ( Word16, Word32 )
import System.Environment           ( getArgs, getProgName )

import qualified Data.Attoparsec.ByteString.Char8   as P
import qualified Data.ByteString                    as B
import qualified Data.ByteString.Char8              as S
import qualified Data.Foldable                      as F
import qualified Data.Sequence                      as Z

data BamMeta = BamMeta {
        meta_hdr :: !BamHeader,
        meta_refs :: !Refs,
        meta_other_shit :: [(BamKey, BamOtherShit)],
        meta_comment :: [S.ByteString]
    } deriving Show

-- | Exactly two characters, for the \"named\" fields in bam.
newtype BamKey = BamKey Word16
    deriving ( Eq, Ord )

instance IsString BamKey where
    {-# INLINE fromString #-}
    fromString [a,b]
        | ord a < 256 && ord b < 256
            = BamKey . fromIntegral $ ord a .|. shiftL (ord b) 8

    fromString s
            = error $ "Not a legal BAM key: " ++ show s

instance Show BamKey where
    show (BamKey a) = [ chr (fromIntegral a .&. 0xff), chr (shiftR (fromIntegral a) 8 .&. 0xff) ]

addPG :: Maybe Version -> IO (BamMeta -> BamMeta)
addPG vn = do
    args <- getArgs
    pn   <- getProgName
    return $ go args pn
  where
    go args pn bm = bm { meta_other_shit = ("PG",pg_line) : meta_other_shit bm }
      where
        pg_line = concat [ [ ("ID", pg_id) ]
                         , [ ("PN", S.pack pn) ]
                         , [ ("CL", S.pack $ unwords args) ]
                         , maybe [] (\v -> [("VN",S.pack (showVersion v))]) vn
                         , map (\p -> ("PP",p)) (take 1 pg_pp)
                         , map (\p -> ("pp",p)) (drop 1 pg_pp) ]

        pg_id : _ = filter (not . flip elem pg_ids) . map S.pack $
                      pn : [ pn ++ '-' : show i | i <- [(1::Int)..] ]

        pg_ids = [ pgid | ("PG",fs) <- meta_other_shit bm, ("ID",pgid) <- fs ]
        pg_pps = [ pgid | ("PG",fs) <- meta_other_shit bm, ("PP",pgid) <- fs ]

        pg_pp  = pg_ids \\ pg_pps


instance Monoid BamMeta where
    mempty = BamMeta mempty noRefs [] []
    a `mappend` b = BamMeta { meta_hdr = meta_hdr a `mappend` meta_hdr b
                            , meta_refs = meta_refs a >< meta_refs b
                            , meta_other_shit = meta_other_shit a ++ meta_other_shit b
                            , meta_comment = meta_comment a ++ meta_comment b }

data BamHeader = BamHeader {
        hdr_version :: (Int, Int),
        hdr_sorting :: !BamSorting,
        hdr_other_shit :: BamOtherShit
    } deriving Show

instance Monoid BamHeader where
    mempty = BamHeader (1,0) Unknown []
    a `mappend` b = BamHeader { hdr_version = hdr_version a `min` hdr_version b
                              , hdr_sorting = let u = hdr_sorting a ; v = hdr_sorting b in if u == v then u else Unknown
                              , hdr_other_shit = hdr_other_shit a ++ hdr_other_shit b }

data BamSQ = BamSQ {
        sq_name :: Seqid,
        sq_length :: Int,
        sq_other_shit :: BamOtherShit
    } deriving Show

bad_seq :: BamSQ
bad_seq = BamSQ (error "no SN field") (error "no LN field") []

-- | Possible sorting orders from bam header.  Thanks to samtools, which
-- doesn't declare sorted files properly, we have to have the stupid
-- 'Unknown' state, too.
data BamSorting = Unknown | Unsorted | Grouped | Queryname | Coordinate | GroupSorted
    deriving (Show, Eq)

type BamOtherShit = [(BamKey, S.ByteString)]

parseBamMeta :: P.Parser BamMeta
parseBamMeta = fixup . foldl' (flip ($)) mempty <$> P.sepBy parseBamMetaLine (P.skipWhile (=='\t') >> P.char '\n')
  where
    fixup meta = meta { meta_other_shit = reverse (meta_other_shit meta)
                      , meta_comment    = reverse (meta_comment    meta) }

parseBamMetaLine :: P.Parser (BamMeta -> BamMeta)
parseBamMetaLine = P.char '@' >> P.choice [hdLine, sqLine, coLine, otherLine]
  where
    hdLine = P.string "HD\t" >>
             (\fns meta -> let fixup hdr = hdr { hdr_other_shit = reverse (hdr_other_shit hdr) }
                           in meta { meta_hdr = fixup $! foldl' (flip ($)) (meta_hdr meta) fns })
               <$> P.sepBy1 (P.choice [hdvn, hdso, hdother]) tabs

    sqLine = P.string "SQ\t" >>
             (\fns meta -> let fixup sq = sq { sq_other_shit = reverse (sq_other_shit sq) }
                               !s = fixup $ foldl' (flip ($)) bad_seq fns
                           in meta { meta_refs = meta_refs meta |> s })
               <$> P.sepBy1 (P.choice [sqnm, sqln, sqother]) tabs

    hdvn = P.string "VN:" >>
           (\a b hdr -> hdr { hdr_version = (a,b) })
             <$> P.decimal <*> ((P.char '.' <|> P.char ':') >> P.decimal)

    hdso = P.string "SO:" >>
           (\s hdr -> hdr { hdr_sorting = s })
             <$> P.choice [ Grouped     <$ P.string "grouped"
                          , Queryname   <$ P.string "queryname"
                          , Coordinate  <$ P.string "coordinate"
                          , GroupSorted <$ P.string "groupsort"
                          , Unsorted    <$ P.string "unsorted"
                          , Unknown     <$ P.skipWhile (\c -> c/='\t' && c/='\n') ]

    sqnm = P.string "SN:" >> (\s sq -> sq { sq_name = s }) <$> pall
    sqln = P.string "LN:" >> (\i sq -> sq { sq_length = i }) <$> P.decimal

    hdother = (\t hdr -> t `seq` hdr { hdr_other_shit = t : hdr_other_shit hdr }) <$> tagother
    sqother = (\t sq  -> t `seq` sq  { sq_other_shit = t : sq_other_shit sq }) <$> tagother

    coLine = P.string "CO\t" >>
             (\s meta -> s `seq` meta { meta_comment = s : meta_comment meta })
               <$> P.takeWhile (/= 'n')

    otherLine = (\k ts meta -> meta { meta_other_shit = (k,ts) : meta_other_shit meta })
                  <$> bamkey <*> (tabs >> P.sepBy1 tagother tabs)

    tagother :: P.Parser (BamKey,S.ByteString)
    tagother = (,) <$> bamkey <*> (P.char ':' >> pall)

    tabs = P.char '\t' >> P.skipWhile (== '\t')

    pall :: P.Parser S.ByteString
    pall = P.takeWhile (\c -> c/='\t' && c/='\n')

    bamkey :: P.Parser BamKey
    bamkey = (\a b -> fromString [a,b]) <$> P.anyChar <*> P.anyChar

showBamMeta :: BamMeta -> Builder
showBamMeta (BamMeta h ss os cs) =
    show_bam_meta_hdr h <>
    F.foldMap show_bam_meta_seq ss <>
    F.foldMap show_bam_meta_other os <>
    F.foldMap show_bam_meta_comment cs
  where
    show_bam_meta_hdr (BamHeader (major,minor) so os') =
        byteString "@HD\tVN:" <>
        intDec major <> char7 '.' <> intDec minor <>
        byteString (case so of Unknown     -> B.empty
                               Unsorted    -> "\tSO:unsorted"
                               Grouped     -> "\tSO:grouped"
                               Queryname   -> "\tSO:queryname"
                               Coordinate  -> "\tSO:coordinate"
                               GroupSorted -> "\tSO:groupsort") <>
        show_bam_others os'

    show_bam_meta_seq (BamSQ  _  _ []) = mempty
    show_bam_meta_seq (BamSQ nm ln ts) =
        byteString "@SQ\tSN:" <> byteString nm <>
        byteString "\tLN:" <> intDec ln <> show_bam_others ts

    show_bam_meta_comment cm = byteString "@CO\t" <> byteString cm <> char7 '\n'

    show_bam_meta_other (BamKey k,ts) =
        char7 '@' <> word16LE k <> show_bam_others ts

    show_bam_others ts =
        F.foldMap show_bam_other ts <> char7 '\n'

    show_bam_other (BamKey k,v) =
        char7 '\t' <> word16LE k <> char7 ':' <> byteString v


-- | Reference sequence in Bam
-- Bam enumerates the reference sequences and then sorts by index.  We
-- need to track that index if we want to reproduce the sorting order.
newtype Refseq = Refseq { unRefseq :: Word32 } deriving (Eq, Ord, Ix)

instance Show Refseq where
    showsPrec p (Refseq r) = showsPrec p r

instance Enum Refseq where
    succ = Refseq . succ . unRefseq
    pred = Refseq . pred . unRefseq
    toEnum = Refseq . fromIntegral
    fromEnum = fromIntegral . unRefseq
    enumFrom = map Refseq . enumFrom . unRefseq
    enumFromThen (Refseq a) (Refseq b) = map Refseq $ enumFromThen a b
    enumFromTo (Refseq a) (Refseq b) = map Refseq $ enumFromTo a b
    enumFromThenTo (Refseq a) (Refseq b) (Refseq c) = map Refseq $ enumFromThenTo a b c


-- | Tests whether a reference sequence is valid.
-- Returns true unless the the argument equals @invalidRefseq@.
isValidRefseq :: Refseq -> Bool
isValidRefseq = (/=) invalidRefseq

-- | The invalid Refseq.
-- Bam uses this value to encode a missing reference sequence.
invalidRefseq :: Refseq
invalidRefseq = Refseq 0xffffffff

-- | The invalid position.
-- Bam uses this value to encode a missing position.
{-# INLINE invalidPos #-}
invalidPos :: Int
invalidPos = -1

-- | Tests whether a position is valid.
-- Returns true unless the the argument equals @invalidPos@.
{-# INLINE isValidPos #-}
isValidPos :: Int -> Bool
isValidPos = (/=) invalidPos

{-# INLINE unknownMapq #-}
unknownMapq :: Int
unknownMapq = 255

isKnownMapq :: Int -> Bool
isKnownMapq = (/=) unknownMapq

-- | A list of reference sequences.
type Refs = Z.Seq BamSQ

-- | The empty list of references.  Needed for BAM files that don't really store alignments.
noRefs :: Refs
noRefs = Z.empty

getRef :: Refs -> Refseq -> BamSQ
getRef refs (Refseq i)
    | 0 <= i && fromIntegral i < Z.length refs = Z.index refs (fromIntegral i)
    | otherwise                                = BamSQ "*" 0 []


flagPaired, flagProperlyPaired, flagUnmapped, flagMateUnmapped, flagReversed, flagMateReversed, flagFirstMate, flagSecondMate,
 flagAuxillary, flagFailsQC, flagDuplicate :: Int

flagPaired = 0x1
flagProperlyPaired = 0x2
flagUnmapped = 0x4
flagMateUnmapped = 0x8
flagReversed = 0x10
flagMateReversed = 0x20
flagFirstMate = 0x40
flagSecondMate = 0x80
flagAuxillary = 0x100
flagFailsQC = 0x200
flagDuplicate = 0x400

eflagTrimmed, eflagMerged :: Int
eflagTrimmed       = 0x1
eflagMerged        = 0x2


-- | Compares two sequence names the way samtools does.
-- samtools sorts by "strnum_cmp":
-- . if both strings start with a digit, parse the initial
--   sequence of digits and compare numerically, if equal,
--   continue behind the numbers
-- . else compare the first characters (possibly NUL), if equal
--   continue behind them
-- . else both strings ended and the shorter one counts as
--   smaller (and that part is stupid)

compareNames :: Seqid -> Seqid -> Ordering
compareNames n m = case (B.uncons n, B.uncons m) of
        ( Nothing, Nothing ) -> EQ
        ( Just  _, Nothing ) -> GT
        ( Nothing, Just  _ ) -> LT
        ( Just (c,n'), Just (d,m') )
            | is_digit c && is_digit d ->
                let Just (u,n'') = S.readInt n
                    Just (v,m'') = S.readInt m
                in case u `compare` v of
                    LT -> LT
                    GT -> GT
                    EQ -> n'' `compareNames` m''
            | otherwise -> case c `compare` d of
                    LT -> LT
                    GT -> GT
                    EQ -> n' `compareNames` m'
  where
    is_digit c = 48 <= c && c < 58


data MdOp = MdNum Int | MdRep Nucleotides | MdDel [Nucleotides] deriving Show

readMd :: S.ByteString -> Maybe [MdOp]
readMd s | S.null s           = return []
         | isDigit (S.head s) = do (n,t) <- S.readInt s
                                   (MdNum n :) <$> readMd t
         | S.head s == '^'    = let (a,b) = S.break isDigit (S.tail s)
                                in (MdDel (map toNucleotides $ S.unpack a) :) <$> readMd b
         | otherwise          = (MdRep (toNucleotides $ S.head s) :) <$> readMd (S.tail s)

-- | Normalizes a series of 'MdOp's and encodes them in the way BAM and
-- SAM expect it.
showMd :: [MdOp] -> S.ByteString
showMd = S.pack . flip s1 []
  where
    s1 (MdNum  i : MdNum  j : ms) = s1 (MdNum (i+j) : ms)
    s1 (MdNum  0            : ms) = s1 ms
    s1 (MdNum  i            : ms) = shows i . s1 ms

    s1 (MdRep  r            : ms) = shows r . s1 ms

    s1 (MdDel d1 : MdDel d2 : ms) = s1 (MdDel (d1++d2) : ms)
    s1 (MdDel []            : ms) = s1 ms
    s1 (MdDel ns : MdRep  r : ms) = (:) '^' . shows ns . (:) '0' . shows r . s1 ms
    s1 (MdDel ns            : ms) = (:) '^' . shows ns . s1 ms
    s1 [                        ] = id


-- | Computes the "distinct bin" according to the BAM binning scheme.  If
-- an alignment starts at @pos@ and its CIGAR implies a length of @len@
-- on the reference, then it goes into bin @distinctBin pos len@.
distinctBin :: Int -> Int -> Int
distinctBin beg len = mkbin 14 $ mkbin 17 $ mkbin 20 $ mkbin 23 $ mkbin 26 0
  where end = beg + len - 1
        mkbin n x = if beg `shiftR` n /= end `shiftR` n then x
                    else ((1 `shiftL` (29-n))-1) `div` 7 + (beg `shiftR` n)