{-# LANGUAGE Rank2Types #-}

-- | Pileup, similar to Samtools
--
-- Pileup turns a sorted sequence of reads into a sequence of \"piles\",
-- one for each site where a genetic variant might be called.  We will
-- scan each read's CIGAR line and MD field in concert with the sequence
-- and effective quality.  Effective quality is the lowest available
-- quality score of QUAL, MAPQ, and BQ.  For aDNA calling, a base is
-- represented as four probabilities, derived from a position dependent
-- damage model.

module Bio.Bam.Pileup
    ( PrimChunks(..)
    , PrimBase(..)
    , PosPrimChunks
    , DamagedBase(..)
    , DmgToken(..)
    , dissect
    , CallStats(..)
    , V_Nuc(..)
    , V_Nucs(..)
    , IndelVariant(..)
    , BasePile
    , IndelPile
    , Pile'(..)
    , Pile
    , pileup
    ) where

import Bio.Bam.Header
import Bio.Bam.Rec
import Bio.Prelude
import Bio.Streaming

import qualified Data.ByteString        as B
import qualified Data.Vector.Generic    as V
import qualified Data.Vector.Storable   as U
import qualified Streaming.Prelude      as Q

-- | The primitive pieces for genotype calling:  A position, a base
-- represented as four likelihoods, an inserted sequence, and the
-- length of a deleted sequence.  The logic is that we look at a base
-- followed by some indel, and all those indels are combined into a
-- single insertion and a single deletion.
data PrimChunks = Seek {-# UNPACK #-} !Int PrimBase                   -- ^ skip to position (at start or after N operation)
                | Indel [Nucleotides] [DamagedBase] PrimBase          -- ^ observed deletion and insertion between two bases
                | EndOfRead                                           -- ^ nothing anymore
  deriving Int -> PrimChunks -> ShowS
[PrimChunks] -> ShowS
PrimChunks -> String
(Int -> PrimChunks -> ShowS)
-> (PrimChunks -> String)
-> ([PrimChunks] -> ShowS)
-> Show PrimChunks
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [PrimChunks] -> ShowS
$cshowList :: [PrimChunks] -> ShowS
show :: PrimChunks -> String
$cshow :: PrimChunks -> String
showsPrec :: Int -> PrimChunks -> ShowS
$cshowsPrec :: Int -> PrimChunks -> ShowS
Show

data PrimBase = Base { PrimBase -> Int
_pb_wait   :: {-# UNPACK #-} !Int              -- ^ number of bases to wait due to a deletion
                     , PrimBase -> DamagedBase
_pb_base   :: {-# UNPACK #-} !DamagedBase
                     , PrimBase -> Qual
_pb_mapq   :: {-# UNPACK #-} !Qual             -- ^ map quality
                     , PrimBase -> PrimChunks
_pb_chunks ::                 PrimChunks }     -- ^ more chunks
  deriving Int -> PrimBase -> ShowS
[PrimBase] -> ShowS
PrimBase -> String
(Int -> PrimBase -> ShowS)
-> (PrimBase -> String) -> ([PrimBase] -> ShowS) -> Show PrimBase
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [PrimBase] -> ShowS
$cshowList :: [PrimBase] -> ShowS
show :: PrimBase -> String
$cshow :: PrimBase -> String
showsPrec :: Int -> PrimBase -> ShowS
$cshowsPrec :: Int -> PrimBase -> ShowS
Show

type PosPrimChunks = (Refseq, Int, Bool, PrimChunks)

-- | Represents our knowledge about a certain base, which consists of
-- the base itself (A,C,G,T, encoded as 0..3; no Ns), the quality score
-- (anything that isn't A,C,G,T becomes A with quality 0), and a
-- substitution matrix representing post-mortem but pre-sequencing
-- substitutions.
--
-- Unfortunately, none of this can be rolled into something more simple,
-- because damage and sequencing error behave so differently.
--
-- Damage information is polymorphic.  We might run with a simple
-- version (a matrix) for calling, but we need more (a matrix and a
-- mutable matrix, I think) for estimation.

data DamagedBase = DB { DamagedBase -> Nucleotide
db_call    :: {-# UNPACK #-} !Nucleotide           -- ^ called base
                      , DamagedBase -> Qual
db_qual    :: {-# UNPACK #-} !Qual                 -- ^ quality of called base
                      , DamagedBase -> DmgToken
db_dmg_tk  :: {-# UNPACK #-} !DmgToken             -- ^ damage information
                      , DamagedBase -> Int
db_dmg_pos :: {-# UNPACK #-} !Int                  -- ^ damage information
                      , DamagedBase -> Nucleotides
db_ref     :: {-# UNPACK #-} !Nucleotides }        -- ^ reference base from MD field

newtype DmgToken = DmgToken { DmgToken -> Int
fromDmgToken :: Int }

instance Show DamagedBase where
    showsPrec :: Int -> DamagedBase -> ShowS
showsPrec _ (DB n :: Nucleotide
n q :: Qual
q _ _ r :: Nucleotides
r)
        | Nucleotide -> Nucleotides
nucToNucs Nucleotide
n Nucleotides -> Nucleotides -> Bool
forall a. Eq a => a -> a -> Bool
== Nucleotides
r = Nucleotide -> ShowS
forall a. Show a => a -> ShowS
shows Nucleotide
n ShowS -> ShowS -> ShowS
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
.                     (:) '@' ShowS -> ShowS -> ShowS
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Qual -> ShowS
forall a. Show a => a -> ShowS
shows Qual
q
        | Bool
otherwise        = Nucleotide -> ShowS
forall a. Show a => a -> ShowS
shows Nucleotide
n ShowS -> ShowS -> ShowS
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. (:) '/' ShowS -> ShowS -> ShowS
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Nucleotides -> ShowS
forall a. Show a => a -> ShowS
shows Nucleotides
r ShowS -> ShowS -> ShowS
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. (:) '@' ShowS -> ShowS -> ShowS
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Qual -> ShowS
forall a. Show a => a -> ShowS
shows Qual
q


-- | Decomposes a BAM record into chunks suitable for piling up.  We
-- pick apart the CIGAR and MD fields, and combine them with sequence
-- and quality as appropriate.  Clipped bases are removed/skipped as
-- needed.  We also apply a substitution matrix to each base, which must
-- be supplied along with the read.
{-# INLINE dissect #-}
dissect :: DmgToken -> BamRaw -> [PosPrimChunks]
dissect :: DmgToken -> BamRaw -> [PosPrimChunks]
dissect dtok :: DmgToken
dtok br :: BamRaw
br =
    if BamRec -> Bool
isUnmapped BamRec
b Bool -> Bool -> Bool
|| BamRec -> Bool
isDuplicate BamRec
b Bool -> Bool -> Bool
|| Bool -> Bool
not (Refseq -> Bool
isValidRefseq Refseq
b_rname)
    then [] else [(Refseq
b_rname, Int
b_pos, BamRec -> Bool
isReversed BamRec
b, PrimChunks
pchunks)]
  where
    b :: BamRec
b@BamRec{..} = BamRaw -> BamRec
unpackBam BamRaw
br
    pchunks :: PrimChunks
pchunks = Int -> Int -> Int -> [MdOp] -> PrimChunks
firstBase Int
b_pos 0 0 ([MdOp] -> Maybe [MdOp] -> [MdOp]
forall a. a -> Maybe a -> a
fromMaybe [] (Maybe [MdOp] -> [MdOp]) -> Maybe [MdOp] -> [MdOp]
forall a b. (a -> b) -> a -> b
$ BamRec -> Maybe [MdOp]
getMd BamRec
b)

    !max_cig :: Int
max_cig = Vector Cigar -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
V.length Vector Cigar
b_cigar
    !max_seq :: Int
max_seq = Vector_Nucs_half Nucleotides -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
V.length Vector_Nucs_half Nucleotides
b_seq
    !baq :: Bytes
baq     = BamKey -> BamRec -> Bytes
extAsString "BQ" BamRec
b

    -- This will compute the effective quality.  As far as I can see
    -- from the BAM spec V1.4, the qualities that matter are QUAL, MAPQ,
    -- and BAQ.  If QUAL is invalid, we replace it (arbitrarily) with
    -- 23 (assuming a rather conservative error rate of ~0.5%), BAQ is
    -- added to QUAL, and MAPQ is an upper limit for effective quality.

    get_seq :: Int -> (Nucleotides -> Nucleotides) -> DamagedBase
    get_seq :: Int -> (Nucleotides -> Nucleotides) -> DamagedBase
get_seq i :: Int
i f :: Nucleotides -> Nucleotides
f = case Vector_Nucs_half Nucleotides
b_seq Vector_Nucs_half Nucleotides -> Int -> Nucleotides
forall (v :: * -> *) a. Vector v a => v a -> Int -> a
`V.unsafeIndex` Int
i of                                -- nucleotide
            n :: Nucleotides
n | Nucleotides
n Nucleotides -> Nucleotides -> Bool
forall a. Eq a => a -> a -> Bool
== Nucleotides
nucsA -> Nucleotide -> Qual -> DmgToken -> Int -> Nucleotides -> DamagedBase
DB Nucleotide
nucA Qual
qe DmgToken
dtok Int
dmg (Nucleotides -> Nucleotides
f Nucleotides
n)
              | Nucleotides
n Nucleotides -> Nucleotides -> Bool
forall a. Eq a => a -> a -> Bool
== Nucleotides
nucsC -> Nucleotide -> Qual -> DmgToken -> Int -> Nucleotides -> DamagedBase
DB Nucleotide
nucC Qual
qe DmgToken
dtok Int
dmg (Nucleotides -> Nucleotides
f Nucleotides
n)
              | Nucleotides
n Nucleotides -> Nucleotides -> Bool
forall a. Eq a => a -> a -> Bool
== Nucleotides
nucsG -> Nucleotide -> Qual -> DmgToken -> Int -> Nucleotides -> DamagedBase
DB Nucleotide
nucG Qual
qe DmgToken
dtok Int
dmg (Nucleotides -> Nucleotides
f Nucleotides
n)
              | Nucleotides
n Nucleotides -> Nucleotides -> Bool
forall a. Eq a => a -> a -> Bool
== Nucleotides
nucsT -> Nucleotide -> Qual -> DmgToken -> Int -> Nucleotides -> DamagedBase
DB Nucleotide
nucT Qual
qe DmgToken
dtok Int
dmg (Nucleotides -> Nucleotides
f Nucleotides
n)
              | Bool
otherwise  -> Nucleotide -> Qual -> DmgToken -> Int -> Nucleotides -> DamagedBase
DB Nucleotide
nucA (Word8 -> Qual
Q 0) DmgToken
dtok Int
dmg (Nucleotides -> Nucleotides
f Nucleotides
n)
      where
        !q :: Qual
q   = Qual -> (Vector Qual -> Qual) -> Maybe (Vector Qual) -> Qual
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (Word8 -> Qual
Q 23) (Vector Qual -> Int -> Qual
forall (v :: * -> *) a. Vector v a => v a -> Int -> a
`V.unsafeIndex` Int
i) Maybe (Vector Qual)
b_qual                          -- quality; invalid (0xff) becomes 30
        !q' :: Qual
q'  | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Bytes -> Int
B.length Bytes
baq = Qual
q                                            -- no BAQ available
             | Bool
otherwise = Word8 -> Qual
Q (Qual -> Word8
unQ Qual
q Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
+ (Bytes -> Int -> Word8
B.index Bytes
baq Int
i Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
- 64))                     -- else correct for BAQ
        !qe :: Qual
qe  = Qual -> Qual -> Qual
forall a. Ord a => a -> a -> a
min Qual
q' Qual
b_mapq                                                    -- use MAPQ as upper limit
        !dmg :: Int
dmg = if Int
iInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
max_seq then Int
iInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
max_seq else Int
i

    -- Look for first base following the read's start or a gap (CIGAR
    -- code N).  Indels are skipped, since these are either bugs in the
    -- aligner or the aligner getting rid of essentially unalignable
    -- bases.
    firstBase :: Int -> Int -> Int -> [MdOp] -> PrimChunks
    firstBase :: Int -> Int -> Int -> [MdOp] -> PrimChunks
firstBase !Int
pos !Int
is !Int
ic mds :: [MdOp]
mds
        | Int
is Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
max_seq Bool -> Bool -> Bool
|| Int
ic Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
max_cig = PrimChunks
EndOfRead
        | Bool
otherwise = case Vector Cigar
b_cigar Vector Cigar -> Int -> Cigar
forall (v :: * -> *) a. Vector v a => v a -> Int -> a
`V.unsafeIndex` Int
ic of
            Ins :* cl :: Int
cl ->            Int -> Int -> Int -> [MdOp] -> PrimChunks
firstBase  Int
pos (Int
clInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
is) (Int
icInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) [MdOp]
mds
            SMa :* cl :: Int
cl ->            Int -> Int -> Int -> [MdOp] -> PrimChunks
firstBase  Int
pos (Int
clInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
is) (Int
icInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) [MdOp]
mds
            Del :* cl :: Int
cl ->            Int -> Int -> Int -> [MdOp] -> PrimChunks
firstBase (Int
posInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
cl) Int
is  (Int
icInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) (Int -> [MdOp] -> [MdOp]
drop_del Int
cl [MdOp]
mds)
            Nop :* cl :: Int
cl ->            Int -> Int -> Int -> [MdOp] -> PrimChunks
firstBase (Int
posInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
cl) Int
is  (Int
icInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) [MdOp]
mds
            HMa :*  _ ->            Int -> Int -> Int -> [MdOp] -> PrimChunks
firstBase  Int
pos     Int
is  (Int
icInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) [MdOp]
mds
            Pad :*  _ ->            Int -> Int -> Int -> [MdOp] -> PrimChunks
firstBase  Int
pos     Int
is  (Int
icInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) [MdOp]
mds
            Mat :*  0 ->            Int -> Int -> Int -> [MdOp] -> PrimChunks
firstBase  Int
pos     Int
is  (Int
icInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) [MdOp]
mds
            Mat :*  _ -> Int -> PrimBase -> PrimChunks
Seek Int
pos (PrimBase -> PrimChunks) -> PrimBase -> PrimChunks
forall a b. (a -> b) -> a -> b
$ Int -> Int -> Int -> Int -> Int -> [MdOp] -> PrimBase
nextBase 0 Int
pos     Int
is   Int
ic 0  [MdOp]
mds
      where
        -- We have to treat (MdNum 0), because samtools actually
        -- generates(!) it all over the place and if not handled as a
        -- special case, it looks like an inconsistent MD field.
        drop_del :: Int -> [MdOp] -> [MdOp]
drop_del n :: Int
n (MdDel ns :: [Nucleotides]
ns : mds' :: [MdOp]
mds')
            | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< [Nucleotides] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Nucleotides]
ns = [Nucleotides] -> MdOp
MdDel (Int -> [Nucleotides] -> [Nucleotides]
forall a. Int -> [a] -> [a]
drop Int
n [Nucleotides]
ns) MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
: [MdOp]
mds'
            | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> [Nucleotides] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Nucleotides]
ns = Int -> [MdOp] -> [MdOp]
drop_del (Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
- [Nucleotides] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Nucleotides]
ns) [MdOp]
mds'
            | Bool
otherwise     = [MdOp]
mds'
        drop_del n :: Int
n (MdNum 0 : mds' :: [MdOp]
mds') = Int -> [MdOp] -> [MdOp]
drop_del Int
n [MdOp]
mds'
        drop_del _ mds' :: [MdOp]
mds'     = [MdOp]
mds'

    -- Generate likelihoods for the next base.  When this gets called,
    -- we are looking at an M CIGAR operation and all the subindices are
    -- valid.
    -- I don't think we can ever get (MdDel []), but then again, who
    -- knows what crazy shit samtools decides to generate.  There is
    -- little harm in special-casing it.
    nextBase :: Int -> Int -> Int -> Int -> Int -> [MdOp] -> PrimBase
    nextBase :: Int -> Int -> Int -> Int -> Int -> [MdOp] -> PrimBase
nextBase !Int
wt !Int
pos !Int
is !Int
ic !Int
io mds :: [MdOp]
mds = case [MdOp]
mds of
        MdNum   0 : mds' :: [MdOp]
mds' -> Int -> Int -> Int -> Int -> Int -> [MdOp] -> PrimBase
nextBase Int
wt Int
pos Int
is Int
ic Int
io [MdOp]
mds'
        MdDel  [] : mds' :: [MdOp]
mds' -> Int -> Int -> Int -> Int -> Int -> [MdOp] -> PrimBase
nextBase Int
wt Int
pos Int
is Int
ic Int
io [MdOp]
mds'
        MdNum   1 : mds' :: [MdOp]
mds' -> DamagedBase -> [MdOp] -> PrimBase
nextBase' (Int -> (Nucleotides -> Nucleotides) -> DamagedBase
get_seq Int
is Nucleotides -> Nucleotides
forall k (cat :: k -> k -> *) (a :: k). Category cat => cat a a
id   ) [MdOp]
mds'
        MdNum   n :: Int
n : mds' :: [MdOp]
mds' -> DamagedBase -> [MdOp] -> PrimBase
nextBase' (Int -> (Nucleotides -> Nucleotides) -> DamagedBase
get_seq Int
is Nucleotides -> Nucleotides
forall k (cat :: k -> k -> *) (a :: k). Category cat => cat a a
id   ) (Int -> MdOp
MdNum (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-1) MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
: [MdOp]
mds')
        MdRep ref :: Nucleotides
ref : mds' :: [MdOp]
mds' -> DamagedBase -> [MdOp] -> PrimBase
nextBase' (Int -> (Nucleotides -> Nucleotides) -> DamagedBase
get_seq Int
is ((Nucleotides -> Nucleotides) -> DamagedBase)
-> (Nucleotides -> Nucleotides) -> DamagedBase
forall a b. (a -> b) -> a -> b
$ Nucleotides -> Nucleotides -> Nucleotides
forall a b. a -> b -> a
const Nucleotides
ref  ) [MdOp]
mds'
        MdDel   _ : _    -> DamagedBase -> [MdOp] -> PrimBase
nextBase' (Int -> (Nucleotides -> Nucleotides) -> DamagedBase
get_seq Int
is ((Nucleotides -> Nucleotides) -> DamagedBase)
-> (Nucleotides -> Nucleotides) -> DamagedBase
forall a b. (a -> b) -> a -> b
$ Nucleotides -> Nucleotides -> Nucleotides
forall a b. a -> b -> a
const Nucleotides
nucsN) [MdOp]
mds
        [              ] -> DamagedBase -> [MdOp] -> PrimBase
nextBase' (Int -> (Nucleotides -> Nucleotides) -> DamagedBase
get_seq Int
is ((Nucleotides -> Nucleotides) -> DamagedBase)
-> (Nucleotides -> Nucleotides) -> DamagedBase
forall a b. (a -> b) -> a -> b
$ Nucleotides -> Nucleotides -> Nucleotides
forall a b. a -> b -> a
const Nucleotides
nucsN) [ ]
      where
        nextBase' :: DamagedBase -> [MdOp] -> PrimBase
nextBase' ref :: DamagedBase
ref mds' :: [MdOp]
mds' = Int -> DamagedBase -> Qual -> PrimChunks -> PrimBase
Base Int
wt DamagedBase
ref Qual
b_mapq (PrimChunks -> PrimBase) -> PrimChunks -> PrimBase
forall a b. (a -> b) -> a -> b
$ [[DamagedBase]]
-> [Nucleotides]
-> Int
-> Int
-> Int
-> Int
-> [MdOp]
-> PrimChunks
nextIndel  [] [] (Int
posInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) (Int
isInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) Int
ic (Int
ioInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) [MdOp]
mds'

    -- Look for the next indel after a base.  We collect all indels (I
    -- and D codes) into one combined operation.  If we hit N or the
    -- read's end, we drop all of it (indels next to a gap indicate
    -- trouble).  Other stuff is skipped: we could check for stuff that
    -- isn't valid in the middle of a read (H and S), but then what
    -- would we do about it anyway?  Just ignoring it is much easier and
    -- arguably at least as correct.
    nextIndel :: [[DamagedBase]] -> [Nucleotides] -> Int -> Int -> Int -> Int -> [MdOp] -> PrimChunks
    nextIndel :: [[DamagedBase]]
-> [Nucleotides]
-> Int
-> Int
-> Int
-> Int
-> [MdOp]
-> PrimChunks
nextIndel ins :: [[DamagedBase]]
ins del :: [Nucleotides]
del !Int
pos !Int
is !Int
ic !Int
io mds :: [MdOp]
mds
        | Int
is Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
max_seq Bool -> Bool -> Bool
|| Int
ic Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
max_cig = PrimChunks
EndOfRead
        | Bool
otherwise = case Vector Cigar
b_cigar Vector Cigar -> Int -> Cigar
forall (v :: * -> *) a. Vector v a => v a -> Int -> a
`V.unsafeIndex` Int
ic of
            Ins :* cl :: Int
cl ->             [[DamagedBase]]
-> [Nucleotides]
-> Int
-> Int
-> Int
-> Int
-> [MdOp]
-> PrimChunks
nextIndel (Int -> [[DamagedBase]]
isq Int
cl) [Nucleotides]
del   Int
pos (Int
clInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
is) (Int
icInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) 0 [MdOp]
mds
            SMa :* cl :: Int
cl ->             [[DamagedBase]]
-> [Nucleotides]
-> Int
-> Int
-> Int
-> Int
-> [MdOp]
-> PrimChunks
nextIndel  [[DamagedBase]]
ins     [Nucleotides]
del   Int
pos (Int
clInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
is) (Int
icInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) 0 [MdOp]
mds
            Del :* cl :: Int
cl ->             [[DamagedBase]]
-> [Nucleotides]
-> Int
-> Int
-> Int
-> Int
-> [MdOp]
-> PrimChunks
nextIndel [[DamagedBase]]
ins ([Nucleotides]
del[Nucleotides] -> [Nucleotides] -> [Nucleotides]
forall a. [a] -> [a] -> [a]
++[Nucleotides]
dsq) (Int
posInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
cl) Int
is (Int
icInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) 0 [MdOp]
mds'
                where (dsq :: [Nucleotides]
dsq,mds' :: [MdOp]
mds') = Int -> [MdOp] -> ([Nucleotides], [MdOp])
split_del Int
cl [MdOp]
mds
            Pad :*  _ ->             [[DamagedBase]]
-> [Nucleotides]
-> Int
-> Int
-> Int
-> Int
-> [MdOp]
-> PrimChunks
nextIndel  [[DamagedBase]]
ins     [Nucleotides]
del   Int
pos     Int
is  (Int
icInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) 0 [MdOp]
mds
            HMa :*  _ ->             [[DamagedBase]]
-> [Nucleotides]
-> Int
-> Int
-> Int
-> Int
-> [MdOp]
-> PrimChunks
nextIndel  [[DamagedBase]]
ins     [Nucleotides]
del   Int
pos     Int
is  (Int
icInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) 0 [MdOp]
mds
            Nop :* cl :: Int
cl ->             Int -> Int -> Int -> [MdOp] -> PrimChunks
firstBase               (Int
posInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
cl) Int
is  (Int
icInt -> Int -> Int
forall a. Num a => a -> a -> a
+1)   [MdOp]
mds      -- ends up generating a 'Seek'
            Mat :* cl :: Int
cl | Int
io Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
cl  -> [[DamagedBase]]
-> [Nucleotides]
-> Int
-> Int
-> Int
-> Int
-> [MdOp]
-> PrimChunks
nextIndel  [[DamagedBase]]
ins     [Nucleotides]
del   Int
pos     Int
is  (Int
icInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) 0 [MdOp]
mds
                      | Bool
otherwise -> [Nucleotides] -> [DamagedBase] -> PrimBase -> PrimChunks
indel [Nucleotides]
del [DamagedBase]
out (PrimBase -> PrimChunks) -> PrimBase -> PrimChunks
forall a b. (a -> b) -> a -> b
$ Int -> Int -> Int -> Int -> Int -> [MdOp] -> PrimBase
nextBase ([Nucleotides] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Nucleotides]
del) Int
pos Int
is Int
ic Int
io [MdOp]
mds -- ends up generating a 'Base'
      where
        indel :: [Nucleotides] -> [DamagedBase] -> PrimBase -> PrimChunks
indel d :: [Nucleotides]
d o :: [DamagedBase]
o k :: PrimBase
k = (DamagedBase -> PrimChunks -> PrimChunks)
-> PrimChunks -> [DamagedBase] -> PrimChunks
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr DamagedBase -> PrimChunks -> PrimChunks
forall a b. a -> b -> b
seq ([Nucleotides] -> [DamagedBase] -> PrimBase -> PrimChunks
Indel [Nucleotides]
d [DamagedBase]
o PrimBase
k) [DamagedBase]
o
        out :: [DamagedBase]
out    = [[DamagedBase]] -> [DamagedBase]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat ([[DamagedBase]] -> [DamagedBase])
-> [[DamagedBase]] -> [DamagedBase]
forall a b. (a -> b) -> a -> b
$ [[DamagedBase]] -> [[DamagedBase]]
forall a. [a] -> [a]
reverse [[DamagedBase]]
ins
        isq :: Int -> [[DamagedBase]]
isq cl :: Int
cl = [ Int -> (Nucleotides -> Nucleotides) -> DamagedBase
get_seq Int
i ((Nucleotides -> Nucleotides) -> DamagedBase)
-> (Nucleotides -> Nucleotides) -> DamagedBase
forall a b. (a -> b) -> a -> b
$ Nucleotides -> Nucleotides -> Nucleotides
forall a b. a -> b -> a
const Nucleotides
gap | Int
i <- [Int
is..Int
isInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
clInt -> Int -> Int
forall a. Num a => a -> a -> a
-1] ] [DamagedBase] -> [[DamagedBase]] -> [[DamagedBase]]
forall a. a -> [a] -> [a]
: [[DamagedBase]]
ins

        -- We have to treat (MdNum 0), because samtools actually
        -- generates(!) it all over the place and if not handled as a
        -- special case, it looks like an incinsistend MD field.
        split_del :: Int -> [MdOp] -> ([Nucleotides], [MdOp])
split_del n :: Int
n (MdDel ns :: [Nucleotides]
ns : mds' :: [MdOp]
mds')
            | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< [Nucleotides] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Nucleotides]
ns = (Int -> [Nucleotides] -> [Nucleotides]
forall a. Int -> [a] -> [a]
take Int
n [Nucleotides]
ns, [Nucleotides] -> MdOp
MdDel (Int -> [Nucleotides] -> [Nucleotides]
forall a. Int -> [a] -> [a]
drop Int
n [Nucleotides]
ns) MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
: [MdOp]
mds')
            | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> [Nucleotides] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Nucleotides]
ns = let (ns' :: [Nucleotides]
ns', mds'' :: [MdOp]
mds'') = Int -> [MdOp] -> ([Nucleotides], [MdOp])
split_del (Int
n Int -> Int -> Int
forall a. Num a => a -> a -> a
- [Nucleotides] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Nucleotides]
ns) [MdOp]
mds' in ([Nucleotides]
ns[Nucleotides] -> [Nucleotides] -> [Nucleotides]
forall a. [a] -> [a] -> [a]
++[Nucleotides]
ns', [MdOp]
mds'')
            | Bool
otherwise     = ([Nucleotides]
ns, [MdOp]
mds')
        split_del n :: Int
n (MdNum 0 : mds' :: [MdOp]
mds') = Int -> [MdOp] -> ([Nucleotides], [MdOp])
split_del Int
n [MdOp]
mds'
        split_del n :: Int
n mds' :: [MdOp]
mds'    = (Int -> Nucleotides -> [Nucleotides]
forall a. Int -> a -> [a]
replicate Int
n Nucleotides
nucsN, [MdOp]
mds')

-- | Statistics about a genotype call.  Probably only useful for
-- filtering (so not very useful), but we keep them because it's easy to
-- track them.

data CallStats = CallStats
    { CallStats -> Int
read_depth       :: {-# UNPACK #-} !Int       -- number of contributing reads
    , CallStats -> Int
reads_mapq0      :: {-# UNPACK #-} !Int       -- number of (non-)contributing reads with MAPQ==0
    , CallStats -> Int
sum_mapq         :: {-# UNPACK #-} !Int       -- sum of map qualities of contributing reads
    , CallStats -> Int
sum_mapq_squared :: {-# UNPACK #-} !Int }     -- sum of squared map qualities of contributing reads
  deriving (Int -> CallStats -> ShowS
[CallStats] -> ShowS
CallStats -> String
(Int -> CallStats -> ShowS)
-> (CallStats -> String)
-> ([CallStats] -> ShowS)
-> Show CallStats
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [CallStats] -> ShowS
$cshowList :: [CallStats] -> ShowS
show :: CallStats -> String
$cshow :: CallStats -> String
showsPrec :: Int -> CallStats -> ShowS
$cshowsPrec :: Int -> CallStats -> ShowS
Show, CallStats -> CallStats -> Bool
(CallStats -> CallStats -> Bool)
-> (CallStats -> CallStats -> Bool) -> Eq CallStats
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: CallStats -> CallStats -> Bool
$c/= :: CallStats -> CallStats -> Bool
== :: CallStats -> CallStats -> Bool
$c== :: CallStats -> CallStats -> Bool
Eq, (forall x. CallStats -> Rep CallStats x)
-> (forall x. Rep CallStats x -> CallStats) -> Generic CallStats
forall x. Rep CallStats x -> CallStats
forall x. CallStats -> Rep CallStats x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep CallStats x -> CallStats
$cfrom :: forall x. CallStats -> Rep CallStats x
Generic)

instance Monoid CallStats where
    mempty :: CallStats
mempty      = $WCallStats :: Int -> Int -> Int -> Int -> CallStats
CallStats { read_depth :: Int
read_depth       = 0
                            , reads_mapq0 :: Int
reads_mapq0      = 0
                            , sum_mapq :: Int
sum_mapq         = 0
                            , sum_mapq_squared :: Int
sum_mapq_squared = 0 }
    mappend :: CallStats -> CallStats -> CallStats
mappend     = CallStats -> CallStats -> CallStats
forall a. Semigroup a => a -> a -> a
(<>)

instance Semigroup CallStats where
    x :: CallStats
x <> :: CallStats -> CallStats -> CallStats
<> y :: CallStats
y = $WCallStats :: Int -> Int -> Int -> Int -> CallStats
CallStats { read_depth :: Int
read_depth       = CallStats -> Int
read_depth CallStats
x Int -> Int -> Int
forall a. Num a => a -> a -> a
+ CallStats -> Int
read_depth CallStats
y
                       , reads_mapq0 :: Int
reads_mapq0      = CallStats -> Int
reads_mapq0 CallStats
x Int -> Int -> Int
forall a. Num a => a -> a -> a
+ CallStats -> Int
reads_mapq0 CallStats
y
                       , sum_mapq :: Int
sum_mapq         = CallStats -> Int
sum_mapq CallStats
x Int -> Int -> Int
forall a. Num a => a -> a -> a
+ CallStats -> Int
sum_mapq CallStats
y
                       , sum_mapq_squared :: Int
sum_mapq_squared = CallStats -> Int
sum_mapq_squared CallStats
x Int -> Int -> Int
forall a. Num a => a -> a -> a
+ CallStats -> Int
sum_mapq_squared CallStats
y }

newtype V_Nuc  = V_Nuc  (U.Vector Nucleotide)  deriving (V_Nuc -> V_Nuc -> Bool
(V_Nuc -> V_Nuc -> Bool) -> (V_Nuc -> V_Nuc -> Bool) -> Eq V_Nuc
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: V_Nuc -> V_Nuc -> Bool
$c/= :: V_Nuc -> V_Nuc -> Bool
== :: V_Nuc -> V_Nuc -> Bool
$c== :: V_Nuc -> V_Nuc -> Bool
Eq, Eq V_Nuc
Eq V_Nuc =>
(V_Nuc -> V_Nuc -> Ordering)
-> (V_Nuc -> V_Nuc -> Bool)
-> (V_Nuc -> V_Nuc -> Bool)
-> (V_Nuc -> V_Nuc -> Bool)
-> (V_Nuc -> V_Nuc -> Bool)
-> (V_Nuc -> V_Nuc -> V_Nuc)
-> (V_Nuc -> V_Nuc -> V_Nuc)
-> Ord V_Nuc
V_Nuc -> V_Nuc -> Bool
V_Nuc -> V_Nuc -> Ordering
V_Nuc -> V_Nuc -> V_Nuc
forall a.
Eq a =>
(a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: V_Nuc -> V_Nuc -> V_Nuc
$cmin :: V_Nuc -> V_Nuc -> V_Nuc
max :: V_Nuc -> V_Nuc -> V_Nuc
$cmax :: V_Nuc -> V_Nuc -> V_Nuc
>= :: V_Nuc -> V_Nuc -> Bool
$c>= :: V_Nuc -> V_Nuc -> Bool
> :: V_Nuc -> V_Nuc -> Bool
$c> :: V_Nuc -> V_Nuc -> Bool
<= :: V_Nuc -> V_Nuc -> Bool
$c<= :: V_Nuc -> V_Nuc -> Bool
< :: V_Nuc -> V_Nuc -> Bool
$c< :: V_Nuc -> V_Nuc -> Bool
compare :: V_Nuc -> V_Nuc -> Ordering
$ccompare :: V_Nuc -> V_Nuc -> Ordering
$cp1Ord :: Eq V_Nuc
Ord, Int -> V_Nuc -> ShowS
[V_Nuc] -> ShowS
V_Nuc -> String
(Int -> V_Nuc -> ShowS)
-> (V_Nuc -> String) -> ([V_Nuc] -> ShowS) -> Show V_Nuc
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [V_Nuc] -> ShowS
$cshowList :: [V_Nuc] -> ShowS
show :: V_Nuc -> String
$cshow :: V_Nuc -> String
showsPrec :: Int -> V_Nuc -> ShowS
$cshowsPrec :: Int -> V_Nuc -> ShowS
Show)
newtype V_Nucs = V_Nucs (U.Vector Nucleotides) deriving (V_Nucs -> V_Nucs -> Bool
(V_Nucs -> V_Nucs -> Bool)
-> (V_Nucs -> V_Nucs -> Bool) -> Eq V_Nucs
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: V_Nucs -> V_Nucs -> Bool
$c/= :: V_Nucs -> V_Nucs -> Bool
== :: V_Nucs -> V_Nucs -> Bool
$c== :: V_Nucs -> V_Nucs -> Bool
Eq, Eq V_Nucs
Eq V_Nucs =>
(V_Nucs -> V_Nucs -> Ordering)
-> (V_Nucs -> V_Nucs -> Bool)
-> (V_Nucs -> V_Nucs -> Bool)
-> (V_Nucs -> V_Nucs -> Bool)
-> (V_Nucs -> V_Nucs -> Bool)
-> (V_Nucs -> V_Nucs -> V_Nucs)
-> (V_Nucs -> V_Nucs -> V_Nucs)
-> Ord V_Nucs
V_Nucs -> V_Nucs -> Bool
V_Nucs -> V_Nucs -> Ordering
V_Nucs -> V_Nucs -> V_Nucs
forall a.
Eq a =>
(a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: V_Nucs -> V_Nucs -> V_Nucs
$cmin :: V_Nucs -> V_Nucs -> V_Nucs
max :: V_Nucs -> V_Nucs -> V_Nucs
$cmax :: V_Nucs -> V_Nucs -> V_Nucs
>= :: V_Nucs -> V_Nucs -> Bool
$c>= :: V_Nucs -> V_Nucs -> Bool
> :: V_Nucs -> V_Nucs -> Bool
$c> :: V_Nucs -> V_Nucs -> Bool
<= :: V_Nucs -> V_Nucs -> Bool
$c<= :: V_Nucs -> V_Nucs -> Bool
< :: V_Nucs -> V_Nucs -> Bool
$c< :: V_Nucs -> V_Nucs -> Bool
compare :: V_Nucs -> V_Nucs -> Ordering
$ccompare :: V_Nucs -> V_Nucs -> Ordering
$cp1Ord :: Eq V_Nucs
Ord, Int -> V_Nucs -> ShowS
[V_Nucs] -> ShowS
V_Nucs -> String
(Int -> V_Nucs -> ShowS)
-> (V_Nucs -> String) -> ([V_Nucs] -> ShowS) -> Show V_Nucs
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [V_Nucs] -> ShowS
$cshowList :: [V_Nucs] -> ShowS
show :: V_Nucs -> String
$cshow :: V_Nucs -> String
showsPrec :: Int -> V_Nucs -> ShowS
$cshowsPrec :: Int -> V_Nucs -> ShowS
Show)

data IndelVariant = IndelVariant { IndelVariant -> V_Nucs
deleted_bases  :: !V_Nucs, IndelVariant -> V_Nuc
inserted_bases :: !V_Nuc }
      deriving (IndelVariant -> IndelVariant -> Bool
(IndelVariant -> IndelVariant -> Bool)
-> (IndelVariant -> IndelVariant -> Bool) -> Eq IndelVariant
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: IndelVariant -> IndelVariant -> Bool
$c/= :: IndelVariant -> IndelVariant -> Bool
== :: IndelVariant -> IndelVariant -> Bool
$c== :: IndelVariant -> IndelVariant -> Bool
Eq, Eq IndelVariant
Eq IndelVariant =>
(IndelVariant -> IndelVariant -> Ordering)
-> (IndelVariant -> IndelVariant -> Bool)
-> (IndelVariant -> IndelVariant -> Bool)
-> (IndelVariant -> IndelVariant -> Bool)
-> (IndelVariant -> IndelVariant -> Bool)
-> (IndelVariant -> IndelVariant -> IndelVariant)
-> (IndelVariant -> IndelVariant -> IndelVariant)
-> Ord IndelVariant
IndelVariant -> IndelVariant -> Bool
IndelVariant -> IndelVariant -> Ordering
IndelVariant -> IndelVariant -> IndelVariant
forall a.
Eq a =>
(a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: IndelVariant -> IndelVariant -> IndelVariant
$cmin :: IndelVariant -> IndelVariant -> IndelVariant
max :: IndelVariant -> IndelVariant -> IndelVariant
$cmax :: IndelVariant -> IndelVariant -> IndelVariant
>= :: IndelVariant -> IndelVariant -> Bool
$c>= :: IndelVariant -> IndelVariant -> Bool
> :: IndelVariant -> IndelVariant -> Bool
$c> :: IndelVariant -> IndelVariant -> Bool
<= :: IndelVariant -> IndelVariant -> Bool
$c<= :: IndelVariant -> IndelVariant -> Bool
< :: IndelVariant -> IndelVariant -> Bool
$c< :: IndelVariant -> IndelVariant -> Bool
compare :: IndelVariant -> IndelVariant -> Ordering
$ccompare :: IndelVariant -> IndelVariant -> Ordering
$cp1Ord :: Eq IndelVariant
Ord, Int -> IndelVariant -> ShowS
[IndelVariant] -> ShowS
IndelVariant -> String
(Int -> IndelVariant -> ShowS)
-> (IndelVariant -> String)
-> ([IndelVariant] -> ShowS)
-> Show IndelVariant
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [IndelVariant] -> ShowS
$cshowList :: [IndelVariant] -> ShowS
show :: IndelVariant -> String
$cshow :: IndelVariant -> String
showsPrec :: Int -> IndelVariant -> ShowS
$cshowsPrec :: Int -> IndelVariant -> ShowS
Show, (forall x. IndelVariant -> Rep IndelVariant x)
-> (forall x. Rep IndelVariant x -> IndelVariant)
-> Generic IndelVariant
forall x. Rep IndelVariant x -> IndelVariant
forall x. IndelVariant -> Rep IndelVariant x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep IndelVariant x -> IndelVariant
$cfrom :: forall x. IndelVariant -> Rep IndelVariant x
Generic)


-- | Map quality and a list of encountered bases, with damage
-- information and reference base if known.
type BasePile  =                          [DamagedBase]

-- | Map quality and a list of encountered indel variants.  The deletion
-- has the reference sequence, if known, an insertion has the inserted
-- sequence with damage information.
type IndelPile = [( Qual, ([Nucleotides], [DamagedBase]) )]   -- a list of indel variants

-- | Running pileup results in a series of piles.  A 'Pile' has the
-- basic statistics of a 'VarCall', but no likelihood values and a
-- pristine list of variants instead of a proper call.  We emit one pile
-- with two 'BasePile's (one for each strand) and one 'IndelPile' (the
-- one immediately following) at a time.

data Pile' a b = Pile { Pile' a b -> Refseq
p_refseq     :: {-# UNPACK #-} !Refseq
                      , Pile' a b -> Int
p_pos        :: {-# UNPACK #-} !Int
                      , Pile' a b -> CallStats
p_snp_stat   :: {-# UNPACK #-} !CallStats
                      , Pile' a b -> a
p_snp_pile   :: a
                      , Pile' a b -> CallStats
p_indel_stat :: {-# UNPACK #-} !CallStats
                      , Pile' a b -> b
p_indel_pile :: b }
  deriving Int -> Pile' a b -> ShowS
[Pile' a b] -> ShowS
Pile' a b -> String
(Int -> Pile' a b -> ShowS)
-> (Pile' a b -> String)
-> ([Pile' a b] -> ShowS)
-> Show (Pile' a b)
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
forall a b. (Show a, Show b) => Int -> Pile' a b -> ShowS
forall a b. (Show a, Show b) => [Pile' a b] -> ShowS
forall a b. (Show a, Show b) => Pile' a b -> String
showList :: [Pile' a b] -> ShowS
$cshowList :: forall a b. (Show a, Show b) => [Pile' a b] -> ShowS
show :: Pile' a b -> String
$cshow :: forall a b. (Show a, Show b) => Pile' a b -> String
showsPrec :: Int -> Pile' a b -> ShowS
$cshowsPrec :: forall a b. (Show a, Show b) => Int -> Pile' a b -> ShowS
Show

-- | Raw pile.  Bases and indels are piled separately on forward and
-- backward strands.
type Pile = Pile' (BasePile, BasePile) (IndelPile, IndelPile)

-- | The pileup enumeratee takes 'BamRaw's, dissects them, interleaves
-- the pieces appropriately, and generates 'Pile's.  The output will
-- contain at most one 'BasePile' and one 'IndelPile' for each position,
-- piles are sorted by position.
--
-- This top level driver receives 'BamRaw's.  Unaligned reads and
-- duplicates are skipped (but not those merely failing quality checks).
-- Processing stops when the first read with invalid 'br_rname' is
-- encountered or a t end of file.

{-# INLINE pileup #-}
pileup :: Monad m => Stream (Of PosPrimChunks) m b -> Stream (Of Pile) m b
pileup :: Stream (Of PosPrimChunks) m b -> Stream (Of Pile) m b
pileup = PileM m () -> (() -> PileF m b) -> PileF m b
forall (m :: * -> *) a.
PileM m a -> forall r. (a -> PileF m r) -> PileF m r
runPileM PileM m ()
forall (m :: * -> *). Monad m => PileM m ()
pileup' () -> PileF m b
forall (t :: (* -> *) -> * -> *) (m :: * -> *) p p a a a a.
(MonadTrans t, Monad m) =>
()
-> p
-> p
-> ([a], [a])
-> (Heap, Heap)
-> Stream (Of a) m a
-> t m a
finish (Word32 -> Refseq
Refseq 0) 0 ([],[]) (Heap
Empty,Heap
Empty)
  where
    finish :: ()
-> p
-> p
-> ([a], [a])
-> (Heap, Heap)
-> Stream (Of a) m a
-> t m a
finish () _ _ ([],[]) (Empty,Empty) inp :: Stream (Of a) m a
inp = m a -> t m a
forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (Stream (Of a) m a -> m a
forall (m :: * -> *) a r. Monad m => Stream (Of a) m r -> m r
Q.effects Stream (Of a) m a
inp)
    finish () _ _ _ _ _ = String -> t m a
forall a. HasCallStack => String -> a
error "logic error: leftovers after pileup"


-- | The pileup logic keeps a current coordinate (just two integers) and
-- two running queues: one of /active/ 'PrimBase's that contribute to
-- current genotype calling and on of /waiting/ 'PrimBase's that will
-- contribute at a later point.
--
-- This is the CPS version of multiple state and environment monads.  It
-- is somewhat faster than direct style and gives more control over when
-- evaluation happens.

newtype PileM m a = PileM { PileM m a -> forall r. (a -> PileF m r) -> PileF m r
runPileM :: forall r . (a -> PileF m r) -> PileF m r }

-- | The things we drag along in 'PileM'.  Notes:
-- * The /active/ queue is a simple stack.  We add at the front when we
--   encounter reads, which reverses them.  When traversing it, we traverse
--   reads backwards, but since we accumulate the 'BasePile', it gets reversed
--   back.  The new /active/ queue, however, is no longer reversed (as it should
--   be).  So after the traversal, we reverse it again.  (Yes, it is harder to
--   understand than using a proper deque type, but it is cheaper.
--   There may not be much point in the reversing, though.)

type PileF m r = Refseq -> Int ->                               -- current position
                 ( [PrimBase], [PrimBase] ) ->                  -- active queues
                 ( Heap, Heap ) ->                              -- waiting queues
                 Stream (Of PosPrimChunks) m r ->               -- input stream
                 Stream (Of Pile) m r                           -- output stream

instance Functor (PileM m) where
    {-# INLINE fmap #-}
    fmap :: (a -> b) -> PileM m a -> PileM m b
fmap f :: a -> b
f (PileM m :: forall r. (a -> PileF m r) -> PileF m r
m) = (forall r. (b -> PileF m r) -> PileF m r) -> PileM m b
forall (m :: * -> *) a.
(forall r. (a -> PileF m r) -> PileF m r) -> PileM m a
PileM ((forall r. (b -> PileF m r) -> PileF m r) -> PileM m b)
-> (forall r. (b -> PileF m r) -> PileF m r) -> PileM m b
forall a b. (a -> b) -> a -> b
$ \k :: b -> PileF m r
k -> (a -> PileF m r) -> PileF m r
forall r. (a -> PileF m r) -> PileF m r
m (b -> PileF m r
k (b -> PileF m r) -> (a -> b) -> a -> PileF m r
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. a -> b
f)

instance Applicative (PileM m) where
    {-# INLINE pure #-}
    pure :: a -> PileM m a
pure a :: a
a = (forall r. (a -> PileF m r) -> PileF m r) -> PileM m a
forall (m :: * -> *) a.
(forall r. (a -> PileF m r) -> PileF m r) -> PileM m a
PileM ((forall r. (a -> PileF m r) -> PileF m r) -> PileM m a)
-> (forall r. (a -> PileF m r) -> PileF m r) -> PileM m a
forall a b. (a -> b) -> a -> b
$ \k :: a -> PileF m r
k -> a -> PileF m r
k a
a
    {-# INLINE (<*>) #-}
    u :: PileM m (a -> b)
u <*> :: PileM m (a -> b) -> PileM m a -> PileM m b
<*> v :: PileM m a
v = (forall r. (b -> PileF m r) -> PileF m r) -> PileM m b
forall (m :: * -> *) a.
(forall r. (a -> PileF m r) -> PileF m r) -> PileM m a
PileM ((forall r. (b -> PileF m r) -> PileF m r) -> PileM m b)
-> (forall r. (b -> PileF m r) -> PileF m r) -> PileM m b
forall a b. (a -> b) -> a -> b
$ \k :: b -> PileF m r
k -> PileM m (a -> b) -> ((a -> b) -> PileF m r) -> PileF m r
forall (m :: * -> *) a.
PileM m a -> forall r. (a -> PileF m r) -> PileF m r
runPileM PileM m (a -> b)
u (\a :: a -> b
a -> PileM m a -> (a -> PileF m r) -> PileF m r
forall (m :: * -> *) a.
PileM m a -> forall r. (a -> PileF m r) -> PileF m r
runPileM PileM m a
v (b -> PileF m r
k (b -> PileF m r) -> (a -> b) -> a -> PileF m r
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. a -> b
a))

instance Monad (PileM m) where
    {-# INLINE return #-}
    return :: a -> PileM m a
return a :: a
a = (forall r. (a -> PileF m r) -> PileF m r) -> PileM m a
forall (m :: * -> *) a.
(forall r. (a -> PileF m r) -> PileF m r) -> PileM m a
PileM ((forall r. (a -> PileF m r) -> PileF m r) -> PileM m a)
-> (forall r. (a -> PileF m r) -> PileF m r) -> PileM m a
forall a b. (a -> b) -> a -> b
$ \k :: a -> PileF m r
k -> a -> PileF m r
k a
a
    {-# INLINE (>>=) #-}
    m :: PileM m a
m >>= :: PileM m a -> (a -> PileM m b) -> PileM m b
>>=  k :: a -> PileM m b
k = (forall r. (b -> PileF m r) -> PileF m r) -> PileM m b
forall (m :: * -> *) a.
(forall r. (a -> PileF m r) -> PileF m r) -> PileM m a
PileM ((forall r. (b -> PileF m r) -> PileF m r) -> PileM m b)
-> (forall r. (b -> PileF m r) -> PileF m r) -> PileM m b
forall a b. (a -> b) -> a -> b
$ \k' :: b -> PileF m r
k' -> PileM m a -> (a -> PileF m r) -> PileF m r
forall (m :: * -> *) a.
PileM m a -> forall r. (a -> PileF m r) -> PileF m r
runPileM PileM m a
m (\a :: a
a -> PileM m b -> (b -> PileF m r) -> PileF m r
forall (m :: * -> *) a.
PileM m a -> forall r. (a -> PileF m r) -> PileF m r
runPileM (a -> PileM m b
k a
a) b -> PileF m r
k')

{-# INLINE upd_pos #-}
upd_pos :: (Int -> Int) -> PileM m ()
upd_pos :: (Int -> Int) -> PileM m ()
upd_pos f :: Int -> Int
f = (forall r. (() -> PileF m r) -> PileF m r) -> PileM m ()
forall (m :: * -> *) a.
(forall r. (a -> PileF m r) -> PileF m r) -> PileM m a
PileM ((forall r. (() -> PileF m r) -> PileF m r) -> PileM m ())
-> (forall r. (() -> PileF m r) -> PileF m r) -> PileM m ()
forall a b. (a -> b) -> a -> b
$ \k :: () -> PileF m r
k r :: Refseq
r p :: Int
p -> () -> PileF m r
k () Refseq
r (Int
 -> ([PrimBase], [PrimBase])
 -> (Heap, Heap)
 -> Stream (Of PosPrimChunks) m r
 -> Stream (Of Pile) m r)
-> Int
-> ([PrimBase], [PrimBase])
-> (Heap, Heap)
-> Stream (Of PosPrimChunks) m r
-> Stream (Of Pile) m r
forall a b. (a -> b) -> a -> b
$! Int -> Int
f Int
p

{-# INLINE yieldPile #-}
yieldPile :: Monad m => CallStats -> BasePile -> BasePile -> CallStats -> IndelPile -> IndelPile -> PileM m ()
yieldPile :: CallStats
-> [DamagedBase]
-> [DamagedBase]
-> CallStats
-> IndelPile
-> IndelPile
-> PileM m ()
yieldPile x1 :: CallStats
x1 x2a :: [DamagedBase]
x2a x2b :: [DamagedBase]
x2b x3 :: CallStats
x3 x4a :: IndelPile
x4a x4b :: IndelPile
x4b = (forall r. (() -> PileF m r) -> PileF m r) -> PileM m ()
forall (m :: * -> *) a.
(forall r. (a -> PileF m r) -> PileF m r) -> PileM m a
PileM ((forall r. (() -> PileF m r) -> PileF m r) -> PileM m ())
-> (forall r. (() -> PileF m r) -> PileF m r) -> PileM m ()
forall a b. (a -> b) -> a -> b
$ \ !() -> PileF m r
kont !Refseq
r !Int
p !([PrimBase], [PrimBase])
a !(Heap, Heap)
w !Stream (Of PosPrimChunks) m r
inp ->
    let pile :: Pile
pile = Refseq
-> Int
-> CallStats
-> ([DamagedBase], [DamagedBase])
-> CallStats
-> (IndelPile, IndelPile)
-> Pile
forall a b.
Refseq -> Int -> CallStats -> a -> CallStats -> b -> Pile' a b
Pile Refseq
r Int
p CallStats
x1 ([DamagedBase]
x2a,[DamagedBase]
x2b) CallStats
x3 (IndelPile
x4a,IndelPile
x4b)
    in Pile -> Stream (Of Pile) m r -> Stream (Of Pile) m r
forall (m :: * -> *) a r.
Monad m =>
a -> Stream (Of a) m r -> Stream (Of a) m r
Q.cons Pile
pile (Stream (Of Pile) m r -> Stream (Of Pile) m r)
-> Stream (Of Pile) m r -> Stream (Of Pile) m r
forall a b. (a -> b) -> a -> b
$ () -> PileF m r
kont () Refseq
r Int
p ([PrimBase], [PrimBase])
a (Heap, Heap)
w Stream (Of PosPrimChunks) m r
inp

-- | The actual pileup algorithm.  If /active/ contains something,
-- continue here.  Else find the coordinate to continue from, which is
-- the minimum of the next /waiting/ coordinate and the next coordinate
-- in input; if found, continue there, else we're all done.
pileup' :: Monad m => PileM m ()
pileup' :: PileM m ()
pileup' = (forall r. (() -> PileF m r) -> PileF m r) -> PileM m ()
forall (m :: * -> *) a.
(forall r. (a -> PileF m r) -> PileF m r) -> PileM m a
PileM ((forall r. (() -> PileF m r) -> PileF m r) -> PileM m ())
-> (forall r. (() -> PileF m r) -> PileF m r) -> PileM m ()
forall a b. (a -> b) -> a -> b
$ \ !() -> PileF m r
k !Refseq
refseq !Int
pos !([PrimBase], [PrimBase])
active !(Heap, Heap)
waiting inp0 :: Stream (Of PosPrimChunks) m r
inp0 -> do

    Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r))
inp <- m (Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r)))
-> Stream
     (Of Pile)
     m
     (Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r)))
forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (m (Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r)))
 -> Stream
      (Of Pile)
      m
      (Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r))))
-> m (Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r)))
-> Stream
     (Of Pile)
     m
     (Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r)))
forall a b. (a -> b) -> a -> b
$ Stream (Of PosPrimChunks) m r
-> m (Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r)))
forall (m :: * -> *) (f :: * -> *) r.
Monad m =>
Stream f m r -> m (Either r (f (Stream f m r)))
inspect Stream (Of PosPrimChunks) m r
inp0
    let inp1 :: Stream (Of PosPrimChunks) m r
inp1        = (r -> Stream (Of PosPrimChunks) m r)
-> (Of PosPrimChunks (Stream (Of PosPrimChunks) m r)
    -> Stream (Of PosPrimChunks) m r)
-> Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r))
-> Stream (Of PosPrimChunks) m r
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either r -> Stream (Of PosPrimChunks) m r
forall (f :: * -> *) a. Applicative f => a -> f a
pure Of PosPrimChunks (Stream (Of PosPrimChunks) m r)
-> Stream (Of PosPrimChunks) m r
forall (m :: * -> *) (f :: * -> *) r.
(Monad m, Functor f) =>
f (Stream f m r) -> Stream f m r
wrap Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r))
inp
        cont2 :: Refseq -> Int -> Stream (Of Pile) m r
cont2 rs :: Refseq
rs po :: Int
po = PileM m () -> (() -> PileF m r) -> PileF m r
forall (m :: * -> *) a.
PileM m a -> forall r. (a -> PileF m r) -> PileF m r
runPileM PileM m ()
forall (m :: * -> *). Monad m => PileM m ()
pileup'' () -> PileF m r
k     Refseq
rs Int
po  ([PrimBase], [PrimBase])
active (Heap, Heap)
waiting Stream (Of PosPrimChunks) m r
inp1
        leave :: Stream (Of Pile) m r
leave       =                () -> PileF m r
k () Refseq
refseq Int
pos ([PrimBase], [PrimBase])
active (Heap, Heap)
waiting Stream (Of PosPrimChunks) m r
inp1

    case (([PrimBase], [PrimBase])
active, (Heap, Heap) -> Maybe Int
getMinKeysH (Heap, Heap)
waiting, Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r))
inp) of
        ( (_:_,_),       _,                   _  ) -> Refseq -> Int -> Stream (Of Pile) m r
cont2 Refseq
refseq Int
pos
        ( (_,_:_),       _,                   _  ) -> Refseq -> Int -> Stream (Of Pile) m r
cont2 Refseq
refseq Int
pos
        (    _,    Just nw :: Int
nw, Left              _  ) -> Refseq -> Int -> Stream (Of Pile) m r
cont2 Refseq
refseq Int
nw
        (    _,    Nothing, Left              _  ) -> Stream (Of Pile) m r
leave
        (    _,    Nothing, Right ((r :: Refseq
r,p :: Int
p,_,_):>_) ) -> Refseq -> Int -> Stream (Of Pile) m r
cont2 Refseq
r Int
p
        (    _,    Just nw :: Int
nw, Right ((r :: Refseq
r,p :: Int
p,_,_):>_) )
                            | (Refseq
refseq,Int
nw) (Refseq, Int) -> (Refseq, Int) -> Bool
forall a. Ord a => a -> a -> Bool
<= (Refseq
r,Int
p) -> Refseq -> Int -> Stream (Of Pile) m r
cont2 Refseq
refseq Int
nw
                            | Bool
otherwise            -> Refseq -> Int -> Stream (Of Pile) m r
cont2 Refseq
r Int
p
  where
    getMinKeysH :: (Heap, Heap) -> Maybe Int
    getMinKeysH :: (Heap, Heap) -> Maybe Int
getMinKeysH (a :: Heap
a,b :: Heap
b) = case (Heap -> Maybe Int
getMinKeyH Heap
a, Heap -> Maybe Int
getMinKeyH Heap
b) of
        ( Nothing, Nothing ) -> Maybe Int
forall a. Maybe a
Nothing
        ( Just  x :: Int
x, Nothing ) -> Int -> Maybe Int
forall a. a -> Maybe a
Just  Int
x
        ( Nothing, Just  y :: Int
y ) -> Int -> Maybe Int
forall a. a -> Maybe a
Just  Int
y
        ( Just  x :: Int
x, Just  y :: Int
y ) -> Int -> Maybe Int
forall a. a -> Maybe a
Just (Int -> Int -> Int
forall a. Ord a => a -> a -> a
min Int
x Int
y)


pileup'' :: Monad m => PileM m ()
pileup'' :: PileM m ()
pileup'' = do
    -- Input is still 'BamRaw', since these can be relied on to be
    -- sorted.  First see if there is any input at the current location,
    -- if so, dissect it and add it to the appropriate queue.
    PileM m ()
forall (m :: * -> *). Monad m => PileM m ()
p'feed_input
    PileM m ()
forall (m :: * -> *). PileM m ()
p'check_waiting
    ((fin_bsL :: CallStats
fin_bsL, fin_bpL :: [DamagedBase]
fin_bpL), (fin_bsR :: CallStats
fin_bsR, fin_bpR :: [DamagedBase]
fin_bpR), (fin_isL :: CallStats
fin_isL, fin_ipL :: IndelPile
fin_ipL), (fin_isR :: CallStats
fin_isR, fin_ipR :: IndelPile
fin_ipR)) <- PileM
  m
  ((CallStats, [DamagedBase]), (CallStats, [DamagedBase]),
   (CallStats, IndelPile), (CallStats, IndelPile))
forall (m :: * -> *).
PileM
  m
  ((CallStats, [DamagedBase]), (CallStats, [DamagedBase]),
   (CallStats, IndelPile), (CallStats, IndelPile))
p'scan_active

    -- Output, but don't bother emitting empty piles.  Note that a plain
    -- basecall still yields an entry in the 'IndelPile'.  This is necessary,
    -- because actual indel calling will want to know how many reads /did not/
    -- show the variant.  However, if no reads show any variant, and here is the
    -- first place where we notice that, the pile is useless.
    let uninteresting :: (a, (t a, t a)) -> Bool
uninteresting (_,(d :: t a
d,i :: t a
i)) = t a -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null t a
d Bool -> Bool -> Bool
&& t a -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null t a
i
    Bool -> PileM m () -> PileM m ()
forall (f :: * -> *). Applicative f => Bool -> f () -> f ()
unless ([DamagedBase] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null [DamagedBase]
fin_bpL Bool -> Bool -> Bool
&& [DamagedBase] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null [DamagedBase]
fin_bpR Bool -> Bool -> Bool
&& ((Qual, ([Nucleotides], [DamagedBase])) -> Bool)
-> IndelPile -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (Qual, ([Nucleotides], [DamagedBase])) -> Bool
forall (t :: * -> *) (t :: * -> *) a a a.
(Foldable t, Foldable t) =>
(a, (t a, t a)) -> Bool
uninteresting IndelPile
fin_ipL Bool -> Bool -> Bool
&& ((Qual, ([Nucleotides], [DamagedBase])) -> Bool)
-> IndelPile -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (Qual, ([Nucleotides], [DamagedBase])) -> Bool
forall (t :: * -> *) (t :: * -> *) a a a.
(Foldable t, Foldable t) =>
(a, (t a, t a)) -> Bool
uninteresting IndelPile
fin_ipR) (PileM m () -> PileM m ()) -> PileM m () -> PileM m ()
forall a b. (a -> b) -> a -> b
$
        CallStats
-> [DamagedBase]
-> [DamagedBase]
-> CallStats
-> IndelPile
-> IndelPile
-> PileM m ()
forall (m :: * -> *).
Monad m =>
CallStats
-> [DamagedBase]
-> [DamagedBase]
-> CallStats
-> IndelPile
-> IndelPile
-> PileM m ()
yieldPile (CallStats
fin_bsL CallStats -> CallStats -> CallStats
forall a. Semigroup a => a -> a -> a
<> CallStats
fin_bsR) [DamagedBase]
fin_bpL [DamagedBase]
fin_bpR
                  (CallStats
fin_isL CallStats -> CallStats -> CallStats
forall a. Semigroup a => a -> a -> a
<> CallStats
fin_isR) IndelPile
fin_ipL IndelPile
fin_ipR

    -- Bump coordinate and loop.  (Note that the bump to the next
    -- reference /sequence/ is done implicitly, because we will run out of
    -- reads and restart in 'pileup''.)
    (Int -> Int) -> PileM m ()
forall (m :: * -> *). (Int -> Int) -> PileM m ()
upd_pos Int -> Int
forall a. Enum a => a -> a
succ
    PileM m ()
forall (m :: * -> *). Monad m => PileM m ()
pileup'

-- | Feeds input as long as it starts at the current position
p'feed_input :: Monad m => PileM m ()
p'feed_input :: PileM m ()
p'feed_input = (forall r. (() -> PileF m r) -> PileF m r) -> PileM m ()
forall (m :: * -> *) a.
(forall r. (a -> PileF m r) -> PileF m r) -> PileM m a
PileM ((forall r. (() -> PileF m r) -> PileF m r) -> PileM m ())
-> (forall r. (() -> PileF m r) -> PileF m r) -> PileM m ()
forall a b. (a -> b) -> a -> b
$ \kont :: () -> PileF m r
kont rs :: Refseq
rs po :: Int
po ac :: ([PrimBase], [PrimBase])
ac@(af :: [PrimBase]
af,ar :: [PrimBase]
ar) wt :: (Heap, Heap)
wt@(wf :: Heap
wf,wr :: Heap
wr) ->
    m (Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r)))
-> Stream
     (Of Pile)
     m
     (Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r)))
forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (m (Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r)))
 -> Stream
      (Of Pile)
      m
      (Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r))))
-> (Stream (Of PosPrimChunks) m r
    -> m (Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r))))
-> Stream (Of PosPrimChunks) m r
-> Stream
     (Of Pile)
     m
     (Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r)))
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Stream (Of PosPrimChunks) m r
-> m (Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r)))
forall (m :: * -> *) (f :: * -> *) r.
Monad m =>
Stream f m r -> m (Either r (f (Stream f m r)))
inspect (Stream (Of PosPrimChunks) m r
 -> Stream
      (Of Pile)
      m
      (Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r))))
-> (Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r))
    -> Stream (Of Pile) m r)
-> Stream (Of PosPrimChunks) m r
-> Stream (Of Pile) m r
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> \case
        Right ((rs' :: Refseq
rs', po' :: Int
po', str :: Bool
str, prim :: PrimChunks
prim) :> bs :: Stream (Of PosPrimChunks) m r
bs)
            | Refseq
rs Refseq -> Refseq -> Bool
forall a. Eq a => a -> a -> Bool
== Refseq
rs' Bool -> Bool -> Bool
&& Int
po Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
po' ->
                case PrimChunks
prim of
                    EndOfRead     -> PileM m () -> (() -> PileF m r) -> PileF m r
forall (m :: * -> *) a.
PileM m a -> forall r. (a -> PileF m r) -> PileF m r
runPileM PileM m ()
forall (m :: * -> *). Monad m => PileM m ()
p'feed_input () -> PileF m r
kont Refseq
rs Int
po ([PrimBase], [PrimBase])
ac (Heap, Heap)
wt                                       Stream (Of PosPrimChunks) m r
bs
                    Indel _ _ !PrimBase
pb -> PileM m () -> (() -> PileF m r) -> PileF m r
forall (m :: * -> *) a.
PileM m a -> forall r. (a -> PileF m r) -> PileF m r
runPileM PileM m ()
forall (m :: * -> *). Monad m => PileM m ()
p'feed_input () -> PileF m r
kont Refseq
rs Int
po (if Bool
str then ([PrimBase]
af,PrimBase
pbPrimBase -> [PrimBase] -> [PrimBase]
forall a. a -> [a] -> [a]
:[PrimBase]
ar) else (PrimBase
pbPrimBase -> [PrimBase] -> [PrimBase]
forall a. a -> [a] -> [a]
:[PrimBase]
af,[PrimBase]
ar)) (Heap, Heap)
wt Stream (Of PosPrimChunks) m r
bs
                    Seek   !Int
p !PrimBase
pb -> PileM m () -> (() -> PileF m r) -> PileF m r
forall (m :: * -> *) a.
PileM m a -> forall r. (a -> PileF m r) -> PileF m r
runPileM PileM m ()
forall (m :: * -> *). Monad m => PileM m ()
p'feed_input () -> PileF m r
kont Refseq
rs Int
po ([PrimBase], [PrimBase])
ac (if Bool
str then (Heap
wf,Heap
wr') else (Heap
wf',Heap
wr))     Stream (Of PosPrimChunks) m r
bs
                      where wf' :: Heap
wf' = Int -> PrimBase -> Heap -> Heap -> Heap
Node Int
p PrimBase
pb Heap
Empty Heap
Empty Heap -> Heap -> Heap
`unionH` Heap
wf
                            wr' :: Heap
wr' = Int -> PrimBase -> Heap -> Heap -> Heap
Node Int
p PrimBase
pb Heap
Empty Heap
Empty Heap -> Heap -> Heap
`unionH` Heap
wr
        inp :: Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r))
inp -> () -> PileF m r
kont () Refseq
rs Int
po ([PrimBase], [PrimBase])
ac (Heap, Heap)
wt (Stream (Of PosPrimChunks) m r -> Stream (Of Pile) m r)
-> Stream (Of PosPrimChunks) m r -> Stream (Of Pile) m r
forall a b. (a -> b) -> a -> b
$ (r -> Stream (Of PosPrimChunks) m r)
-> (Of PosPrimChunks (Stream (Of PosPrimChunks) m r)
    -> Stream (Of PosPrimChunks) m r)
-> Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r))
-> Stream (Of PosPrimChunks) m r
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either r -> Stream (Of PosPrimChunks) m r
forall (f :: * -> *) a. Applicative f => a -> f a
pure Of PosPrimChunks (Stream (Of PosPrimChunks) m r)
-> Stream (Of PosPrimChunks) m r
forall (m :: * -> *) (f :: * -> *) r.
(Monad m, Functor f) =>
f (Stream f m r) -> Stream f m r
wrap Either r (Of PosPrimChunks (Stream (Of PosPrimChunks) m r))
inp

-- | Checks /waiting/ queue.  If there is anything waiting for the
-- current position, moves it to /active/ queue.
p'check_waiting :: PileM m ()
p'check_waiting :: PileM m ()
p'check_waiting = (forall r. (() -> PileF m r) -> PileF m r) -> PileM m ()
forall (m :: * -> *) a.
(forall r. (a -> PileF m r) -> PileF m r) -> PileM m a
PileM ((forall r. (() -> PileF m r) -> PileF m r) -> PileM m ())
-> (forall r. (() -> PileF m r) -> PileF m r) -> PileM m ()
forall a b. (a -> b) -> a -> b
$ \kont :: () -> PileF m r
kont rs :: Refseq
rs po :: Int
po (af0 :: [PrimBase]
af0,ar0 :: [PrimBase]
ar0) (wf0 :: Heap
wf0,wr0 :: Heap
wr0) ->
        let go1 :: [PrimBase]
-> Heap -> Stream (Of PosPrimChunks) m r -> Stream (Of Pile) m r
go1 af :: [PrimBase]
af wf :: Heap
wf = case Heap -> Maybe (Int, PrimBase, Heap)
viewMinH Heap
wf of
                Just (!Int
mk, !PrimBase
pb, !Heap
wf') | Int
mk Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
po -> [PrimBase]
-> Heap -> Stream (Of PosPrimChunks) m r -> Stream (Of Pile) m r
go1 (PrimBase
pbPrimBase -> [PrimBase] -> [PrimBase]
forall a. a -> [a] -> [a]
:[PrimBase]
af) Heap
wf'
                _                                -> [PrimBase]
-> Heap
-> [PrimBase]
-> Heap
-> Stream (Of PosPrimChunks) m r
-> Stream (Of Pile) m r
go2 [PrimBase]
af Heap
wf [PrimBase]
ar0 Heap
wr0

            go2 :: [PrimBase]
-> Heap
-> [PrimBase]
-> Heap
-> Stream (Of PosPrimChunks) m r
-> Stream (Of Pile) m r
go2 af :: [PrimBase]
af wf :: Heap
wf ar :: [PrimBase]
ar wr :: Heap
wr = case Heap -> Maybe (Int, PrimBase, Heap)
viewMinH Heap
wr of
                Just (!Int
mk, !PrimBase
pb, !Heap
wr') | Int
mk Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
po -> [PrimBase]
-> Heap
-> [PrimBase]
-> Heap
-> Stream (Of PosPrimChunks) m r
-> Stream (Of Pile) m r
go2 [PrimBase]
af Heap
wf (PrimBase
pbPrimBase -> [PrimBase] -> [PrimBase]
forall a. a -> [a] -> [a]
:[PrimBase]
ar) Heap
wr'
                _                                -> () -> PileF m r
kont () Refseq
rs Int
po ([PrimBase]
af,[PrimBase]
ar) (Heap
wf,Heap
wr)

        in [PrimBase]
-> Heap -> Stream (Of PosPrimChunks) m r -> Stream (Of Pile) m r
go1 [PrimBase]
af0 Heap
wf0


-- | Separately scans the two /active/ queues and makes one 'BasePile'
-- from each.  Also sees what's next in the 'PrimChunks':  'Indel's
-- contribute to two separate 'IndelPile's, 'Seek's are pushed back to
-- the /waiting/ queue, 'EndOfRead's are removed, and everything else is
-- added to two fresh /active/ queues.
p'scan_active :: PileM m (( CallStats, BasePile ),  ( CallStats, BasePile ),
                          ( CallStats, IndelPile ), ( CallStats, IndelPile ))
p'scan_active :: PileM
  m
  ((CallStats, [DamagedBase]), (CallStats, [DamagedBase]),
   (CallStats, IndelPile), (CallStats, IndelPile))
p'scan_active = do
    (bpf :: (CallStats, [DamagedBase])
bpf,ipf :: (CallStats, IndelPile)
ipf) <- (forall r.
 (((CallStats, [DamagedBase]), (CallStats, IndelPile)) -> PileF m r)
 -> PileF m r)
-> PileM m ((CallStats, [DamagedBase]), (CallStats, IndelPile))
forall (m :: * -> *) a.
(forall r. (a -> PileF m r) -> PileF m r) -> PileM m a
PileM ((forall r.
  (((CallStats, [DamagedBase]), (CallStats, IndelPile)) -> PileF m r)
  -> PileF m r)
 -> PileM m ((CallStats, [DamagedBase]), (CallStats, IndelPile)))
-> (forall r.
    (((CallStats, [DamagedBase]), (CallStats, IndelPile)) -> PileF m r)
    -> PileF m r)
-> PileM m ((CallStats, [DamagedBase]), (CallStats, IndelPile))
forall a b. (a -> b) -> a -> b
$ \kont :: ((CallStats, [DamagedBase]), (CallStats, IndelPile)) -> PileF m r
kont rs :: Refseq
rs pos :: Int
pos (af :: [PrimBase]
af,ar :: [PrimBase]
ar) (wf :: Heap
wf,wr :: Heap
wr) -> (((CallStats, [DamagedBase]), (CallStats, IndelPile))
 -> [PrimBase]
 -> Heap
 -> Stream (Of PosPrimChunks) m r
 -> Stream (Of Pile) m r)
-> [PrimBase]
-> Heap
-> (CallStats, [DamagedBase])
-> (CallStats, IndelPile)
-> [PrimBase]
-> Stream (Of PosPrimChunks) m r
-> Stream (Of Pile) m r
forall p.
(((CallStats, [DamagedBase]), (CallStats, IndelPile))
 -> [PrimBase] -> Heap -> p)
-> [PrimBase]
-> Heap
-> (CallStats, [DamagedBase])
-> (CallStats, IndelPile)
-> [PrimBase]
-> p
go (\r :: ((CallStats, [DamagedBase]), (CallStats, IndelPile))
r af' :: [PrimBase]
af' wf' :: Heap
wf' -> ((CallStats, [DamagedBase]), (CallStats, IndelPile)) -> PileF m r
kont ((CallStats, [DamagedBase]), (CallStats, IndelPile))
r Refseq
rs Int
pos ([PrimBase]
af',[PrimBase]
ar) (Heap
wf',Heap
wr)) [] Heap
wf (CallStats, [DamagedBase])
forall a. Monoid a => a
mempty (CallStats, IndelPile)
forall a. Monoid a => a
mempty [PrimBase]
af
    (bpr :: (CallStats, [DamagedBase])
bpr,ipr :: (CallStats, IndelPile)
ipr) <- (forall r.
 (((CallStats, [DamagedBase]), (CallStats, IndelPile)) -> PileF m r)
 -> PileF m r)
-> PileM m ((CallStats, [DamagedBase]), (CallStats, IndelPile))
forall (m :: * -> *) a.
(forall r. (a -> PileF m r) -> PileF m r) -> PileM m a
PileM ((forall r.
  (((CallStats, [DamagedBase]), (CallStats, IndelPile)) -> PileF m r)
  -> PileF m r)
 -> PileM m ((CallStats, [DamagedBase]), (CallStats, IndelPile)))
-> (forall r.
    (((CallStats, [DamagedBase]), (CallStats, IndelPile)) -> PileF m r)
    -> PileF m r)
-> PileM m ((CallStats, [DamagedBase]), (CallStats, IndelPile))
forall a b. (a -> b) -> a -> b
$ \kont :: ((CallStats, [DamagedBase]), (CallStats, IndelPile)) -> PileF m r
kont rs :: Refseq
rs pos :: Int
pos (af :: [PrimBase]
af,ar :: [PrimBase]
ar) (wf :: Heap
wf,wr :: Heap
wr) -> (((CallStats, [DamagedBase]), (CallStats, IndelPile))
 -> [PrimBase]
 -> Heap
 -> Stream (Of PosPrimChunks) m r
 -> Stream (Of Pile) m r)
-> [PrimBase]
-> Heap
-> (CallStats, [DamagedBase])
-> (CallStats, IndelPile)
-> [PrimBase]
-> Stream (Of PosPrimChunks) m r
-> Stream (Of Pile) m r
forall p.
(((CallStats, [DamagedBase]), (CallStats, IndelPile))
 -> [PrimBase] -> Heap -> p)
-> [PrimBase]
-> Heap
-> (CallStats, [DamagedBase])
-> (CallStats, IndelPile)
-> [PrimBase]
-> p
go (\r :: ((CallStats, [DamagedBase]), (CallStats, IndelPile))
r ar' :: [PrimBase]
ar' wr' :: Heap
wr' -> ((CallStats, [DamagedBase]), (CallStats, IndelPile)) -> PileF m r
kont ((CallStats, [DamagedBase]), (CallStats, IndelPile))
r Refseq
rs Int
pos ([PrimBase]
af,[PrimBase]
ar') (Heap
wf,Heap
wr')) [] Heap
wr (CallStats, [DamagedBase])
forall a. Monoid a => a
mempty (CallStats, IndelPile)
forall a. Monoid a => a
mempty [PrimBase]
ar
    ((CallStats, [DamagedBase]), (CallStats, [DamagedBase]),
 (CallStats, IndelPile), (CallStats, IndelPile))
-> PileM
     m
     ((CallStats, [DamagedBase]), (CallStats, [DamagedBase]),
      (CallStats, IndelPile), (CallStats, IndelPile))
forall (m :: * -> *) a. Monad m => a -> m a
return ((CallStats, [DamagedBase])
bpf,(CallStats, [DamagedBase])
bpr,(CallStats, IndelPile)
ipf,(CallStats, IndelPile)
ipr)
  where
    go :: (((CallStats, [DamagedBase]), (CallStats, IndelPile))
 -> [PrimBase] -> Heap -> p)
-> [PrimBase]
-> Heap
-> (CallStats, [DamagedBase])
-> (CallStats, IndelPile)
-> [PrimBase]
-> p
go k :: ((CallStats, [DamagedBase]), (CallStats, IndelPile))
-> [PrimBase] -> Heap -> p
k ![PrimBase]
ac !Heap
wt !(CallStats, [DamagedBase])
bpile !(CallStats, IndelPile)
ipile [                           ] = ((CallStats, [DamagedBase]), (CallStats, IndelPile))
-> [PrimBase] -> Heap -> p
k ((CallStats, [DamagedBase])
bpile, (CallStats, IndelPile)
ipile) ([PrimBase] -> [PrimBase]
forall a. [a] -> [a]
reverse [PrimBase]
ac) Heap
wt
    go k :: ((CallStats, [DamagedBase]), (CallStats, IndelPile))
-> [PrimBase] -> Heap -> p
k ![PrimBase]
ac !Heap
wt !(CallStats, [DamagedBase])
bpile !(CallStats, IndelPile)
ipile (Base nwt :: Int
nwt qs :: DamagedBase
qs mq :: Qual
mq pchunks :: PrimChunks
pchunks : bs :: [PrimBase]
bs) =
        case PrimChunks
pchunks of
            _ | Int
nwt Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> 0     -> PrimBase
b' PrimBase -> p -> p
forall a b. a -> b -> b
`seq` (((CallStats, [DamagedBase]), (CallStats, IndelPile))
 -> [PrimBase] -> Heap -> p)
-> [PrimBase]
-> Heap
-> (CallStats, [DamagedBase])
-> (CallStats, IndelPile)
-> [PrimBase]
-> p
go ((CallStats, [DamagedBase]), (CallStats, IndelPile))
-> [PrimBase] -> Heap -> p
k  (PrimBase
b'PrimBase -> [PrimBase] -> [PrimBase]
forall a. a -> [a] -> [a]
:[PrimBase]
ac)   Heap
wt     (CallStats, [DamagedBase])
bpile     (CallStats, IndelPile)
ipile  [PrimBase]
bs
            Seek p' :: Int
p' pb' :: PrimBase
pb'     -> (((CallStats, [DamagedBase]), (CallStats, IndelPile))
 -> [PrimBase] -> Heap -> p)
-> [PrimBase]
-> Heap
-> (CallStats, [DamagedBase])
-> (CallStats, IndelPile)
-> [PrimBase]
-> p
go ((CallStats, [DamagedBase]), (CallStats, IndelPile))
-> [PrimBase] -> Heap -> p
k      [PrimBase]
ac (Int -> PrimBase -> Heap -> Heap
ins Int
p' PrimBase
pb' Heap
wt) ((CallStats, [DamagedBase]) -> (CallStats, [DamagedBase])
z (CallStats, [DamagedBase])
bpile)    (CallStats, IndelPile)
ipile  [PrimBase]
bs
            Indel nd :: [Nucleotides]
nd ni :: [DamagedBase]
ni pb' :: PrimBase
pb' -> (((CallStats, [DamagedBase]), (CallStats, IndelPile))
 -> [PrimBase] -> Heap -> p)
-> [PrimBase]
-> Heap
-> (CallStats, [DamagedBase])
-> (CallStats, IndelPile)
-> [PrimBase]
-> p
go ((CallStats, [DamagedBase]), (CallStats, IndelPile))
-> [PrimBase] -> Heap -> p
k (PrimBase
pb'PrimBase -> [PrimBase] -> [PrimBase]
forall a. a -> [a] -> [a]
:[PrimBase]
ac)            Heap
wt  ((CallStats, [DamagedBase]) -> (CallStats, [DamagedBase])
z (CallStats, [DamagedBase])
bpile) ((CallStats, IndelPile) -> (CallStats, IndelPile)
y (CallStats, IndelPile)
ipile) [PrimBase]
bs where y :: (CallStats, IndelPile) -> (CallStats, IndelPile)
y = (Qual
 -> ([Nucleotides], [DamagedBase])
 -> (Qual, ([Nucleotides], [DamagedBase])))
-> Qual
-> ([Nucleotides], [DamagedBase])
-> (CallStats, IndelPile)
-> (CallStats, IndelPile)
forall t a.
(Qual -> t -> a)
-> Qual -> t -> (CallStats, [a]) -> (CallStats, [a])
put (,) Qual
mq ([Nucleotides]
nd,[DamagedBase]
ni)
            EndOfRead       -> (((CallStats, [DamagedBase]), (CallStats, IndelPile))
 -> [PrimBase] -> Heap -> p)
-> [PrimBase]
-> Heap
-> (CallStats, [DamagedBase])
-> (CallStats, IndelPile)
-> [PrimBase]
-> p
go ((CallStats, [DamagedBase]), (CallStats, IndelPile))
-> [PrimBase] -> Heap -> p
k      [PrimBase]
ac             Heap
wt  ((CallStats, [DamagedBase]) -> (CallStats, [DamagedBase])
z (CallStats, [DamagedBase])
bpile)    (CallStats, IndelPile)
ipile  [PrimBase]
bs
        where
            b' :: PrimBase
b' = Int -> DamagedBase -> Qual -> PrimChunks -> PrimBase
Base (Int
nwtInt -> Int -> Int
forall a. Num a => a -> a -> a
-1) DamagedBase
qs Qual
mq PrimChunks
pchunks
            z :: (CallStats, [DamagedBase]) -> (CallStats, [DamagedBase])
z  = (Qual -> DamagedBase -> DamagedBase)
-> Qual
-> DamagedBase
-> (CallStats, [DamagedBase])
-> (CallStats, [DamagedBase])
forall t a.
(Qual -> t -> a)
-> Qual -> t -> (CallStats, [a]) -> (CallStats, [a])
put (\q :: Qual
q x :: DamagedBase
x -> DamagedBase
x { db_qual :: Qual
db_qual = Qual -> Qual -> Qual
forall a. Ord a => a -> a -> a
min Qual
q (DamagedBase -> Qual
db_qual DamagedBase
x) }) Qual
mq DamagedBase
qs

    ins :: Int -> PrimBase -> Heap -> Heap
ins q :: Int
q v :: PrimBase
v w :: Heap
w = Int -> PrimBase -> Heap -> Heap -> Heap
Node Int
q PrimBase
v Heap
Empty Heap
Empty Heap -> Heap -> Heap
`unionH` Heap
w

    put :: (Qual -> t -> a)
-> Qual -> t -> (CallStats, [a]) -> (CallStats, [a])
put f :: Qual -> t -> a
f (Q !Word8
q) !t
x (!CallStats
st,![a]
vs) = ( CallStats
st { read_depth :: Int
read_depth       = CallStats -> Int
read_depth CallStats
st Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1
                                     , reads_mapq0 :: Int
reads_mapq0      = CallStats -> Int
reads_mapq0 CallStats
st Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (if Word8
q Word8 -> Word8 -> Bool
forall a. Eq a => a -> a -> Bool
== 0 then 1 else 0)
                                     , sum_mapq :: Int
sum_mapq         = CallStats -> Int
sum_mapq CallStats
st Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
q
                                     , sum_mapq_squared :: Int
sum_mapq_squared = CallStats -> Int
sum_mapq_squared CallStats
st Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
q Int -> Int -> Int
forall a. Num a => a -> a -> a
* Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
q }
                                , Qual -> t -> a
f (Word8 -> Qual
Q Word8
q) t
x a -> [a] -> [a]
forall a. a -> [a] -> [a]
: [a]
vs )


-- | We need a simple priority queue.  Here's a skew heap (specialized
-- to strict 'Int' priorities and 'PrimBase' values).
data Heap = Empty | Node {-# UNPACK #-} !Int PrimBase Heap Heap

unionH :: Heap -> Heap -> Heap
Empty                 unionH :: Heap -> Heap -> Heap
`unionH` t2 :: Heap
t2                    = Heap
t2
t1 :: Heap
t1                    `unionH` Empty                 = Heap
t1
t1 :: Heap
t1@(Node k1 :: Int
k1 x1 :: PrimBase
x1 l1 :: Heap
l1 r1 :: Heap
r1) `unionH` t2 :: Heap
t2@(Node k2 :: Int
k2 x2 :: PrimBase
x2 l2 :: Heap
l2 r2 :: Heap
r2)
   | Int
k1 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
k2                                        = Int -> PrimBase -> Heap -> Heap -> Heap
Node Int
k1 PrimBase
x1 (Heap
t2 Heap -> Heap -> Heap
`unionH` Heap
r1) Heap
l1
   | Bool
otherwise                                       = Int -> PrimBase -> Heap -> Heap -> Heap
Node Int
k2 PrimBase
x2 (Heap
t1 Heap -> Heap -> Heap
`unionH` Heap
r2) Heap
l2

getMinKeyH :: Heap -> Maybe Int
getMinKeyH :: Heap -> Maybe Int
getMinKeyH Empty          = Maybe Int
forall a. Maybe a
Nothing
getMinKeyH (Node x :: Int
x _ _ _) = Int -> Maybe Int
forall a. a -> Maybe a
Just Int
x

viewMinH :: Heap -> Maybe (Int, PrimBase, Heap)
viewMinH :: Heap -> Maybe (Int, PrimBase, Heap)
viewMinH Empty          = Maybe (Int, PrimBase, Heap)
forall a. Maybe a
Nothing
viewMinH (Node k :: Int
k v :: PrimBase
v l :: Heap
l r :: Heap
r) = (Int, PrimBase, Heap) -> Maybe (Int, PrimBase, Heap)
forall a. a -> Maybe a
Just (Int
k, PrimBase
v, Heap
l Heap -> Heap -> Heap
`unionH` Heap
r)