module Bio.Bam.Rmdup(
            rmdup, Collapse, cons_collapse, cheap_collapse,
            cons_collapse_keep, cheap_collapse_keep,
            check_sort, normalizeTo, wrapTo,
            ECig(..), toECig, setMD, toCigar,
            RmdupException(..), BrokenMD(..)
    ) where

import Bio.Bam.Header
import Bio.Bam.Rec
import Bio.Prelude hiding ( left, right )
import Bio.Streaming

import qualified Data.ByteString                as B
import qualified Data.ByteString.Char8          as T
import qualified Data.HashMap.Strict            as M
import qualified Data.Vector                    as VV ( fromList )
import qualified Data.Vector.Algorithms.Intro   as VV ( sortBy )
import qualified Data.Vector.Generic            as V
import qualified Data.Vector.Storable           as W
import qualified Data.Vector.Unboxed            as U
import qualified Streaming.Prelude              as Q

data Collapse m = Collapse {
        Collapse m -> [BamRec] -> m (Decision, [BamRec])
collapse  :: [BamRec] -> m (Decision,[BamRec]),  -- cluster to consensus and stuff or representative and stuff
        Collapse m -> [BamRec] -> [BamRec]
originals :: [BamRec] -> [BamRec] }              -- treatment of the redundant original reads

data Decision = Consensus      { Decision -> BamRec
fromDecision :: BamRec }
              | Representative { fromDecision :: BamRec }

cons_collapse :: MonadLog m => Qual -> Collapse m
cons_collapse :: Qual -> Collapse m
cons_collapse maxq :: Qual
maxq = ([BamRec] -> m (Decision, [BamRec]))
-> ([BamRec] -> [BamRec]) -> Collapse m
forall (m :: * -> *).
([BamRec] -> m (Decision, [BamRec]))
-> ([BamRec] -> [BamRec]) -> Collapse m
Collapse (Qual -> [BamRec] -> m (Decision, [BamRec])
forall (m :: * -> *).
MonadLog m =>
Qual -> [BamRec] -> m (Decision, [BamRec])
do_collapse Qual
maxq) ([BamRec] -> [BamRec] -> [BamRec]
forall a b. a -> b -> a
const [])

cons_collapse_keep :: MonadLog m => Qual -> Collapse m
cons_collapse_keep :: Qual -> Collapse m
cons_collapse_keep maxq :: Qual
maxq = ([BamRec] -> m (Decision, [BamRec]))
-> ([BamRec] -> [BamRec]) -> Collapse m
forall (m :: * -> *).
([BamRec] -> m (Decision, [BamRec]))
-> ([BamRec] -> [BamRec]) -> Collapse m
Collapse (Qual -> [BamRec] -> m (Decision, [BamRec])
forall (m :: * -> *).
MonadLog m =>
Qual -> [BamRec] -> m (Decision, [BamRec])
do_collapse Qual
maxq) ((BamRec -> BamRec) -> [BamRec] -> [BamRec]
forall a b. (a -> b) -> [a] -> [b]
map (\b :: BamRec
b -> BamRec
b { b_flag :: Int
b_flag = BamRec -> Int
b_flag BamRec
b Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagDuplicate }))

cheap_collapse :: Monad m => Collapse m
cheap_collapse :: Collapse m
cheap_collapse = ([BamRec] -> m (Decision, [BamRec]))
-> ([BamRec] -> [BamRec]) -> Collapse m
forall (m :: * -> *).
([BamRec] -> m (Decision, [BamRec]))
-> ([BamRec] -> [BamRec]) -> Collapse m
Collapse [BamRec] -> m (Decision, [BamRec])
forall (m :: * -> *). Monad m => [BamRec] -> m (Decision, [BamRec])
do_cheap_collapse ([BamRec] -> [BamRec] -> [BamRec]
forall a b. a -> b -> a
const [])

cheap_collapse_keep :: Monad m => Collapse m
cheap_collapse_keep :: Collapse m
cheap_collapse_keep = ([BamRec] -> m (Decision, [BamRec]))
-> ([BamRec] -> [BamRec]) -> Collapse m
forall (m :: * -> *).
([BamRec] -> m (Decision, [BamRec]))
-> ([BamRec] -> [BamRec]) -> Collapse m
Collapse [BamRec] -> m (Decision, [BamRec])
forall (m :: * -> *). Monad m => [BamRec] -> m (Decision, [BamRec])
do_cheap_collapse ((BamRec -> BamRec) -> [BamRec] -> [BamRec]
forall a b. (a -> b) -> [a] -> [b]
map (\b :: BamRec
b -> BamRec
b { b_flag :: Int
b_flag = BamRec -> Int
b_flag BamRec
b Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagDuplicate }))


-- | Removes duplicates from an aligned, sorted BAM stream.
--
-- The incoming stream must be sorted by coordinate, and we check for
-- violations of that assumption.  We cannot assume that length was
-- taken into account when sorting (samtools doesn't do so), so
-- duplicates may be separated by reads that start at the same position
-- but have different length or different strand.
--
-- We are looking at three different kinds of reads:  paired reads, true
-- single ended reads, merged or trimmed reads.  They are somewhat
-- different, but here's the situation if we wanted to treat them
-- separately.  These conditions define a set of duplicates:
--
-- Merged or trimmed:  We compare the leftmost coordinates and the
-- aligned length.  If the library prep is strand-preserving, we also
-- compare the strand.
--
-- Paired: We compare both left-most coordinates (b_pos and b_mpos).  If
-- the library prep is strand-preserving, only first-mates can be
-- duplicates of first-mates.  Else a first-mate can be the duplicate of
-- a second-mate.  There may be pairs with one unmapped mate.  This is
-- not a problem as they get assigned synthetic coordinates and will be
-- handled smoothly.
--
-- True singles:  We compare only the leftmost coordinate.  It does not
-- matter if the library prep is strand-preserving, the strand always
-- matters.
--
-- Across these classes, we can see more duplicates:
--
-- Merged/trimmed and paired:  these can be duplicates if the merging
-- failed for the pair.  We would need to compare the outer coordinates
-- of the merged reads to the two 5' coordinates of the pair.  However,
-- since we don't have access to the mate, we cannot actually do
-- anything right here.  This case should be solved externally by
-- merging those pairs that overlap in coordinate space.
--
-- Single and paired:  in the single case, we only have one coordinate
-- to compare.  This will inevitably lead to trouble, as we could find
-- that the single might be the duplicate of two pairs, but those two
-- pairs are definitely not duplicates of each other.  We solve it by
-- removing the single read(s).
--
-- Single and merged/trimmed:  same trouble as in the single+paired
-- case.  We remove the single to solve it.
--
--
-- In principle, we might want to allow some wiggle room in the
-- coordinates.  So far, this has not been implemented.  It adds the
-- complication that groups of separated reads can turn into a set of
-- duplicates because of the appearance of a new reads.  Needs some
-- thinking about... or maybe it's not too important.
--
-- Once a set of duplicates is collected, we perform a majority vote on
-- the correct CIGAR line.  Of all those reads that agree on this CIGAR
-- line, a consensus is called, quality scores are adjusted and clamped
-- to a maximum, the MD field is updated and the XP field is assigned
-- the number of reads in the original cluster.  The new MAPQ becomes
-- the RMSQ of the map qualities of all reads.
--
-- Treatment of Read Groups:  We generalize by providing a "label"
-- function; only reads that have the same label are considered
-- duplicates of each other.  The typical label function would extract
-- read groups, libraries or samples.

rmdup :: (MonadLog m, MonadThrow m, Hashable l, Eq l)
      => Bool -> Collapse m -> Q.Stream (Of (l,BamRec)) m r -> Q.Stream (Of ((l,Int),BamRec)) m r
rmdup :: Bool
-> Collapse m
-> Stream (Of (l, BamRec)) m r
-> Stream (Of ((l, Int), BamRec)) m r
rmdup strand_preserved :: Bool
strand_preserved collapse_cfg :: Collapse m
collapse_cfg =
    -- Easiest way to go about this:  We simply collect everything that
    -- starts at some specific coordinate and group it appropriately.
    -- Treat the groups separately, output, go on.
    (((l, Int), BamRec) -> (Refseq, Int))
-> RmdupException
-> Stream (Of ((l, Int), BamRec)) m r
-> Stream (Of ((l, Int), BamRec)) m r
forall (m :: * -> *) b a r.
(MonadThrow m, Ord b) =>
(a -> b)
-> RmdupException -> Stream (Of a) m r -> Stream (Of a) m r
check_sort (BamRec -> (Refseq, Int)
b_cpos (BamRec -> (Refseq, Int))
-> (((l, Int), BamRec) -> BamRec)
-> ((l, Int), BamRec)
-> (Refseq, Int)
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. ((l, Int), BamRec) -> BamRec
forall a b. (a, b) -> b
snd) RmdupException
InternalError (Stream (Of ((l, Int), BamRec)) m r
 -> Stream (Of ((l, Int), BamRec)) m r)
-> (Stream (Of (l, BamRec)) m r
    -> Stream (Of ((l, Int), BamRec)) m r)
-> Stream (Of (l, BamRec)) m r
-> Stream (Of ((l, Int), BamRec)) m r
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
.
    ([(l, BamRec)] -> m (Vector ((l, Int), BamRec)))
-> Stream (Of (l, BamRec)) m r
-> Stream (Of ((l, Int), BamRec)) m r
forall (m :: * -> *) (f :: * -> *) a a c.
(Monad m, Foldable f) =>
([(a, BamRec)] -> m (f a))
-> Stream (Of (a, BamRec)) m c -> Stream (Of a) m c
mapGroups ( ([((l, Int), BamRec)] -> Vector ((l, Int), BamRec))
-> m [((l, Int), BamRec)] -> m (Vector ((l, Int), BamRec))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ((forall s. Mutable Vector s ((l, Int), BamRec) -> ST s ())
-> Vector ((l, Int), BamRec) -> Vector ((l, Int), BamRec)
forall (v :: * -> *) a.
Vector v a =>
(forall s. Mutable v s a -> ST s ()) -> v a -> v a
V.modify (Comparison ((l, Int), BamRec)
-> MVector (PrimState (ST s)) ((l, Int), BamRec) -> ST s ()
forall (m :: * -> *) (v :: * -> * -> *) e.
(PrimMonad m, MVector v e) =>
Comparison e -> v (PrimState m) e -> m ()
VV.sortBy ((((l, Int), BamRec) -> Int) -> Comparison ((l, Int), BamRec)
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing ((((l, Int), BamRec) -> Int) -> Comparison ((l, Int), BamRec))
-> (((l, Int), BamRec) -> Int) -> Comparison ((l, Int), BamRec)
forall a b. (a -> b) -> a -> b
$ Vector_Nucs_half Nucleotides -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
V.length (Vector_Nucs_half Nucleotides -> Int)
-> (((l, Int), BamRec) -> Vector_Nucs_half Nucleotides)
-> ((l, Int), BamRec)
-> Int
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. BamRec -> Vector_Nucs_half Nucleotides
b_seq (BamRec -> Vector_Nucs_half Nucleotides)
-> (((l, Int), BamRec) -> BamRec)
-> ((l, Int), BamRec)
-> Vector_Nucs_half Nucleotides
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. ((l, Int), BamRec) -> BamRec
forall a b. (a, b) -> b
snd)) (Vector ((l, Int), BamRec) -> Vector ((l, Int), BamRec))
-> ([((l, Int), BamRec)] -> Vector ((l, Int), BamRec))
-> [((l, Int), BamRec)]
-> Vector ((l, Int), BamRec)
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. [((l, Int), BamRec)] -> Vector ((l, Int), BamRec)
forall a. [a] -> Vector a
VV.fromList)
              (m [((l, Int), BamRec)] -> m (Vector ((l, Int), BamRec)))
-> ([(l, BamRec)] -> m [((l, Int), BamRec)])
-> [(l, BamRec)]
-> m (Vector ((l, Int), BamRec))
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. (HashMap l [(Int, BamRec)] -> [((l, Int), BamRec)])
-> m (HashMap l [(Int, BamRec)]) -> m [((l, Int), BamRec)]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ((l
 -> [(Int, BamRec)] -> [((l, Int), BamRec)] -> [((l, Int), BamRec)])
-> [((l, Int), BamRec)]
-> HashMap l [(Int, BamRec)]
-> [((l, Int), BamRec)]
forall k v a. (k -> v -> a -> a) -> a -> HashMap k v -> a
M.foldrWithKey (\l :: l
l mrs :: [(Int, BamRec)]
mrs -> [((l, Int), BamRec)]
-> [((l, Int), BamRec)] -> [((l, Int), BamRec)]
forall a. [a] -> [a] -> [a]
(++) [ ((l
l,Int
n),BamRec
r) | (n :: Int
n,r :: BamRec
r) <- [(Int, BamRec)]
mrs ]) [])
              (m (HashMap l [(Int, BamRec)]) -> m [((l, Int), BamRec)])
-> ([(l, BamRec)] -> m (HashMap l [(Int, BamRec)]))
-> [(l, BamRec)]
-> m [((l, Int), BamRec)]
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. ([BamRec] -> m [(Int, BamRec)])
-> HashMap l [BamRec] -> m (HashMap l [(Int, BamRec)])
forall (t :: * -> *) (m :: * -> *) a b.
(Traversable t, Monad m) =>
(a -> m b) -> t a -> m (t b)
mapM (Bool -> Collapse m -> [BamRec] -> m [(Int, BamRec)]
forall (m :: * -> *).
Monad m =>
Bool -> Collapse m -> [BamRec] -> m [(Int, BamRec)]
do_rmdup Bool
strand_preserved Collapse m
collapse_cfg)
              (HashMap l [BamRec] -> m (HashMap l [(Int, BamRec)]))
-> ([(l, BamRec)] -> HashMap l [BamRec])
-> [(l, BamRec)]
-> m (HashMap l [(Int, BamRec)])
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. ((l, BamRec) -> l)
-> ((l, BamRec) -> BamRec) -> [(l, BamRec)] -> HashMap l [BamRec]
forall k a v.
(Hashable k, Eq k) =>
(a -> k) -> (a -> v) -> [a] -> HashMap k [v]
accumMap (l, BamRec) -> l
forall a b. (a, b) -> a
fst (l, BamRec) -> BamRec
forall a b. (a, b) -> b
snd ) (Stream (Of (l, BamRec)) m r -> Stream (Of ((l, Int), BamRec)) m r)
-> (Stream (Of (l, BamRec)) m r -> Stream (Of (l, BamRec)) m r)
-> Stream (Of (l, BamRec)) m r
-> Stream (Of ((l, Int), BamRec)) m r
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
.
    ((l, BamRec) -> (Refseq, Int))
-> RmdupException
-> Stream (Of (l, BamRec)) m r
-> Stream (Of (l, BamRec)) m r
forall (m :: * -> *) b a r.
(MonadThrow m, Ord b) =>
(a -> b)
-> RmdupException -> Stream (Of a) m r -> Stream (Of a) m r
check_sort (BamRec -> (Refseq, Int)
b_cpos (BamRec -> (Refseq, Int))
-> ((l, BamRec) -> BamRec) -> (l, BamRec) -> (Refseq, Int)
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. (l, BamRec) -> BamRec
forall a b. (a, b) -> b
snd) RmdupException
UnsortedInput
  where
    b_cpos :: BamRec -> (Refseq, Int)
b_cpos = BamRec -> Refseq
b_rname (BamRec -> Refseq) -> (BamRec -> Int) -> BamRec -> (Refseq, Int)
forall (a :: * -> * -> *) b c c'.
Arrow a =>
a b c -> a b c' -> a b (c, c')
&&& BamRec -> Int
b_pos

    mapGroups :: ([(a, BamRec)] -> m (f a))
-> Stream (Of (a, BamRec)) m c -> Stream (Of a) m c
mapGroups f :: [(a, BamRec)] -> m (f a)
f = m (Either c (Of (a, BamRec) (Stream (Of (a, BamRec)) m c)))
-> Stream
     (Of a) m (Either c (Of (a, BamRec) (Stream (Of (a, BamRec)) m c)))
forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (m (Either c (Of (a, BamRec) (Stream (Of (a, BamRec)) m c)))
 -> Stream
      (Of a) m (Either c (Of (a, BamRec) (Stream (Of (a, BamRec)) m c))))
-> (Stream (Of (a, BamRec)) m c
    -> m (Either c (Of (a, BamRec) (Stream (Of (a, BamRec)) m c))))
-> Stream (Of (a, BamRec)) m c
-> Stream
     (Of a) m (Either c (Of (a, BamRec) (Stream (Of (a, BamRec)) m c)))
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Stream (Of (a, BamRec)) m c
-> m (Either c (Of (a, BamRec) (Stream (Of (a, BamRec)) m c)))
forall (m :: * -> *) (f :: * -> *) r.
Monad m =>
Stream f m r -> m (Either r (f (Stream f m r)))
inspect (Stream (Of (a, BamRec)) m c
 -> Stream
      (Of a) m (Either c (Of (a, BamRec) (Stream (Of (a, BamRec)) m c))))
-> (Either c (Of (a, BamRec) (Stream (Of (a, BamRec)) m c))
    -> Stream (Of a) m c)
-> Stream (Of (a, BamRec)) m c
-> Stream (Of a) m c
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> (c -> Stream (Of a) m c)
-> (Of (a, BamRec) (Stream (Of (a, BamRec)) m c)
    -> Stream (Of a) m c)
-> Either c (Of (a, BamRec) (Stream (Of (a, BamRec)) m c))
-> Stream (Of a) m c
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either c -> Stream (Of a) m c
forall (f :: * -> *) a. Applicative f => a -> f a
pure (\(a :: (a, BamRec)
a :> s :: Stream (Of (a, BamRec)) m c
s) -> ([(a, BamRec)] -> m (f a))
-> [(a, BamRec)]
-> (a, BamRec)
-> Stream (Of (a, BamRec)) m c
-> Stream (Of a) m c
forall (m :: * -> *) (f :: * -> *) a a b.
(Monad m, Foldable f) =>
([(a, BamRec)] -> m (f a))
-> [(a, BamRec)]
-> (a, BamRec)
-> Stream (Of (a, BamRec)) m b
-> Stream (Of a) m b
mg1 [(a, BamRec)] -> m (f a)
f [] (a, BamRec)
a Stream (Of (a, BamRec)) m c
s)

    mg1 :: ([(a, BamRec)] -> m (f a))
-> [(a, BamRec)]
-> (a, BamRec)
-> Stream (Of (a, BamRec)) m b
-> Stream (Of a) m b
mg1 f :: [(a, BamRec)] -> m (f a)
f acc :: [(a, BamRec)]
acc a :: (a, BamRec)
a s :: Stream (Of (a, BamRec)) m b
s =
        m (Either b (Of (a, BamRec) (Stream (Of (a, BamRec)) m b)))
-> Stream
     (Of a) m (Either b (Of (a, BamRec) (Stream (Of (a, BamRec)) m b)))
forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (Stream (Of (a, BamRec)) m b
-> m (Either b (Of (a, BamRec) (Stream (Of (a, BamRec)) m b)))
forall (m :: * -> *) (f :: * -> *) r.
Monad m =>
Stream f m r -> m (Either r (f (Stream f m r)))
inspect Stream (Of (a, BamRec)) m b
s) Stream
  (Of a) m (Either b (Of (a, BamRec) (Stream (Of (a, BamRec)) m b)))
-> (Either b (Of (a, BamRec) (Stream (Of (a, BamRec)) m b))
    -> Stream (Of a) m b)
-> Stream (Of a) m b
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= \case
            Left r :: b
r                                  ->  m (f a) -> Stream (Of a) m (f a)
forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift ([(a, BamRec)] -> m (f a)
f ((a, BamRec)
a(a, BamRec) -> [(a, BamRec)] -> [(a, BamRec)]
forall a. a -> [a] -> [a]
:[(a, BamRec)]
acc)) Stream (Of a) m (f a)
-> (f a -> Stream (Of a) m ()) -> Stream (Of a) m ()
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= f a -> Stream (Of a) m ()
forall (m :: * -> *) (f :: * -> *) a.
(Monad m, Foldable f) =>
f a -> Stream (Of a) m ()
Q.each Stream (Of a) m () -> Stream (Of a) m b -> Stream (Of a) m b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> b -> Stream (Of a) m b
forall (f :: * -> *) a. Applicative f => a -> f a
pure b
r
            Right (b :: (a, BamRec)
b :> s' :: Stream (Of (a, BamRec)) m b
s')
                | BamRec -> (Refseq, Int)
b_cpos ((a, BamRec) -> BamRec
forall a b. (a, b) -> b
snd (a, BamRec)
a) (Refseq, Int) -> (Refseq, Int) -> Bool
forall a. Eq a => a -> a -> Bool
== BamRec -> (Refseq, Int)
b_cpos ((a, BamRec) -> BamRec
forall a b. (a, b) -> b
snd (a, BamRec)
b)  ->  ([(a, BamRec)] -> m (f a))
-> [(a, BamRec)]
-> (a, BamRec)
-> Stream (Of (a, BamRec)) m b
-> Stream (Of a) m b
mg1 [(a, BamRec)] -> m (f a)
f ((a, BamRec)
b(a, BamRec) -> [(a, BamRec)] -> [(a, BamRec)]
forall a. a -> [a] -> [a]
:[(a, BamRec)]
acc) (a, BamRec)
a Stream (Of (a, BamRec)) m b
s'
                | Bool
otherwise                         ->  m (f a) -> Stream (Of a) m (f a)
forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift ([(a, BamRec)] -> m (f a)
f ((a, BamRec)
a(a, BamRec) -> [(a, BamRec)] -> [(a, BamRec)]
forall a. a -> [a] -> [a]
:[(a, BamRec)]
acc)) Stream (Of a) m (f a)
-> (f a -> Stream (Of a) m ()) -> Stream (Of a) m ()
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= f a -> Stream (Of a) m ()
forall (m :: * -> *) (f :: * -> *) a.
(Monad m, Foldable f) =>
f a -> Stream (Of a) m ()
Q.each Stream (Of a) m () -> Stream (Of a) m b -> Stream (Of a) m b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> ([(a, BamRec)] -> m (f a))
-> [(a, BamRec)]
-> (a, BamRec)
-> Stream (Of (a, BamRec)) m b
-> Stream (Of a) m b
mg1 [(a, BamRec)] -> m (f a)
f [] (a, BamRec)
b Stream (Of (a, BamRec)) m b
s'

data RmdupException = UnsortedInput | InternalError deriving (Typeable, Int -> RmdupException -> ShowS
[RmdupException] -> ShowS
RmdupException -> String
(Int -> RmdupException -> ShowS)
-> (RmdupException -> String)
-> ([RmdupException] -> ShowS)
-> Show RmdupException
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [RmdupException] -> ShowS
$cshowList :: [RmdupException] -> ShowS
show :: RmdupException -> String
$cshow :: RmdupException -> String
showsPrec :: Int -> RmdupException -> ShowS
$cshowsPrec :: Int -> RmdupException -> ShowS
Show)
instance Exception RmdupException where
    displayException :: RmdupException -> String
displayException UnsortedInput = "input must be sorted for rmdup to work"
    displayException InternalError = "internal error, output isn't sorted anymore"

check_sort :: (MonadThrow m, Ord b) => (a -> b) -> RmdupException -> Stream (Of a) m r -> Stream (Of a) m r
check_sort :: (a -> b)
-> RmdupException -> Stream (Of a) m r -> Stream (Of a) m r
check_sort f :: a -> b
f msg :: RmdupException
msg = m (Either r (Of a (Stream (Of a) m r)))
-> Stream (Of a) m (Either r (Of a (Stream (Of a) m r)))
forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (m (Either r (Of a (Stream (Of a) m r)))
 -> Stream (Of a) m (Either r (Of a (Stream (Of a) m r))))
-> (Stream (Of a) m r -> m (Either r (Of a (Stream (Of a) m r))))
-> Stream (Of a) m r
-> Stream (Of a) m (Either r (Of a (Stream (Of a) m r)))
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Stream (Of a) m r -> m (Either r (Of a (Stream (Of a) m r)))
forall (m :: * -> *) (f :: * -> *) r.
Monad m =>
Stream f m r -> m (Either r (f (Stream f m r)))
inspect (Stream (Of a) m r
 -> Stream (Of a) m (Either r (Of a (Stream (Of a) m r))))
-> (Either r (Of a (Stream (Of a) m r)) -> Stream (Of a) m r)
-> Stream (Of a) m r
-> Stream (Of a) m r
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> (r -> Stream (Of a) m r)
-> (Of a (Stream (Of a) m r) -> Stream (Of a) m r)
-> Either r (Of a (Stream (Of a) m r))
-> Stream (Of a) m r
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either r -> Stream (Of a) m r
forall (f :: * -> *) a. Applicative f => a -> f a
pure Of a (Stream (Of a) m r) -> Stream (Of a) m r
forall (m :: * -> *) a.
MonadThrow m =>
Of a (Stream (Of a) m a) -> Stream (Of a) m a
step
  where
    step :: Of a (Stream (Of a) m a) -> Stream (Of a) m a
step (a :: a
a :> s :: Stream (Of a) m a
s) = m (Either a (Of a (Stream (Of a) m a)))
-> Stream (Of a) m (Either a (Of a (Stream (Of a) m a)))
forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (Stream (Of a) m a -> m (Either a (Of a (Stream (Of a) m a)))
forall (m :: * -> *) (f :: * -> *) r.
Monad m =>
Stream f m r -> m (Either r (f (Stream f m r)))
inspect Stream (Of a) m a
s) Stream (Of a) m (Either a (Of a (Stream (Of a) m a)))
-> (Either a (Of a (Stream (Of a) m a)) -> Stream (Of a) m a)
-> Stream (Of a) m a
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= (a -> Stream (Of a) m a)
-> (Of a (Stream (Of a) m a) -> Stream (Of a) m a)
-> Either a (Of a (Stream (Of a) m a))
-> Stream (Of a) m a
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either (a -> Stream (Of a) m a -> Stream (Of a) m a
forall (m :: * -> *) a r.
Monad m =>
a -> Stream (Of a) m r -> Stream (Of a) m r
Q.cons a
a (Stream (Of a) m a -> Stream (Of a) m a)
-> (a -> Stream (Of a) m a) -> a -> Stream (Of a) m a
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. a -> Stream (Of a) m a
forall (f :: * -> *) a. Applicative f => a -> f a
pure) (a -> Of a (Stream (Of a) m a) -> Stream (Of a) m a
step' a
a)
    step' :: a -> Of a (Stream (Of a) m a) -> Stream (Of a) m a
step' a :: a
a (b :: a
b :> s :: Stream (Of a) m a
s)
        | a -> b
f a
a b -> b -> Bool
forall a. Ord a => a -> a -> Bool
> a -> b
f a
b = m a -> Stream (Of a) m a
forall (t :: (* -> *) -> * -> *) (m :: * -> *) a.
(MonadTrans t, Monad m) =>
m a -> t m a
lift (m a -> Stream (Of a) m a) -> m a -> Stream (Of a) m a
forall a b. (a -> b) -> a -> b
$ RmdupException -> m a
forall (m :: * -> *) e a. (MonadThrow m, Exception e) => e -> m a
throwM RmdupException
msg
        | Bool
otherwise = a -> Stream (Of a) m a -> Stream (Of a) m a
forall (m :: * -> *) a r.
Monad m =>
a -> Stream (Of a) m r -> Stream (Of a) m r
Q.cons a
a (Stream (Of a) m a -> Stream (Of a) m a)
-> Stream (Of a) m a -> Stream (Of a) m a
forall a b. (a -> b) -> a -> b
$ Of a (Stream (Of a) m a) -> Stream (Of a) m a
step (a
b a -> Stream (Of a) m a -> Of a (Stream (Of a) m a)
forall a b. a -> b -> Of a b
:> Stream (Of a) m a
s)
{-# INLINE check_sort #-}

{- | Workhorse for duplicate removal.

 - Unmapped fragments should not be considered to be duplicates of
   mapped fragments.  The /unmapped/ flag can serve for that:  while
   there are two classes of /unmapped/ reads (those that are not mapped
   and those that are mapped to an invalid position), the two sets will
   always have different coordinates.  (Unfortunately, correct duplicate
   removal now relies on correct /unmapped/ and /mate unmapped/ flags,
   and we don't get them from unmodified BWA.  So correct operation
   requires patched BWA or a run of @bam-fixpair@.)

   (1) Other definitions (e.g. lack of CIGAR) don't work, because that
       information won't be available for the mate.

   (2) This would amount to making the /unmapped/ flag part of the
       coordinate, but samtools is not going to take it into account
       when sorting.

   (3) Instead, both flags become part of the /mate pos/ grouping
       criterion.

 - First Mates should (probably) not be considered duplicates of Second
   Mates.  This is unconditionally true for libraries with A\/B-style
   adapters (definitely 454, probably Mathias' ds protocol) and the ss
   protocol, it is not true for fork adapter protocols (vanilla Illumina
   protocol).  So it has to be an option, which would ideally be derived
   from header information.

 - This code ignores read groups, but it will do a majority vote on the
   @RG@ field and call consensi for the index sequences.  If you believe
   that duplicates across read groups are impossible, you must call it
   with an appropriately filtered stream.

 - Half-Aligned Pairs (meaning one known coordinate, while the validity
   of the alignments is immaterial) are rather complicated:

   (1) Given that only one coordinate is known (5' of the aligned mate),
       we want to treat them like true singles.  But the unaligned mate
       should be kept if possible, though it should not contribute to a
       consensus sequence.  We assume nothing about the unaligned mate,
       not even that it /shouldn't/ be aligned, never mind the fact that
       it /couldn't/ be.  (The difference is in the finite abilities of
       real world aligners, naturally.)

   (2) Therefore, aligned reads with unaligned mates go to the same
       potential duplicate set as true singletons.  If at least one pair
       exists that might be a duplicate of those, all singletons and
       half-aligned mates are discarded.  Else a consensus is computed
       and replaces the aligned mates.

   (3) The unaligned mates end up in the same place in a BAM stream as
       the aligned mates (therefore we see them and can treat them
       locally).  We cannot call a consensus, since these molecules may
       well have different length, so we select one.  It doesn't really
       matter which one is selected, and since we're treating both mates
       at the same time, it doesn't even need to be reproducible without
       local information.  This is made to be the mate of the consensus.

   (4) See 'merge_singles' for how it's actually done.
-}

do_rmdup :: Monad m => Bool -> Collapse m -> [BamRec] -> m [(Int,BamRec)]
do_rmdup :: Bool -> Collapse m -> [BamRec] -> m [(Int, BamRec)]
do_rmdup strand_preserved :: Bool
strand_preserved Collapse{..} rds :: [BamRec]
rds = do
    let (raw_pairs :: [BamRec]
raw_pairs, raw_singles :: [BamRec]
raw_singles)       = (BamRec -> Bool) -> [BamRec] -> ([BamRec], [BamRec])
forall a. (a -> Bool) -> [a] -> ([a], [a])
partition BamRec -> Bool
isPaired [BamRec]
rds
        (merged :: [BamRec]
merged, true_singles :: [BamRec]
true_singles)         = (BamRec -> Bool) -> [BamRec] -> ([BamRec], [BamRec])
forall a. (a -> Bool) -> [a] -> ([a], [a])
partition ((Bool -> Bool -> Bool)
-> (BamRec -> Bool) -> (BamRec -> Bool) -> BamRec -> Bool
forall (f :: * -> *) a b c.
Applicative f =>
(a -> b -> c) -> f a -> f b -> f c
liftA2 Bool -> Bool -> Bool
(||) BamRec -> Bool
isMerged BamRec -> Bool
isTrimmed) [BamRec]
raw_singles

        (pairs :: [BamRec]
pairs, raw_half_pairs :: [BamRec]
raw_half_pairs)        = (BamRec -> Bool) -> [BamRec] -> ([BamRec], [BamRec])
forall a. (a -> Bool) -> [a] -> ([a], [a])
partition BamRec -> Bool
b_totally_aligned [BamRec]
raw_pairs
        (half_unaligned :: [BamRec]
half_unaligned, half_aligned :: [BamRec]
half_aligned) = (BamRec -> Bool) -> [BamRec] -> ([BamRec], [BamRec])
forall a. (a -> Bool) -> [a] -> ([a], [a])
partition BamRec -> Bool
isUnmapped [BamRec]
raw_half_pairs

        unaligned' :: HashMap Bool [BamRec]
unaligned'                     = (BamRec -> Bool)
-> (BamRec -> BamRec) -> [BamRec] -> HashMap Bool [BamRec]
forall k a v.
(Hashable k, Eq k) =>
(a -> k) -> (a -> v) -> [a] -> HashMap k [v]
accumMap BamRec -> Bool
b_strand BamRec -> BamRec
forall k (cat :: k -> k -> *) (a :: k). Category cat => cat a a
id [BamRec]
half_unaligned

    (pairs' :: HashMap ((Refseq, (Int, (Bool, Bool))), Bool, Bool) (Int, Decision)
pairs',r1 :: [BamRec]
r1)   <- (BamRec -> ((Refseq, (Int, (Bool, Bool))), Bool, Bool))
-> [BamRec]
-> m (HashMap
        ((Refseq, (Int, (Bool, Bool))), Bool, Bool) (Int, Decision),
      [BamRec])
forall k.
(Hashable k, Eq k) =>
(BamRec -> k)
-> [BamRec] -> m (HashMap k (Int, Decision), [BamRec])
mkMap (\b :: BamRec
b -> (BamRec -> (Refseq, (Int, (Bool, Bool)))
b_mate_pos BamRec
b, BamRec -> Bool
b_strand BamRec
b, BamRec -> Bool
b_mate BamRec
b))    [BamRec]
pairs
    (merged' :: HashMap (Int, Bool) (Int, Decision)
merged',r2 :: [BamRec]
r2)  <- (BamRec -> (Int, Bool))
-> [BamRec] -> m (HashMap (Int, Bool) (Int, Decision), [BamRec])
forall k.
(Hashable k, Eq k) =>
(BamRec -> k)
-> [BamRec] -> m (HashMap k (Int, Decision), [BamRec])
mkMap (\b :: BamRec
b -> (Vector Cigar -> Int
forall (v :: * -> *). Vector v Cigar => v Cigar -> Int
alignedLength (BamRec -> Vector Cigar
b_cigar BamRec
b), BamRec -> Bool
b_strand BamRec
b)) [BamRec]
merged
    (singles' :: HashMap Bool (Int, Decision)
singles',r3 :: [BamRec]
r3) <- (BamRec -> Bool)
-> [BamRec] -> m (HashMap Bool (Int, Decision), [BamRec])
forall k.
(Hashable k, Eq k) =>
(BamRec -> k)
-> [BamRec] -> m (HashMap k (Int, Decision), [BamRec])
mkMap (\b :: BamRec
b ->  BamRec -> Bool
b_strand BamRec
b)                            ([BamRec]
true_singles[BamRec] -> [BamRec] -> [BamRec]
forall a. [a] -> [a] -> [a]
++[BamRec]
half_aligned)

    let (results :: [(Int, BamRec)]
results, leftovers :: [BamRec]
leftovers) = HashMap Bool (Int, Decision)
-> HashMap Bool [BamRec]
-> [(Bool, (Int, BamRec))]
-> ([(Int, BamRec)], [BamRec])
merge_singles HashMap Bool (Int, Decision)
singles' HashMap Bool [BamRec]
unaligned' ([(Bool, (Int, BamRec))] -> ([(Int, BamRec)], [BamRec]))
-> [(Bool, (Int, BamRec))] -> ([(Int, BamRec)], [BamRec])
forall a b. (a -> b) -> a -> b
$
                [ (Bool
str, (Decision -> BamRec) -> (Int, Decision) -> (Int, BamRec)
forall (p :: * -> * -> *) b c a.
Bifunctor p =>
(b -> c) -> p a b -> p a c
second Decision -> BamRec
fromDecision (Int, Decision)
b) | ((_,str :: Bool
str  ),b :: (Int, Decision)
b) <- HashMap (Int, Bool) (Int, Decision)
-> [((Int, Bool), (Int, Decision))]
forall k v. HashMap k v -> [(k, v)]
M.toList HashMap (Int, Bool) (Int, Decision)
merged' ] [(Bool, (Int, BamRec))]
-> [(Bool, (Int, BamRec))] -> [(Bool, (Int, BamRec))]
forall a. [a] -> [a] -> [a]
++
                [ (Bool
str, (Decision -> BamRec) -> (Int, Decision) -> (Int, BamRec)
forall (p :: * -> * -> *) b c a.
Bifunctor p =>
(b -> c) -> p a b -> p a c
second Decision -> BamRec
fromDecision (Int, Decision)
b) | ((_,str :: Bool
str,_),b :: (Int, Decision)
b) <- HashMap ((Refseq, (Int, (Bool, Bool))), Bool, Bool) (Int, Decision)
-> [(((Refseq, (Int, (Bool, Bool))), Bool, Bool), (Int, Decision))]
forall k v. HashMap k v -> [(k, v)]
M.toList HashMap ((Refseq, (Int, (Bool, Bool))), Bool, Bool) (Int, Decision)
pairs' ]
    [(Int, BamRec)] -> m [(Int, BamRec)]
forall (f :: * -> *) a. Applicative f => a -> f a
pure ([(Int, BamRec)] -> m [(Int, BamRec)])
-> [(Int, BamRec)] -> m [(Int, BamRec)]
forall a b. (a -> b) -> a -> b
$ [(Int, BamRec)]
results [(Int, BamRec)] -> [(Int, BamRec)] -> [(Int, BamRec)]
forall a. [a] -> [a] -> [a]
++ (BamRec -> (Int, BamRec)) -> [BamRec] -> [(Int, BamRec)]
forall a b. (a -> b) -> [a] -> [b]
map ((,) 0) ([BamRec] -> [BamRec]
originals ([BamRec]
leftovers [BamRec] -> [BamRec] -> [BamRec]
forall a. [a] -> [a] -> [a]
++ [BamRec]
r1 [BamRec] -> [BamRec] -> [BamRec]
forall a. [a] -> [a] -> [a]
++ [BamRec]
r2 [BamRec] -> [BamRec] -> [BamRec]
forall a. [a] -> [a] -> [a]
++ [BamRec]
r3))
  where
    mkMap :: (BamRec -> k)
-> [BamRec] -> m (HashMap k (Int, Decision), [BamRec])
mkMap f :: BamRec -> k
f x :: [BamRec]
x = do HashMap k (Int, (Decision, [BamRec]))
m1 <- ([BamRec] -> m (Int, (Decision, [BamRec])))
-> HashMap k [BamRec] -> m (HashMap k (Int, (Decision, [BamRec])))
forall (t :: * -> *) (f :: * -> *) a b.
(Traversable t, Applicative f) =>
(a -> f b) -> t a -> f (t b)
traverse (\xs :: [BamRec]
xs -> (,) ([BamRec] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [BamRec]
xs) ((Decision, [BamRec]) -> (Int, (Decision, [BamRec])))
-> m (Decision, [BamRec]) -> m (Int, (Decision, [BamRec]))
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [BamRec] -> m (Decision, [BamRec])
collapse [BamRec]
xs) (HashMap k [BamRec] -> m (HashMap k (Int, (Decision, [BamRec]))))
-> HashMap k [BamRec] -> m (HashMap k (Int, (Decision, [BamRec])))
forall a b. (a -> b) -> a -> b
$ (BamRec -> k)
-> (BamRec -> BamRec) -> [BamRec] -> HashMap k [BamRec]
forall k a v.
(Hashable k, Eq k) =>
(a -> k) -> (a -> v) -> [a] -> HashMap k [v]
accumMap BamRec -> k
f BamRec -> BamRec
forall k (cat :: k -> k -> *) (a :: k). Category cat => cat a a
id [BamRec]
x
                   (HashMap k (Int, Decision), [BamRec])
-> m (HashMap k (Int, Decision), [BamRec])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (((Int, (Decision, [BamRec])) -> (Int, Decision))
-> HashMap k (Int, (Decision, [BamRec]))
-> HashMap k (Int, Decision)
forall v1 v2 k. (v1 -> v2) -> HashMap k v1 -> HashMap k v2
M.map (((Decision, [BamRec]) -> Decision)
-> (Int, (Decision, [BamRec])) -> (Int, Decision)
forall (p :: * -> * -> *) b c a.
Bifunctor p =>
(b -> c) -> p a b -> p a c
second (Decision, [BamRec]) -> Decision
forall a b. (a, b) -> a
fst) HashMap k (Int, (Decision, [BamRec]))
m1, ((Int, (Decision, [BamRec])) -> [BamRec])
-> [(Int, (Decision, [BamRec]))] -> [BamRec]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap ((Decision, [BamRec]) -> [BamRec]
forall a b. (a, b) -> b
snd((Decision, [BamRec]) -> [BamRec])
-> ((Int, (Decision, [BamRec])) -> (Decision, [BamRec]))
-> (Int, (Decision, [BamRec]))
-> [BamRec]
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
.(Int, (Decision, [BamRec])) -> (Decision, [BamRec])
forall a b. (a, b) -> b
snd) ([(Int, (Decision, [BamRec]))] -> [BamRec])
-> [(Int, (Decision, [BamRec]))] -> [BamRec]
forall a b. (a -> b) -> a -> b
$ HashMap k (Int, (Decision, [BamRec]))
-> [(Int, (Decision, [BamRec]))]
forall k v. HashMap k v -> [v]
M.elems HashMap k (Int, (Decision, [BamRec]))
m1)

    b_strand :: BamRec -> Bool
b_strand b :: BamRec
b = Bool
strand_preserved Bool -> Bool -> Bool
&& BamRec -> Bool
isReversed  BamRec
b
    b_mate :: BamRec -> Bool
b_mate   b :: BamRec
b = Bool
strand_preserved Bool -> Bool -> Bool
&& BamRec -> Bool
isFirstMate BamRec
b
    b_mate_pos :: BamRec -> (Refseq, (Int, (Bool, Bool)))
b_mate_pos = BamRec -> Refseq
b_mrnm (BamRec -> Refseq)
-> (BamRec -> (Int, (Bool, Bool)))
-> BamRec
-> (Refseq, (Int, (Bool, Bool)))
forall (a :: * -> * -> *) b c c'.
Arrow a =>
a b c -> a b c' -> a b (c, c')
&&& BamRec -> Int
b_mpos (BamRec -> Int)
-> (BamRec -> (Bool, Bool)) -> BamRec -> (Int, (Bool, Bool))
forall (a :: * -> * -> *) b c c'.
Arrow a =>
a b c -> a b c' -> a b (c, c')
&&& BamRec -> Bool
isUnmapped (BamRec -> Bool) -> (BamRec -> Bool) -> BamRec -> (Bool, Bool)
forall (a :: * -> * -> *) b c c'.
Arrow a =>
a b c -> a b c' -> a b (c, c')
&&& BamRec -> Bool
isMateUnmapped
    b_totally_aligned :: BamRec -> Bool
b_totally_aligned  = Bool -> Bool -> Bool
(&&) (Bool -> Bool -> Bool)
-> (BamRec -> Bool) -> BamRec -> Bool -> Bool
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Bool -> Bool
not (Bool -> Bool) -> (BamRec -> Bool) -> BamRec -> Bool
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. BamRec -> Bool
isUnmapped (BamRec -> Bool -> Bool) -> (BamRec -> Bool) -> BamRec -> Bool
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> Bool -> Bool
not (Bool -> Bool) -> (BamRec -> Bool) -> BamRec -> Bool
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. BamRec -> Bool
isMateUnmapped




-- | Merging information about true singles, merged singles,
-- half-aligned pairs, actually aligned pairs.
--
-- We collected aligned reads with unaligned mates together with aligned
-- true singles (@singles@).  We collected the unaligned mates, which
-- necessarily have the exact same alignment coordinates, separately
-- (@unaligned@).  If we don't find a matching true pair (that case is
-- already handled smoothly), we keep the highest quality unaligned
-- mate, pair it with the consensus of the aligned mates and aligned
-- singletons, and give it the lexically smallest name of the
-- half-aligned pairs.

-- NOTE:  I need to decide when to run 'make_singleton'.  Basically,
-- when we call a consensus for half-aligned pairs and keep
-- everything(?).  Then we don't have a mate for the consensus... though
-- we could decide to duplicate one mate read to get it.

merge_singles :: M.HashMap Bool (Int,Decision)          -- strand --> true singles & half aligned
              -> M.HashMap Bool [BamRec]                -- strand --> half unaligned
              -> [ (Bool, (Int, BamRec)) ]              -- strand --> paireds & mergeds
              -> ([(Int,BamRec)],[BamRec])              -- results, leftovers

merge_singles :: HashMap Bool (Int, Decision)
-> HashMap Bool [BamRec]
-> [(Bool, (Int, BamRec))]
-> ([(Int, BamRec)], [BamRec])
merge_singles singles :: HashMap Bool (Int, Decision)
singles unaligneds :: HashMap Bool [BamRec]
unaligneds = [(Bool, (Int, BamRec))] -> ([(Int, BamRec)], [BamRec])
go
  where
    -- Say we generated a consensus or passed something through.  If
    -- there is a singleton consensus with the same strand, we should
    -- add in its XP field and discard it.  If there is a singleton
    -- representative, we add in its XP field and put it into the
    -- leftovers.  If there is unaligned stuff here that has the same
    -- strand, it goes to the leftovers.
    go :: [(Bool, (Int, BamRec))] -> ([(Int, BamRec)], [BamRec])
go ( (str :: Bool
str, (m :: Int
m,v :: BamRec
v)) : paireds :: [(Bool, (Int, BamRec))]
paireds) =
        let (r :: [(Int, BamRec)]
r,l :: [BamRec]
l) = HashMap Bool (Int, Decision)
-> HashMap Bool [BamRec]
-> [(Bool, (Int, BamRec))]
-> ([(Int, BamRec)], [BamRec])
merge_singles (Bool
-> HashMap Bool (Int, Decision) -> HashMap Bool (Int, Decision)
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> HashMap k v
M.delete Bool
str HashMap Bool (Int, Decision)
singles) (Bool -> HashMap Bool [BamRec] -> HashMap Bool [BamRec]
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> HashMap k v
M.delete Bool
str HashMap Bool [BamRec]
unaligneds) [(Bool, (Int, BamRec))]
paireds
            unal :: [BamRec]
unal  = [BamRec] -> Bool -> HashMap Bool [BamRec] -> [BamRec]
forall k v. (Eq k, Hashable k) => v -> k -> HashMap k v -> v
M.lookupDefault [] Bool
str HashMap Bool [BamRec]
unaligneds [BamRec] -> [BamRec] -> [BamRec]
forall a. [a] -> [a] -> [a]
++ [BamRec]
l

        in case Bool -> HashMap Bool (Int, Decision) -> Maybe (Int, Decision)
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> Maybe v
M.lookup Bool
str HashMap Bool (Int, Decision)
singles of
            Nothing                    -> (              (Int
m,BamRec
v) (Int, BamRec) -> [(Int, BamRec)] -> [(Int, BamRec)]
forall a. a -> [a] -> [a]
: [(Int, BamRec)]
r,     [BamRec]
unal )
            Just (n :: Int
n, Consensus      w :: BamRec
w) -> ( (Int
n, BamRec -> BamRec -> BamRec
add_xp_of BamRec
w BamRec
v) (Int, BamRec) -> [(Int, BamRec)] -> [(Int, BamRec)]
forall a. a -> [a] -> [a]
: [(Int, BamRec)]
r,     [BamRec]
unal )
            Just (n :: Int
n, Representative w :: BamRec
w) -> ( (Int
n, BamRec -> BamRec -> BamRec
add_xp_of BamRec
w BamRec
v) (Int, BamRec) -> [(Int, BamRec)] -> [(Int, BamRec)]
forall a. a -> [a] -> [a]
: [(Int, BamRec)]
r, BamRec
w BamRec -> [BamRec] -> [BamRec]
forall a. a -> [a] -> [a]
: [BamRec]
unal )

    -- No more pairs, delegate the problem
    go [] = HashMap Bool [BamRec]
-> [(Bool, (Int, Decision))] -> ([(Int, BamRec)], [BamRec])
merge_halves HashMap Bool [BamRec]
unaligneds (HashMap Bool (Int, Decision) -> [(Bool, (Int, Decision))]
forall k v. HashMap k v -> [(k, v)]
M.toList HashMap Bool (Int, Decision)
singles)

    add_xp_of :: BamRec -> BamRec -> BamRec
add_xp_of w :: BamRec
w v :: BamRec
v = BamRec
v { b_exts :: Extensions
b_exts = BamKey -> Ext -> Extensions -> Extensions
updateE "XP" (Int -> Ext
Int (Int -> Ext) -> Int -> Ext
forall a b. (a -> b) -> a -> b
$ Int -> BamKey -> BamRec -> Int
extAsInt 1 "XP" BamRec
w Int -> Int -> Int
`oplus` Int -> BamKey -> BamRec -> Int
extAsInt 1 "XP" BamRec
v) (BamRec -> Extensions
b_exts BamRec
v) }

-- | Merging of half-aligned reads.  The first argument is a map of
-- unaligned reads (their mates are aligned to the current position),
-- the second is a list of reads that are aligned (their mates are not
-- aligned).
--
-- So, suppose we're looking at a 'Representative' that was passed
-- through.  We need to emit it along with its mate, which may be hidden
-- inside a list.  (Alternatively, we could force it to single, but that
-- fails if we're passing everything along somehow.)
--
-- Suppose we're looking at a 'Consensus'.  We could pair it with some
-- mate (which we'd need to duplicate), or we could turn it into a
-- singleton.  Duplication is ugly, so in this case, we force it to
-- singleton.

merge_halves :: M.HashMap Bool [BamRec]                 -- strand --> half unaligned
             -> [(Bool, (Int,Decision))]                -- strand --> true singles & half aligned
             -> ([(Int,BamRec)],[BamRec])               -- results, leftovers

-- Emitting a consensus: make it a single.  Nothing goes to leftovers;
-- we may still need it for something else to be emitted.  (While that
-- would be strange, making sure the BAM file stays completely valid is
-- probably better.)
merge_halves :: HashMap Bool [BamRec]
-> [(Bool, (Int, Decision))] -> ([(Int, BamRec)], [BamRec])
merge_halves unaligneds :: HashMap Bool [BamRec]
unaligneds ((_, (n :: Int
n, Consensus v :: BamRec
v)) : singles :: [(Bool, (Int, Decision))]
singles) =
    ( (Int
n, BamRec
v { b_flag :: Int
b_flag = BamRec -> Int
b_flag BamRec
v Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int -> Int
forall a. Bits a => a -> a
complement Int
pflags }) (Int, BamRec) -> [(Int, BamRec)] -> [(Int, BamRec)]
forall a. a -> [a] -> [a]
: [(Int, BamRec)]
r, [BamRec]
l )
  where
    (r :: [(Int, BamRec)]
r,l :: [BamRec]
l)  = HashMap Bool [BamRec]
-> [(Bool, (Int, Decision))] -> ([(Int, BamRec)], [BamRec])
merge_halves HashMap Bool [BamRec]
unaligneds [(Bool, (Int, Decision))]
singles
    pflags :: Int
pflags = Int
flagPaired Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagProperlyPaired Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagMateUnmapped Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagMateReversed Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagFirstMate Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagSecondMate


-- Emitting a representative:  find the mate in the list of unaligned
-- reads (take up to one match to be robust), and emit that, too, as a
-- result.  Everything else goes to leftovers.  If the representative
-- happens to be unpaired, no mate is found and that case therefore is
-- handled smoothly.
merge_halves unaligneds :: HashMap Bool [BamRec]
unaligneds ((str :: Bool
str, (n :: Int
n, Representative v :: BamRec
v)) : singles :: [(Bool, (Int, Decision))]
singles) =
    ((Int
n,BamRec
v) (Int, BamRec) -> [(Int, BamRec)] -> [(Int, BamRec)]
forall a. a -> [a] -> [a]
: (BamRec -> (Int, BamRec)) -> [BamRec] -> [(Int, BamRec)]
forall a b. (a -> b) -> [a] -> [b]
map ((,)1) (Int -> [BamRec] -> [BamRec]
forall a. Int -> [a] -> [a]
take 1 [BamRec]
same) [(Int, BamRec)] -> [(Int, BamRec)] -> [(Int, BamRec)]
forall a. [a] -> [a] -> [a]
++ [(Int, BamRec)]
r, Int -> [BamRec] -> [BamRec]
forall a. Int -> [a] -> [a]
drop 1 [BamRec]
same [BamRec] -> [BamRec] -> [BamRec]
forall a. [a] -> [a] -> [a]
++ [BamRec]
diff [BamRec] -> [BamRec] -> [BamRec]
forall a. [a] -> [a] -> [a]
++ [BamRec]
l)
  where
    (r :: [(Int, BamRec)]
r,l :: [BamRec]
l)          = HashMap Bool [BamRec]
-> [(Bool, (Int, Decision))] -> ([(Int, BamRec)], [BamRec])
merge_halves (Bool -> HashMap Bool [BamRec] -> HashMap Bool [BamRec]
forall k v. (Eq k, Hashable k) => k -> HashMap k v -> HashMap k v
M.delete Bool
str HashMap Bool [BamRec]
unaligneds) [(Bool, (Int, Decision))]
singles
    (same :: [BamRec]
same,diff :: [BamRec]
diff)    = (BamRec -> Bool) -> [BamRec] -> ([BamRec], [BamRec])
forall a. (a -> Bool) -> [a] -> ([a], [a])
partition (BamRec -> BamRec -> Bool
is_mate_of BamRec
v) ([BamRec] -> ([BamRec], [BamRec]))
-> [BamRec] -> ([BamRec], [BamRec])
forall a b. (a -> b) -> a -> b
$ [BamRec] -> Bool -> HashMap Bool [BamRec] -> [BamRec]
forall k v. (Eq k, Hashable k) => v -> k -> HashMap k v -> v
M.lookupDefault [] Bool
str HashMap Bool [BamRec]
unaligneds
    is_mate_of :: BamRec -> BamRec -> Bool
is_mate_of a :: BamRec
a b :: BamRec
b = BamRec -> Bytes
b_qname BamRec
a Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== BamRec -> Bytes
b_qname BamRec
b Bool -> Bool -> Bool
&& BamRec -> Bool
isPaired BamRec
a Bool -> Bool -> Bool
&& BamRec -> Bool
isPaired BamRec
b Bool -> Bool -> Bool
&& BamRec -> Bool
isFirstMate BamRec
a Bool -> Bool -> Bool
forall a. Eq a => a -> a -> Bool
== BamRec -> Bool
isSecondMate BamRec
b

-- No more singles, all unaligneds are leftovers.
merge_halves unaligneds :: HashMap Bool [BamRec]
unaligneds [] =
    ( [], [[BamRec]] -> [BamRec]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat ([[BamRec]] -> [BamRec]) -> [[BamRec]] -> [BamRec]
forall a b. (a -> b) -> a -> b
$ HashMap Bool [BamRec] -> [[BamRec]]
forall k v. HashMap k v -> [v]
M.elems HashMap Bool [BamRec]
unaligneds )


accumMap :: (Hashable k, Eq k) => (a -> k) -> (a -> v) -> [a] -> M.HashMap k [v]
accumMap :: (a -> k) -> (a -> v) -> [a] -> HashMap k [v]
accumMap f :: a -> k
f g :: a -> v
g = HashMap k [v] -> [a] -> HashMap k [v]
go HashMap k [v]
forall k v. HashMap k v
M.empty
  where
    go :: HashMap k [v] -> [a] -> HashMap k [v]
go m :: HashMap k [v]
m [    ] = HashMap k [v]
m
    go m :: HashMap k [v]
m (a :: a
a:as :: [a]
as) = let ws :: [v]
ws = [v] -> k -> HashMap k [v] -> [v]
forall k v. (Eq k, Hashable k) => v -> k -> HashMap k v -> v
M.lookupDefault [] (a -> k
f a
a) HashMap k [v]
m ; g' :: v
g' = a -> v
g a
a
                  in v
g' v -> HashMap k [v] -> HashMap k [v]
forall a b. a -> b -> b
`seq` HashMap k [v] -> [a] -> HashMap k [v]
go (k -> [v] -> HashMap k [v] -> HashMap k [v]
forall k v.
(Eq k, Hashable k) =>
k -> v -> HashMap k v -> HashMap k v
M.insert (a -> k
f a
a) (v
g'v -> [v] -> [v]
forall a. a -> [a] -> [a]
:[v]
ws) HashMap k [v]
m) [a]
as


{- We need to deal sensibly with each field, but different fields have
   different needs.  We can take the value from the first read to
   preserve determinism or because all reads should be equal anyway,
   aggregate over all reads computing either RMSQ or the most common
   value, delete a field because it wouldn't make sense anymore or
   because doing something sensible would be hard and we're going to
   ignore it anyway, or we calculate some special value; see below.
   Unknown fields will be taken from the first read, which seems to be a
   safe default.

   QNAME and most fields              taken from first
   FLAG qc fail                       majority vote
        dup                           deleted
   MAPQ                               rmsq
   CIGAR, SEQ, QUAL, MD, NM, XP       generated
   XA                                 concatenate all

   BQ, CM, FZ, Q2, R2, XM, XO, XG, YQ, EN
         deleted because they would become wrong

   CQ, CS, E2, FS, OQ, OP, OC, U2, H0, H1, H2, HI, NH, IH, ZQ
         delete because they will be ignored anyway

   AM, AS, MQ, PQ, SM, UQ
         compute rmsq

   X0, X1, XT, XS, XF, XE, BC, LB, RG, XI, YI, XJ, YJ
         majority vote -}

do_collapse :: MonadLog m => Qual -> [BamRec] -> m (Decision, [BamRec])
do_collapse :: Qual -> [BamRec] -> m (Decision, [BamRec])
do_collapse maxq :: Qual
maxq [br :: BamRec
br]
    | Bool -> (Vector Qual -> Bool) -> Maybe (Vector Qual) -> Bool
forall b a. b -> (a -> b) -> Maybe a -> b
maybe Bool
True ((Qual -> Bool) -> Vector Qual -> Bool
forall (v :: * -> *) a. Vector v a => (a -> Bool) -> v a -> Bool
V.all (Qual -> Qual -> Bool
forall a. Ord a => a -> a -> Bool
<= Qual
maxq)) (BamRec -> Maybe (Vector Qual)
b_qual BamRec
br) = (Decision, [BamRec]) -> m (Decision, [BamRec])
forall (f :: * -> *) a. Applicative f => a -> f a
pure ( BamRec -> Decision
Representative BamRec
br, [  ] ) -- no modification, pass through
    | Bool
otherwise                                = (Decision, [BamRec]) -> m (Decision, [BamRec])
forall (f :: * -> *) a. Applicative f => a -> f a
pure ( BamRec -> Decision
Consensus   BamRec
lq_br, [BamRec
br] ) -- qualities reduced, must keep original
  where
    lq_br :: BamRec
lq_br = BamRec
br { b_qual :: Maybe (Vector Qual)
b_qual  = (Qual -> Qual) -> Vector Qual -> Vector Qual
forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
V.map (Qual -> Qual -> Qual
forall a. Ord a => a -> a -> a
min Qual
maxq) (Vector Qual -> Vector Qual)
-> Maybe (Vector Qual) -> Maybe (Vector Qual)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> BamRec -> Maybe (Vector Qual)
b_qual BamRec
br
               , b_virtual_offset :: Int64
b_virtual_offset = 0
               , b_qname :: Bytes
b_qname = BamRec -> Bytes
b_qname BamRec
br Bytes -> Word8 -> Bytes
`B.snoc` Char -> Word8
c2w 'c' }

do_collapse maxq :: Qual
maxq  brs :: [BamRec]
brs = do
    Maybe BrokenMD -> (BrokenMD -> m ()) -> m ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
t a -> (a -> m b) -> m ()
forM_ Maybe BrokenMD
ws ((BrokenMD -> m ()) -> m ()) -> (BrokenMD -> m ()) -> m ()
forall a b. (a -> b) -> a -> b
$ Level -> BrokenMD -> m ()
forall (m :: * -> *) e.
(MonadLog m, Exception e) =>
Level -> e -> m ()
logMsg Level
Warning
    (Decision, [BamRec]) -> m (Decision, [BamRec])
forall (f :: * -> *) a. Applicative f => a -> f a
pure ( BamRec -> Decision
Consensus BamRec
b0 { b_exts :: Extensions
b_exts  = Extensions -> Extensions
modify_extensions (Extensions -> Extensions) -> Extensions -> Extensions
forall a b. (a -> b) -> a -> b
$ BamRec -> Extensions
b_exts BamRec
b0
                        , b_flag :: Int
b_flag  = Int
failflag Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int -> Int
forall a. Bits a => a -> a
complement Int
flagDuplicate
                        , b_mapq :: Qual
b_mapq  = Word8 -> Qual
Q (Word8 -> Qual) -> Word8 -> Qual
forall a b. (a -> b) -> a -> b
$ [Word8] -> Word8
forall (t :: * -> *) a b.
(Foldable t, Integral a, Integral b) =>
t a -> b
rmsq ([Word8] -> Word8) -> [Word8] -> Word8
forall a b. (a -> b) -> a -> b
$ (BamRec -> Word8) -> [BamRec] -> [Word8]
forall a b. (a -> b) -> [a] -> [b]
map (Qual -> Word8
unQ (Qual -> Word8) -> (BamRec -> Qual) -> BamRec -> Word8
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. BamRec -> Qual
b_mapq) ([BamRec] -> [Word8]) -> [BamRec] -> [Word8]
forall a b. (a -> b) -> a -> b
$ [BamRec] -> [BamRec]
good [BamRec]
brs
                        , b_cigar :: Vector Cigar
b_cigar = Vector Cigar
cigar'
                        , b_seq :: Vector_Nucs_half Nucleotides
b_seq   = [Nucleotides] -> Vector_Nucs_half Nucleotides
forall (v :: * -> *) a. Vector v a => [a] -> v a
V.fromList ([Nucleotides] -> Vector_Nucs_half Nucleotides)
-> [Nucleotides] -> Vector_Nucs_half Nucleotides
forall a b. (a -> b) -> a -> b
$ ((Nucleotides, Qual) -> Nucleotides)
-> [(Nucleotides, Qual)] -> [Nucleotides]
forall a b. (a -> b) -> [a] -> [b]
map (Nucleotides, Qual) -> Nucleotides
forall a b. (a, b) -> a
fst [(Nucleotides, Qual)]
cons_seq_qual
                        , b_qual :: Maybe (Vector Qual)
b_qual  = Vector Qual -> Maybe (Vector Qual)
forall a. a -> Maybe a
Just (Vector Qual -> Maybe (Vector Qual))
-> Vector Qual -> Maybe (Vector Qual)
forall a b. (a -> b) -> a -> b
$ [Qual] -> Vector Qual
forall (v :: * -> *) a. Vector v a => [a] -> v a
V.fromList ([Qual] -> Vector Qual) -> [Qual] -> Vector Qual
forall a b. (a -> b) -> a -> b
$ ((Nucleotides, Qual) -> Qual) -> [(Nucleotides, Qual)] -> [Qual]
forall a b. (a -> b) -> [a] -> [b]
map (Nucleotides, Qual) -> Qual
forall a b. (a, b) -> b
snd [(Nucleotides, Qual)]
cons_seq_qual
                        , b_qname :: Bytes
b_qname = BamRec -> Bytes
b_qname BamRec
b0 Bytes -> Word8 -> Bytes
`B.snoc` 99
                        , b_virtual_offset :: Int64
b_virtual_offset = 0 }, [BamRec]
brs )        -- many modifications, must keep everything
  where
    !b0 :: BamRec
b0 = (BamRec -> BamRec -> Ordering) -> [BamRec] -> BamRec
forall (t :: * -> *) a.
Foldable t =>
(a -> a -> Ordering) -> t a -> a
minimumBy ((BamRec -> Bytes) -> BamRec -> BamRec -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing BamRec -> Bytes
b_qname) [BamRec]
brs
    !most_fail :: Bool
most_fail = 2 Int -> Int -> Int
forall a. Num a => a -> a -> a
* [BamRec] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length ((BamRec -> Bool) -> [BamRec] -> [BamRec]
forall a. (a -> Bool) -> [a] -> [a]
filter BamRec -> Bool
isFailsQC [BamRec]
brs) Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> [BamRec] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [BamRec]
brs
    !failflag :: Int
failflag | Bool
most_fail = BamRec -> Int
b_flag BamRec
b0 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
flagFailsQC
              | Bool
otherwise = BamRec -> Int
b_flag BamRec
b0 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int -> Int
forall a. Bits a => a -> a
complement Int
flagFailsQC

    rmsq :: t a -> b
rmsq xs :: t a
xs = case ((Double, Int) -> a -> (Double, Int))
-> (Double, Int) -> t a -> (Double, Int)
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' (\(!Double
n,!Int
d) x :: a
x -> (Double
n Double -> Double -> Double
forall a. Num a => a -> a -> a
+ a -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral a
x Double -> Double -> Double
forall a. Num a => a -> a -> a
* a -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral a
x, Int
d Int -> Int -> Int
forall a. Num a => a -> a -> a
+ 1)) (0,0) t a
xs of
        (!Double
n,!Int
d) -> Double -> b
forall a b. (RealFrac a, Integral b) => a -> b
round (Double -> b) -> Double -> b
forall a b. (a -> b) -> a -> b
$ Double -> Double
forall a. Floating a => a -> a
sqrt (Double -> Double) -> Double -> Double
forall a b. (a -> b) -> a -> b
$ (Double
n::Double) Double -> Double -> Double
forall a. Fractional a => a -> a -> a
/ Int -> Double
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int
d::Int)

    maj :: [t] -> t
maj xs :: [t]
xs = [t] -> t
forall a. [a] -> a
head ([t] -> t) -> ([t] -> [t]) -> [t] -> t
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. ([t] -> [t] -> Ordering) -> [[t]] -> [t]
forall (t :: * -> *) a.
Foldable t =>
(a -> a -> Ordering) -> t a -> a
maximumBy (([t] -> Int) -> [t] -> [t] -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing [t] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length) ([[t]] -> [t]) -> ([t] -> [[t]]) -> [t] -> [t]
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. [t] -> [[t]]
forall a. Eq a => [a] -> [[a]]
group ([t] -> [[t]]) -> ([t] -> [t]) -> [t] -> [[t]]
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. [t] -> [t]
forall a. Ord a => [a] -> [a]
sort ([t] -> t) -> [t] -> t
forall a b. (a -> b) -> a -> b
$ [t]
xs
    nub' :: [[Bytes]] -> [Bytes]
nub' = ([[Bytes]] -> [Bytes]) -> [[[Bytes]]] -> [Bytes]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap [[Bytes]] -> [Bytes]
forall a. [a] -> a
head ([[[Bytes]]] -> [Bytes])
-> ([[Bytes]] -> [[[Bytes]]]) -> [[Bytes]] -> [Bytes]
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. [[Bytes]] -> [[[Bytes]]]
forall a. Eq a => [a] -> [[a]]
group ([[Bytes]] -> [[[Bytes]]])
-> ([[Bytes]] -> [[Bytes]]) -> [[Bytes]] -> [[[Bytes]]]
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. [[Bytes]] -> [[Bytes]]
forall a. Ord a => [a] -> [a]
sort

    -- majority vote on the cigar lines, then filter
    !cigar' :: Vector Cigar
cigar' = [Vector Cigar] -> Vector Cigar
forall t. Ord t => [t] -> t
maj ([Vector Cigar] -> Vector Cigar) -> [Vector Cigar] -> Vector Cigar
forall a b. (a -> b) -> a -> b
$ (BamRec -> Vector Cigar) -> [BamRec] -> [Vector Cigar]
forall a b. (a -> b) -> [a] -> [b]
map BamRec -> Vector Cigar
b_cigar [BamRec]
brs
    good :: [BamRec] -> [BamRec]
good = (BamRec -> Bool) -> [BamRec] -> [BamRec]
forall a. (a -> Bool) -> [a] -> [a]
filter (Vector Cigar -> Vector Cigar -> Bool
forall a. Eq a => a -> a -> Bool
(==) Vector Cigar
cigar' (Vector Cigar -> Bool)
-> (BamRec -> Vector Cigar) -> BamRec -> Bool
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. BamRec -> Vector Cigar
b_cigar)

    cons_seq_qual :: [(Nucleotides, Qual)]
cons_seq_qual = [ Qual -> [(Nucleotides, Qual)] -> (Nucleotides, Qual)
consensus Qual
maxq [ (Vector_Nucs_half Nucleotides -> Int -> Nucleotides
forall (v :: * -> *) a. Vector v a => v a -> Int -> a
V.unsafeIndex (BamRec -> Vector_Nucs_half Nucleotides
b_seq BamRec
b) Int
i, Qual
q)
                                     | BamRec
b <- [BamRec] -> [BamRec]
good [BamRec]
brs
                                     , let q :: Qual
q = Qual -> (Vector Qual -> Qual) -> Maybe (Vector Qual) -> Qual
forall b a. b -> (a -> b) -> Maybe a -> b
maybe (Word8 -> Qual
Q 23) (Vector Qual -> Int -> Qual
forall (v :: * -> *) a. Vector v a => v a -> Int -> a
V.! Int
i) (BamRec -> Maybe (Vector Qual)
b_qual BamRec
b) ]
                    | Int
i <- [0 .. Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
- 1] ]
        where !len :: Int
len = Vector_Nucs_half Nucleotides -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
V.length (Vector_Nucs_half Nucleotides -> Int)
-> ([BamRec] -> Vector_Nucs_half Nucleotides) -> [BamRec] -> Int
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. BamRec -> Vector_Nucs_half Nucleotides
b_seq (BamRec -> Vector_Nucs_half Nucleotides)
-> ([BamRec] -> BamRec) -> [BamRec] -> Vector_Nucs_half Nucleotides
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. [BamRec] -> BamRec
forall a. [a] -> a
head ([BamRec] -> Int) -> [BamRec] -> Int
forall a b. (a -> b) -> a -> b
$ [BamRec] -> [BamRec]
good [BamRec]
brs

    (md' :: [MdOp]
md',ws :: Maybe BrokenMD
ws) = case [ (BamRec -> Vector_Nucs_half Nucleotides
b_seq BamRec
b,[MdOp]
md,BamRec
b) | BamRec
b <- [BamRec] -> [BamRec]
good [BamRec]
brs, Just md :: [MdOp]
md <- [ BamRec -> Maybe [MdOp]
getMd BamRec
b ] ] of
                [               ] -> ([],Maybe BrokenMD
forall a. Maybe a
Nothing)
                (seq1 :: Vector_Nucs_half Nucleotides
seq1, md1 :: [MdOp]
md1,b :: BamRec
b) : _ -> case [MdOp]
-> [Cigar]
-> [MdOp]
-> [Nucleotides]
-> [Nucleotides]
-> Either MdFail [MdOp]
mk_new_md' [] (Vector Cigar -> [Cigar]
forall (v :: * -> *) a. Vector v a => v a -> [a]
V.toList Vector Cigar
cigar') [MdOp]
md1 (Vector_Nucs_half Nucleotides -> [Nucleotides]
forall (v :: * -> *) a. Vector v a => v a -> [a]
V.toList Vector_Nucs_half Nucleotides
seq1) (((Nucleotides, Qual) -> Nucleotides)
-> [(Nucleotides, Qual)] -> [Nucleotides]
forall a b. (a -> b) -> [a] -> [b]
map (Nucleotides, Qual) -> Nucleotides
forall a b. (a, b) -> a
fst [(Nucleotides, Qual)]
cons_seq_qual) of
                    Right x :: [MdOp]
x -> ([MdOp]
x,Maybe BrokenMD
forall a. Maybe a
Nothing)
                    Left (MdFail cigs :: [Cigar]
cigs ms :: [MdOp]
ms osq :: [Nucleotides]
osq nsq :: [Nucleotides]
nsq) -> ([],BrokenMD -> Maybe BrokenMD
forall a. a -> Maybe a
Just BrokenMD
e)
                        where e :: BrokenMD
e = Bytes
-> Refseq
-> Int
-> [Cigar]
-> [MdOp]
-> [Nucleotides]
-> [Nucleotides]
-> BrokenMD
BrokenMD (BamRec -> Bytes
b_qname BamRec
b) (BamRec -> Refseq
b_rname BamRec
b) (BamRec -> Int
b_pos BamRec
b) [Cigar]
cigs [MdOp]
ms [Nucleotides]
osq [Nucleotides]
nsq

    nm' :: Int
nm' = [Int] -> Int
forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a
sum ([Int] -> Int) -> [Int] -> Int
forall a b. (a -> b) -> a -> b
$ [ Int
n | Ins :* n :: Int
n <- Vector Cigar -> [Cigar]
forall a. Storable a => Vector a -> [a]
W.toList Vector Cigar
cigar' ] [Int] -> [Int] -> [Int]
forall a. [a] -> [a] -> [a]
++ [ Int
n | Del :* n :: Int
n <- Vector Cigar -> [Cigar]
forall a. Storable a => Vector a -> [a]
W.toList Vector Cigar
cigar' ] [Int] -> [Int] -> [Int]
forall a. [a] -> [a] -> [a]
++ [ 1 | MdRep _ <- [MdOp]
md' ]
    xa' :: [Bytes]
xa' = [[Bytes]] -> [Bytes]
nub' [ Char -> Bytes -> [Bytes]
T.split ';' Bytes
xas | Just (Text xas :: Bytes
xas) <- (BamRec -> Maybe Ext) -> [BamRec] -> [Maybe Ext]
forall a b. (a -> b) -> [a] -> [b]
map (BamKey -> Extensions -> Maybe Ext
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup "XA" (Extensions -> Maybe Ext)
-> (BamRec -> Extensions) -> BamRec -> Maybe Ext
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. BamRec -> Extensions
b_exts) [BamRec]
brs ]

    modify_extensions :: Extensions -> Extensions
modify_extensions es :: Extensions
es = ((Extensions -> Extensions) -> Extensions -> Extensions)
-> Extensions -> [Extensions -> Extensions] -> Extensions
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (Extensions -> Extensions) -> Extensions -> Extensions
forall a b. (a -> b) -> a -> b
($!) Extensions
es ([Extensions -> Extensions] -> Extensions)
-> [Extensions -> Extensions] -> Extensions
forall a b. (a -> b) -> a -> b
$
        [ let vs :: [Ext]
vs = (BamRec -> Maybe Ext) -> [BamRec] -> [Ext]
forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe (BamKey -> Extensions -> Maybe Ext
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup BamKey
k (Extensions -> Maybe Ext)
-> (BamRec -> Extensions) -> BamRec -> Maybe Ext
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. BamRec -> Extensions
b_exts) [BamRec]
brs
          in if [Ext] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null [Ext]
vs then Extensions -> Extensions
forall k (cat :: k -> k -> *) (a :: k). Category cat => cat a a
id else BamKey -> Ext -> Extensions -> Extensions
updateE BamKey
k (Ext -> Extensions -> Extensions)
-> Ext -> Extensions -> Extensions
forall a b. (a -> b) -> a -> b
$! [Ext] -> Ext
forall t. Ord t => [t] -> t
maj [Ext]
vs | BamKey
k <- [BamKey]
do_maj ] [Extensions -> Extensions]
-> [Extensions -> Extensions] -> [Extensions -> Extensions]
forall a. [a] -> [a] -> [a]
++
        [ let vs :: [Int]
vs = [ Int
v | Just (Int v :: Int
v) <- (BamRec -> Maybe Ext) -> [BamRec] -> [Maybe Ext]
forall a b. (a -> b) -> [a] -> [b]
map (BamKey -> Extensions -> Maybe Ext
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup BamKey
k (Extensions -> Maybe Ext)
-> (BamRec -> Extensions) -> BamRec -> Maybe Ext
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. BamRec -> Extensions
b_exts) [BamRec]
brs ]
          in if [Int] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null [Int]
vs then Extensions -> Extensions
forall k (cat :: k -> k -> *) (a :: k). Category cat => cat a a
id else BamKey -> Ext -> Extensions -> Extensions
updateE BamKey
k (Ext -> Extensions -> Extensions)
-> Ext -> Extensions -> Extensions
forall a b. (a -> b) -> a -> b
$! Int -> Ext
Int ([Int] -> Int
forall (t :: * -> *) a b.
(Foldable t, Integral a, Integral b) =>
t a -> b
rmsq [Int]
vs) | BamKey
k <- [BamKey]
do_rmsq ] [Extensions -> Extensions]
-> [Extensions -> Extensions] -> [Extensions -> Extensions]
forall a. [a] -> [a] -> [a]
++
        (BamKey -> Extensions -> Extensions)
-> [BamKey] -> [Extensions -> Extensions]
forall a b. (a -> b) -> [a] -> [b]
map BamKey -> Extensions -> Extensions
deleteE [BamKey]
useless [Extensions -> Extensions]
-> [Extensions -> Extensions] -> [Extensions -> Extensions]
forall a. [a] -> [a] -> [a]
++
        [ BamKey -> Ext -> Extensions -> Extensions
updateE "NM" (Ext -> Extensions -> Extensions)
-> Ext -> Extensions -> Extensions
forall a b. (a -> b) -> a -> b
$! Int -> Ext
Int Int
nm'
        , BamKey -> Ext -> Extensions -> Extensions
updateE "XP" (Ext -> Extensions -> Extensions)
-> Ext -> Extensions -> Extensions
forall a b. (a -> b) -> a -> b
$! Int -> Ext
Int ((Int -> BamRec -> Int) -> Int -> [BamRec] -> Int
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' (\a :: Int
a b :: BamRec
b -> Int
a Int -> Int -> Int
`oplus` Int -> BamKey -> BamRec -> Int
extAsInt 1 "XP" BamRec
b) 0 [BamRec]
brs)
        , if [Bytes] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null [Bytes]
xa' then Extensions -> Extensions
forall k (cat :: k -> k -> *) (a :: k). Category cat => cat a a
id else BamKey -> Ext -> Extensions -> Extensions
updateE "XA" (Ext -> Extensions -> Extensions)
-> Ext -> Extensions -> Extensions
forall a b. (a -> b) -> a -> b
$! (Bytes -> Ext
Text (Bytes -> Ext) -> Bytes -> Ext
forall a b. (a -> b) -> a -> b
$ Bytes -> [Bytes] -> Bytes
T.intercalate (Char -> Bytes
T.singleton ';') [Bytes]
xa')
        , if [MdOp] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null [MdOp]
md' then Extensions -> Extensions
forall k (cat :: k -> k -> *) (a :: k). Category cat => cat a a
id else BamKey -> Ext -> Extensions -> Extensions
updateE "MD" (Ext -> Extensions -> Extensions)
-> Ext -> Extensions -> Extensions
forall a b. (a -> b) -> a -> b
$! (Bytes -> Ext
Text (Bytes -> Ext) -> Bytes -> Ext
forall a b. (a -> b) -> a -> b
$ [MdOp] -> Bytes
showMd [MdOp]
md') ]

    useless :: [BamKey]
useless = (String -> BamKey) -> [String] -> [BamKey]
forall a b. (a -> b) -> [a] -> [b]
map String -> BamKey
forall a. IsString a => String -> a
fromString ([String] -> [BamKey]) -> [String] -> [BamKey]
forall a b. (a -> b) -> a -> b
$ String -> [String]
words "BQ CM FZ Q2 R2 XM XO XG YQ EN CQ CS E2 FS OQ OP OC U2 H0 H1 H2 HI NH IH ZQ"
    do_rmsq :: [BamKey]
do_rmsq = (String -> BamKey) -> [String] -> [BamKey]
forall a b. (a -> b) -> [a] -> [b]
map String -> BamKey
forall a. IsString a => String -> a
fromString ([String] -> [BamKey]) -> [String] -> [BamKey]
forall a b. (a -> b) -> a -> b
$ String -> [String]
words "AM AS MQ PQ SM UQ"
    do_maj :: [BamKey]
do_maj  = (String -> BamKey) -> [String] -> [BamKey]
forall a b. (a -> b) -> [a] -> [b]
map String -> BamKey
forall a. IsString a => String -> a
fromString ([String] -> [BamKey]) -> [String] -> [BamKey]
forall a b. (a -> b) -> a -> b
$ String -> [String]
words "X0 X1 XT XS XF XE BC LB RG XI XJ YI YJ"


data BrokenMD = BrokenMD Bytes Refseq Int [Cigar] [MdOp] [Nucleotides] [Nucleotides] deriving (Typeable, Int -> BrokenMD -> ShowS
[BrokenMD] -> ShowS
BrokenMD -> String
(Int -> BrokenMD -> ShowS)
-> (BrokenMD -> String) -> ([BrokenMD] -> ShowS) -> Show BrokenMD
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [BrokenMD] -> ShowS
$cshowList :: [BrokenMD] -> ShowS
show :: BrokenMD -> String
$cshow :: BrokenMD -> String
showsPrec :: Int -> BrokenMD -> ShowS
$cshowsPrec :: Int -> BrokenMD -> ShowS
Show)
instance Exception BrokenMD where
    displayException :: BrokenMD -> String
displayException (BrokenMD qname :: Bytes
qname rname :: Refseq
rname pos :: Int
pos cigs :: [Cigar]
cigs ms :: [MdOp]
ms osq :: [Nucleotides]
osq nsq :: [Nucleotides]
nsq)
        = [String] -> String
unlines [ "Broken MD field when trying to construct new MD!"
                  , "QNAME: " String -> ShowS
forall a. [a] -> [a] -> [a]
++ Bytes -> String
forall a. Show a => a -> String
show Bytes
qname
                  , "POS:   " String -> ShowS
forall a. [a] -> [a] -> [a]
++ Word32 -> ShowS
forall a. Show a => a -> ShowS
shows (Refseq -> Word32
unRefseq Refseq
rname) ":" String -> ShowS
forall a. [a] -> [a] -> [a]
++ Int -> String
forall a. Show a => a -> String
show Int
pos
                  , "CIGAR: " String -> ShowS
forall a. [a] -> [a] -> [a]
++ [Cigar] -> String
forall a. Show a => a -> String
show [Cigar]
cigs
                  , "MD:    " String -> ShowS
forall a. [a] -> [a] -> [a]
++ [MdOp] -> String
forall a. Show a => a -> String
show [MdOp]
ms
                  , "refseq:  " String -> ShowS
forall a. [a] -> [a] -> [a]
++ [Nucleotides] -> String
forall a. Show a => a -> String
show [Nucleotides]
osq
                  , "readseq: " String -> ShowS
forall a. [a] -> [a] -> [a]
++ [Nucleotides] -> String
forall a. Show a => a -> String
show [Nucleotides]
nsq ]


minViewBy :: (a -> a -> Ordering) -> a -> [a] -> (a,[a])
minViewBy :: (a -> a -> Ordering) -> a -> [a] -> (a, [a])
minViewBy cmp :: a -> a -> Ordering
cmp x :: a
x xs :: [a]
xs = a -> [a] -> [a] -> (a, [a])
go a
x [] [a]
xs
  where
    go :: a -> [a] -> [a] -> (a, [a])
go m :: a
m acc :: [a]
acc [    ] = (a
m,[a]
acc)
    go m :: a
m acc :: [a]
acc (a :: a
a:as :: [a]
as) = case a
m a -> a -> Ordering
`cmp` a
a of GT -> a -> [a] -> [a] -> (a, [a])
go a
a (a
ma -> [a] -> [a]
forall a. a -> [a] -> [a]
:[a]
acc) [a]
as
                                        _  -> a -> [a] -> [a] -> (a, [a])
go a
m (a
aa -> [a] -> [a]
forall a. a -> [a] -> [a]
:[a]
acc) [a]
as

data MdFail = MdFail [Cigar] [MdOp] [Nucleotides] [Nucleotides]

mk_new_md' :: [MdOp] -> [Cigar] -> [MdOp] -> [Nucleotides] -> [Nucleotides] -> Either MdFail [MdOp]
mk_new_md' :: [MdOp]
-> [Cigar]
-> [MdOp]
-> [Nucleotides]
-> [Nucleotides]
-> Either MdFail [MdOp]
mk_new_md' acc :: [MdOp]
acc [] [] [] [] = [MdOp] -> Either MdFail [MdOp]
forall a b. b -> Either a b
Right ([MdOp] -> Either MdFail [MdOp]) -> [MdOp] -> Either MdFail [MdOp]
forall a b. (a -> b) -> a -> b
$ [MdOp] -> [MdOp] -> [MdOp]
normalize [] [MdOp]
acc
    where
        normalize :: [MdOp] -> [MdOp] -> [MdOp]
normalize          a :: [MdOp]
a  (MdNum  0:os :: [MdOp]
os) = [MdOp] -> [MdOp] -> [MdOp]
normalize               [MdOp]
a  [MdOp]
os
        normalize (MdNum n :: Int
n:a :: [MdOp]
a) (MdNum  m :: Int
m:os :: [MdOp]
os) = [MdOp] -> [MdOp] -> [MdOp]
normalize (Int -> MdOp
MdNum  (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
m)MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
:[MdOp]
a) [MdOp]
os
        normalize          a :: [MdOp]
a  (MdDel []:os :: [MdOp]
os) = [MdOp] -> [MdOp] -> [MdOp]
normalize               [MdOp]
a  [MdOp]
os
        normalize (MdDel u :: [Nucleotides]
u:a :: [MdOp]
a) (MdDel  v :: [Nucleotides]
v:os :: [MdOp]
os) = [MdOp] -> [MdOp] -> [MdOp]
normalize ([Nucleotides] -> MdOp
MdDel ([Nucleotides]
v[Nucleotides] -> [Nucleotides] -> [Nucleotides]
forall a. [a] -> [a] -> [a]
++[Nucleotides]
u)MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
:[MdOp]
a) [MdOp]
os
        normalize          a :: [MdOp]
a  (       o :: MdOp
o:os :: [MdOp]
os) = [MdOp] -> [MdOp] -> [MdOp]
normalize            (MdOp
oMdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
:[MdOp]
a) [MdOp]
os
        normalize          a :: [MdOp]
a  [           ] = [MdOp]
a

mk_new_md' acc :: [MdOp]
acc ( _ :* 0 : cigs :: [Cigar]
cigs) mds :: [MdOp]
mds  osq :: [Nucleotides]
osq nsq :: [Nucleotides]
nsq = [MdOp]
-> [Cigar]
-> [MdOp]
-> [Nucleotides]
-> [Nucleotides]
-> Either MdFail [MdOp]
mk_new_md' [MdOp]
acc [Cigar]
cigs [MdOp]
mds [Nucleotides]
osq [Nucleotides]
nsq
mk_new_md' acc :: [MdOp]
acc cigs :: [Cigar]
cigs (MdNum  0 : mds :: [MdOp]
mds) osq :: [Nucleotides]
osq nsq :: [Nucleotides]
nsq = [MdOp]
-> [Cigar]
-> [MdOp]
-> [Nucleotides]
-> [Nucleotides]
-> Either MdFail [MdOp]
mk_new_md' [MdOp]
acc [Cigar]
cigs [MdOp]
mds [Nucleotides]
osq [Nucleotides]
nsq
mk_new_md' acc :: [MdOp]
acc cigs :: [Cigar]
cigs (MdDel [] : mds :: [MdOp]
mds) osq :: [Nucleotides]
osq nsq :: [Nucleotides]
nsq = [MdOp]
-> [Cigar]
-> [MdOp]
-> [Nucleotides]
-> [Nucleotides]
-> Either MdFail [MdOp]
mk_new_md' [MdOp]
acc [Cigar]
cigs [MdOp]
mds [Nucleotides]
osq [Nucleotides]
nsq

mk_new_md' acc :: [MdOp]
acc (Mat :* u :: Int
u : cigs :: [Cigar]
cigs) (MdRep b :: Nucleotides
b : mds :: [MdOp]
mds) (_:osq :: [Nucleotides]
osq) (n :: Nucleotides
n:nsq :: [Nucleotides]
nsq)
    | Nucleotides
b Nucleotides -> Nucleotides -> Bool
forall a. Eq a => a -> a -> Bool
== Nucleotides
n    = [MdOp]
-> [Cigar]
-> [MdOp]
-> [Nucleotides]
-> [Nucleotides]
-> Either MdFail [MdOp]
mk_new_md' (Int -> MdOp
MdNum 1 MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
: [MdOp]
acc) (CigOp
Mat CigOp -> Int -> Cigar
:* (Int
uInt -> Int -> Int
forall a. Num a => a -> a -> a
-1)Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
:[Cigar]
cigs) [MdOp]
mds [Nucleotides]
osq [Nucleotides]
nsq
    | Bool
otherwise = [MdOp]
-> [Cigar]
-> [MdOp]
-> [Nucleotides]
-> [Nucleotides]
-> Either MdFail [MdOp]
mk_new_md' (Nucleotides -> MdOp
MdRep Nucleotides
b MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
: [MdOp]
acc) (CigOp
Mat CigOp -> Int -> Cigar
:* (Int
uInt -> Int -> Int
forall a. Num a => a -> a -> a
-1)Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
:[Cigar]
cigs) [MdOp]
mds [Nucleotides]
osq [Nucleotides]
nsq

mk_new_md' acc :: [MdOp]
acc (Mat :* u :: Int
u : cigs :: [Cigar]
cigs) (MdNum v :: Int
v : mds :: [MdOp]
mds) (o :: Nucleotides
o:osq :: [Nucleotides]
osq) (n :: Nucleotides
n:nsq :: [Nucleotides]
nsq)
    | Nucleotides
o Nucleotides -> Nucleotides -> Bool
forall a. Eq a => a -> a -> Bool
== Nucleotides
n    = [MdOp]
-> [Cigar]
-> [MdOp]
-> [Nucleotides]
-> [Nucleotides]
-> Either MdFail [MdOp]
mk_new_md' (Int -> MdOp
MdNum 1 MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
: [MdOp]
acc) (CigOp
Mat CigOp -> Int -> Cigar
:* (Int
uInt -> Int -> Int
forall a. Num a => a -> a -> a
-1)Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
:[Cigar]
cigs) (Int -> MdOp
MdNum (Int
vInt -> Int -> Int
forall a. Num a => a -> a -> a
-1) MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
: [MdOp]
mds) [Nucleotides]
osq [Nucleotides]
nsq
    | Bool
otherwise = [MdOp]
-> [Cigar]
-> [MdOp]
-> [Nucleotides]
-> [Nucleotides]
-> Either MdFail [MdOp]
mk_new_md' (Nucleotides -> MdOp
MdRep Nucleotides
o MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
: [MdOp]
acc) (CigOp
Mat CigOp -> Int -> Cigar
:* (Int
uInt -> Int -> Int
forall a. Num a => a -> a -> a
-1)Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
:[Cigar]
cigs) (Int -> MdOp
MdNum (Int
vInt -> Int -> Int
forall a. Num a => a -> a -> a
-1) MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
: [MdOp]
mds) [Nucleotides]
osq [Nucleotides]
nsq

mk_new_md' acc :: [MdOp]
acc (Del :* n :: Int
n : cigs :: [Cigar]
cigs) (MdDel bs :: [Nucleotides]
bs : mds :: [MdOp]
mds) osq :: [Nucleotides]
osq nsq :: [Nucleotides]
nsq | Int
n Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== [Nucleotides] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Nucleotides]
bs = [MdOp]
-> [Cigar]
-> [MdOp]
-> [Nucleotides]
-> [Nucleotides]
-> Either MdFail [MdOp]
mk_new_md' ([Nucleotides] -> MdOp
MdDel [Nucleotides]
bs MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
: [MdOp]
acc)         [Cigar]
cigs               [MdOp]
mds  [Nucleotides]
osq [Nucleotides]
nsq
mk_new_md' acc :: [MdOp]
acc (Del :* n :: Int
n : cigs :: [Cigar]
cigs) (MdDel (b :: Nucleotides
b:bs :: [Nucleotides]
bs) : mds :: [MdOp]
mds) osq :: [Nucleotides]
osq nsq :: [Nucleotides]
nsq = [MdOp]
-> [Cigar]
-> [MdOp]
-> [Nucleotides]
-> [Nucleotides]
-> Either MdFail [MdOp]
mk_new_md' ([Nucleotides] -> MdOp
MdDel     [Nucleotides
b] MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
: [MdOp]
acc) (CigOp
Del CigOp -> Int -> Cigar
:* (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-1) Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
: [Cigar]
cigs) ([Nucleotides] -> MdOp
MdDel    [Nucleotides]
bsMdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
:[MdOp]
mds) [Nucleotides]
osq [Nucleotides]
nsq
mk_new_md' acc :: [MdOp]
acc (Del :* n :: Int
n : cigs :: [Cigar]
cigs) (MdRep   b :: Nucleotides
b    : mds :: [MdOp]
mds) osq :: [Nucleotides]
osq nsq :: [Nucleotides]
nsq = [MdOp]
-> [Cigar]
-> [MdOp]
-> [Nucleotides]
-> [Nucleotides]
-> Either MdFail [MdOp]
mk_new_md' ([Nucleotides] -> MdOp
MdDel     [Nucleotides
b] MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
: [MdOp]
acc) (CigOp
Del CigOp -> Int -> Cigar
:* (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-1) Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
: [Cigar]
cigs)              [MdOp]
mds  [Nucleotides]
osq [Nucleotides]
nsq
mk_new_md' acc :: [MdOp]
acc (Del :* n :: Int
n : cigs :: [Cigar]
cigs) (MdNum   m :: Int
m    : mds :: [MdOp]
mds) osq :: [Nucleotides]
osq nsq :: [Nucleotides]
nsq = [MdOp]
-> [Cigar]
-> [MdOp]
-> [Nucleotides]
-> [Nucleotides]
-> Either MdFail [MdOp]
mk_new_md' ([Nucleotides] -> MdOp
MdDel [Nucleotides
nucsN] MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
: [MdOp]
acc) (CigOp
Del CigOp -> Int -> Cigar
:* (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-1) Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
: [Cigar]
cigs) (Int -> MdOp
MdNum (Int
mInt -> Int -> Int
forall a. Num a => a -> a -> a
-1)MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
:[MdOp]
mds) [Nucleotides]
osq [Nucleotides]
nsq

mk_new_md' acc :: [MdOp]
acc (Ins :* n :: Int
n : cigs :: [Cigar]
cigs) md :: [MdOp]
md osq :: [Nucleotides]
osq nsq :: [Nucleotides]
nsq = [MdOp]
-> [Cigar]
-> [MdOp]
-> [Nucleotides]
-> [Nucleotides]
-> Either MdFail [MdOp]
mk_new_md' [MdOp]
acc [Cigar]
cigs [MdOp]
md (Int -> [Nucleotides] -> [Nucleotides]
forall a. Int -> [a] -> [a]
drop Int
n [Nucleotides]
osq) (Int -> [Nucleotides] -> [Nucleotides]
forall a. Int -> [a] -> [a]
drop Int
n [Nucleotides]
nsq)
mk_new_md' acc :: [MdOp]
acc (SMa :* n :: Int
n : cigs :: [Cigar]
cigs) md :: [MdOp]
md osq :: [Nucleotides]
osq nsq :: [Nucleotides]
nsq = [MdOp]
-> [Cigar]
-> [MdOp]
-> [Nucleotides]
-> [Nucleotides]
-> Either MdFail [MdOp]
mk_new_md' [MdOp]
acc [Cigar]
cigs [MdOp]
md (Int -> [Nucleotides] -> [Nucleotides]
forall a. Int -> [a] -> [a]
drop Int
n [Nucleotides]
osq) (Int -> [Nucleotides] -> [Nucleotides]
forall a. Int -> [a] -> [a]
drop Int
n [Nucleotides]
nsq)
mk_new_md' acc :: [MdOp]
acc (HMa :* _ : cigs :: [Cigar]
cigs) md :: [MdOp]
md osq :: [Nucleotides]
osq nsq :: [Nucleotides]
nsq = [MdOp]
-> [Cigar]
-> [MdOp]
-> [Nucleotides]
-> [Nucleotides]
-> Either MdFail [MdOp]
mk_new_md' [MdOp]
acc [Cigar]
cigs [MdOp]
md         [Nucleotides]
osq          [Nucleotides]
nsq
mk_new_md' acc :: [MdOp]
acc (Pad :* _ : cigs :: [Cigar]
cigs) md :: [MdOp]
md osq :: [Nucleotides]
osq nsq :: [Nucleotides]
nsq = [MdOp]
-> [Cigar]
-> [MdOp]
-> [Nucleotides]
-> [Nucleotides]
-> Either MdFail [MdOp]
mk_new_md' [MdOp]
acc [Cigar]
cigs [MdOp]
md         [Nucleotides]
osq          [Nucleotides]
nsq
mk_new_md' acc :: [MdOp]
acc (Nop :* _ : cigs :: [Cigar]
cigs) md :: [MdOp]
md osq :: [Nucleotides]
osq nsq :: [Nucleotides]
nsq = [MdOp]
-> [Cigar]
-> [MdOp]
-> [Nucleotides]
-> [Nucleotides]
-> Either MdFail [MdOp]
mk_new_md' [MdOp]
acc [Cigar]
cigs [MdOp]
md         [Nucleotides]
osq          [Nucleotides]
nsq

mk_new_md' _acc :: [MdOp]
_acc cigs :: [Cigar]
cigs ms :: [MdOp]
ms osq :: [Nucleotides]
osq nsq :: [Nucleotides]
nsq = MdFail -> Either MdFail [MdOp]
forall a b. a -> Either a b
Left (MdFail -> Either MdFail [MdOp]) -> MdFail -> Either MdFail [MdOp]
forall a b. (a -> b) -> a -> b
$ [Cigar] -> [MdOp] -> [Nucleotides] -> [Nucleotides] -> MdFail
MdFail [Cigar]
cigs [MdOp]
ms [Nucleotides]
osq [Nucleotides]
nsq

consensus :: Qual -> [ (Nucleotides, Qual) ] -> (Nucleotides, Qual)
consensus :: Qual -> [(Nucleotides, Qual)] -> (Nucleotides, Qual)
consensus (Q maxq :: Word8
maxq) nqs :: [(Nucleotides, Qual)]
nqs =
    let accs :: U.Vector Int
        accs :: Vector Int
accs = (Int -> Int -> Int) -> Vector Int -> [(Int, Int)] -> Vector Int
forall a b.
Unbox a =>
(a -> b -> a) -> Vector a -> [(Int, b)] -> Vector a
U.accum Int -> Int -> Int
forall a. Num a => a -> a -> a
(+) (Int -> Int -> Vector Int
forall a. Unbox a => Int -> a -> Vector a
U.replicate 16 0) [ (Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
n, Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
q) | (Ns n :: Word8
n,Q q :: Word8
q) <- [(Nucleotides, Qual)]
nqs ]
    in case ((Nucleotides, Int) -> (Nucleotides, Int) -> Ordering)
-> [(Nucleotides, Int)] -> [(Nucleotides, Int)]
forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (((Nucleotides, Int) -> (Nucleotides, Int) -> Ordering)
-> (Nucleotides, Int) -> (Nucleotides, Int) -> Ordering
forall a b c. (a -> b -> c) -> b -> a -> c
flip (((Nucleotides, Int) -> (Nucleotides, Int) -> Ordering)
 -> (Nucleotides, Int) -> (Nucleotides, Int) -> Ordering)
-> ((Nucleotides, Int) -> (Nucleotides, Int) -> Ordering)
-> (Nucleotides, Int)
-> (Nucleotides, Int)
-> Ordering
forall a b. (a -> b) -> a -> b
$ ((Nucleotides, Int) -> Int)
-> (Nucleotides, Int) -> (Nucleotides, Int) -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing (Nucleotides, Int) -> Int
forall a b. (a, b) -> b
snd) ([(Nucleotides, Int)] -> [(Nucleotides, Int)])
-> [(Nucleotides, Int)] -> [(Nucleotides, Int)]
forall a b. (a -> b) -> a -> b
$ [Nucleotides] -> [Int] -> [(Nucleotides, Int)]
forall a b. [a] -> [b] -> [(a, b)]
zip [Word8 -> Nucleotides
Ns 0 ..] ([Int] -> [(Nucleotides, Int)]) -> [Int] -> [(Nucleotides, Int)]
forall a b. (a -> b) -> a -> b
$ Vector Int -> [Int]
forall a. Unbox a => Vector a -> [a]
U.toList Vector Int
accs of
        (n0 :: Nucleotides
n0,q0 :: Int
q0) : (_,q1 :: Int
q1) : _ | Word8
qr Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
> 3 -> (Nucleotides
n0, Word8 -> Qual
Q Word8
qr)
            where qr :: Word8
qr = Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Word8) -> Int -> Word8
forall a b. (a -> b) -> a -> b
$ (Int
q0Int -> Int -> Int
forall a. Num a => a -> a -> a
-Int
q1) Int -> Int -> Int
forall a. Ord a => a -> a -> a
`min` Word8 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Word8
maxq
        _                             -> (Nucleotides
nucsN, Word8 -> Qual
Q 0)


-- Cheap version: simply takes the lexically first record, adds XP field
do_cheap_collapse :: Monad m => [BamRec] -> m ( Decision, [BamRec] )
do_cheap_collapse :: [BamRec] -> m (Decision, [BamRec])
do_cheap_collapse [    ] = (Decision, [BamRec]) -> m (Decision, [BamRec])
forall (f :: * -> *) a. Applicative f => a -> f a
pure ( BamRec -> Decision
Representative            BamRec
nullBamRec, [] )
do_cheap_collapse [  b :: BamRec
b ] = (Decision, [BamRec]) -> m (Decision, [BamRec])
forall (f :: * -> *) a. Applicative f => a -> f a
pure ( BamRec -> Decision
Representative                     BamRec
b, [] )
do_cheap_collapse (b :: BamRec
b:bs :: [BamRec]
bs) = (Decision, [BamRec]) -> m (Decision, [BamRec])
forall (f :: * -> *) a. Applicative f => a -> f a
pure ( BamRec -> Decision
Representative (BamRec -> Decision) -> BamRec -> Decision
forall a b. (a -> b) -> a -> b
$ Int -> BamRec -> BamRec
replaceXP Int
new_xp BamRec
b0, [BamRec]
bx )
  where
    (b0 :: BamRec
b0, bx :: [BamRec]
bx) = (BamRec -> BamRec -> Ordering)
-> BamRec -> [BamRec] -> (BamRec, [BamRec])
forall a. (a -> a -> Ordering) -> a -> [a] -> (a, [a])
minViewBy ((BamRec -> Bytes) -> BamRec -> BamRec -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing BamRec -> Bytes
b_qname) BamRec
b [BamRec]
bs
    new_xp :: Int
new_xp   = (Int -> BamRec -> Int) -> Int -> [BamRec] -> Int
forall (t :: * -> *) b a.
Foldable t =>
(b -> a -> b) -> b -> t a -> b
foldl' (\a :: Int
a -> Int -> Int -> Int
oplus Int
a (Int -> Int) -> (BamRec -> Int) -> BamRec -> Int
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Int -> BamKey -> BamRec -> Int
extAsInt 1 "XP") (Int -> BamKey -> BamRec -> Int
extAsInt 1 "XP" BamRec
b) [BamRec]
bs

replaceXP :: Int -> BamRec -> BamRec
replaceXP :: Int -> BamRec -> BamRec
replaceXP new_xp :: Int
new_xp b0 :: BamRec
b0 = BamRec
b0 { b_exts :: Extensions
b_exts = BamKey -> Ext -> Extensions -> Extensions
updateE "XP" (Int -> Ext
Int Int
new_xp) (Extensions -> Extensions) -> Extensions -> Extensions
forall a b. (a -> b) -> a -> b
$ BamRec -> Extensions
b_exts BamRec
b0 }

oplus :: Int -> Int -> Int
_ oplus :: Int -> Int -> Int
`oplus` (-1) = -1
(-1) `oplus` _ = -1
a :: Int
a `oplus` b :: Int
b = Int
a Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
b

-- | Normalize a read's alignment to fall into the canonical region
-- of [0..l].  Takes the name of the reference sequence and its length.
-- Returns @Left x@ if the coordinate decreased so the result is out of
-- order now, @Right x@ if the coordinate is unchanged.
normalizeTo :: Bytes -> Int -> BamRec -> Either BamRec BamRec
normalizeTo :: Bytes -> Int -> BamRec -> Either BamRec BamRec
normalizeTo nm :: Bytes
nm l :: Int
l b :: BamRec
b = BamRec -> Either BamRec BamRec
forall b. b -> Either b b
lr (BamRec -> Either BamRec BamRec) -> BamRec -> Either BamRec BamRec
forall a b. (a -> b) -> a -> b
$ BamRec
b { b_pos :: Int
b_pos  = BamRec -> Int
b_pos BamRec
b Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
l
                            , b_mpos :: Int
b_mpos = BamRec -> Int
b_mpos BamRec
b Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
l
                            , b_mapq :: Qual
b_mapq = if Bool
dups_are_fine then Word8 -> Qual
Q 37 else BamRec -> Qual
b_mapq BamRec
b
                            , b_exts :: Extensions
b_exts = if Bool
dups_are_fine then BamKey -> Extensions -> Extensions
deleteE "XA" (BamRec -> Extensions
b_exts BamRec
b) else BamRec -> Extensions
b_exts BamRec
b }
  where
    lr :: b -> Either b b
lr = if BamRec -> Int
b_pos BamRec
b Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
l then b -> Either b b
forall a b. a -> Either a b
Left else b -> Either b b
forall a b. b -> Either a b
Right
    dups_are_fine :: Bool
dups_are_fine  = Bytes -> Bool
all_match_XA (BamKey -> BamRec -> Bytes
extAsString "XA" BamRec
b)
    all_match_XA :: Bytes -> Bool
all_match_XA s :: Bytes
s = case Char -> Bytes -> [Bytes]
T.split ';' Bytes
s of [xa1 :: Bytes
xa1, xa2 :: Bytes
xa2] | Bytes -> Bool
T.null Bytes
xa2 -> Bytes -> Bool
one_match_XA Bytes
xa1
                                           [xa1 :: Bytes
xa1]                   -> Bytes -> Bool
one_match_XA Bytes
xa1
                                           _                       -> Bool
False
    one_match_XA :: Bytes -> Bool
one_match_XA s :: Bytes
s = case Char -> Bytes -> [Bytes]
T.split ',' Bytes
s of (sq :: Bytes
sq:pos :: Bytes
pos:_) | Bytes
sq Bytes -> Bytes -> Bool
forall a. Eq a => a -> a -> Bool
== Bytes
nm   -> Bytes -> Bool
pos_match_XA Bytes
pos ; _ -> Bool
False
    pos_match_XA :: Bytes -> Bool
pos_match_XA s :: Bytes
s = case Bytes -> Maybe (Int, Bytes)
T.readInt Bytes
s   of Just (p :: Int
p,z :: Bytes
z) | Bytes -> Bool
T.null Bytes
z   -> Int -> Bool
int_match_XA Int
p ;   _ -> Bool
False
    int_match_XA :: Int -> Bool
int_match_XA p :: Int
p | Int
p Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= 0    =  (Int
pInt -> Int -> Int
forall a. Num a => a -> a -> a
-1) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
l Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== BamRec -> Int
b_pos BamRec
b Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
l Bool -> Bool -> Bool
&& Bool -> Bool
not (BamRec -> Bool
isReversed BamRec
b)
                   | Bool
otherwise = (-Int
pInt -> Int -> Int
forall a. Num a => a -> a -> a
-1) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
l Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== BamRec -> Int
b_pos BamRec
b Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
l Bool -> Bool -> Bool
&& BamRec -> Bool
isReversed BamRec
b


-- | Wraps a read to be fully contained in the canonical interval
-- [0..l].  If the read overhangs, it is duplicated and both copies are
-- suitably masked.  A piece with changed coordinate that is now out of
-- order is returned as @Left x@, if the order is fine, it is returned
-- as @Right x@.
wrapTo :: Int -> BamRec -> [Either BamRec BamRec]
wrapTo :: Int -> BamRec -> [Either BamRec BamRec]
wrapTo l :: Int
l b :: BamRec
b = if Bool
overhangs then [Either BamRec BamRec]
do_wrap else [BamRec -> Either BamRec BamRec
forall a b. b -> Either a b
Right BamRec
b]
  where
    overhangs :: Bool
overhangs = Bool -> Bool
not (BamRec -> Bool
isUnmapped BamRec
b) Bool -> Bool -> Bool
&& BamRec -> Int
b_pos BamRec
b Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
l Bool -> Bool -> Bool
&& Int
l Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< BamRec -> Int
b_pos BamRec
b Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Vector Cigar -> Int
forall (v :: * -> *). Vector v Cigar => v Cigar -> Int
alignedLength (BamRec -> Vector Cigar
b_cigar BamRec
b)

    do_wrap :: [Either BamRec BamRec]
do_wrap = case Int -> ECig -> (ECig, ECig)
split_ecig (Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- BamRec -> Int
b_pos BamRec
b) (ECig -> (ECig, ECig)) -> ECig -> (ECig, ECig)
forall a b. (a -> b) -> a -> b
$ Vector Cigar -> [MdOp] -> ECig
toECig (BamRec -> Vector Cigar
b_cigar BamRec
b) ([MdOp] -> Maybe [MdOp] -> [MdOp]
forall a. a -> Maybe a -> a
fromMaybe [] (Maybe [MdOp] -> [MdOp]) -> Maybe [MdOp] -> [MdOp]
forall a b. (a -> b) -> a -> b
$ BamRec -> Maybe [MdOp]
getMd BamRec
b) of
                  (left :: ECig
left,right :: ECig
right) -> [ BamRec -> Either BamRec BamRec
forall a b. b -> Either a b
Right (BamRec -> Either BamRec BamRec) -> BamRec -> Either BamRec BamRec
forall a b. (a -> b) -> a -> b
$ BamRec
b { b_cigar :: Vector Cigar
b_cigar = ECig -> Vector Cigar
toCigar  ECig
left }            BamRec -> ECig -> BamRec
`setMD` ECig
left
                                  , BamRec -> Either BamRec BamRec
forall a b. a -> Either a b
Left  (BamRec -> Either BamRec BamRec) -> BamRec -> Either BamRec BamRec
forall a b. (a -> b) -> a -> b
$ BamRec
b { b_cigar :: Vector Cigar
b_cigar = ECig -> Vector Cigar
toCigar ECig
right, b_pos :: Int
b_pos = 0 } BamRec -> ECig -> BamRec
`setMD` ECig
right ]

-- | Split an 'ECig' into two at some position.  The position is counted
-- in terms of the reference (therefore, deletions count, insertions
-- don't).  The parts that would be skipped if we were splitting lists
-- are replaced by soft masks.
split_ecig :: Int -> ECig -> (ECig, ECig)
split_ecig :: Int -> ECig -> (ECig, ECig)
split_ecig _    WithMD = (ECig
WithMD,       ECig
WithMD)
split_ecig _ WithoutMD = (ECig
WithoutMD, ECig
WithoutMD)
split_ecig 0       ecs :: ECig
ecs = (ECig -> ECig
mask_all ECig
ecs,    ECig
ecs)

split_ecig i :: Int
i (Ins' n :: Int
n ecs :: ECig
ecs) = case Int -> ECig -> (ECig, ECig)
split_ecig Int
i ECig
ecs of (u :: ECig
u,v :: ECig
v) -> (Int -> ECig -> ECig
Ins' Int
n ECig
u, Int -> ECig -> ECig
SMa' Int
n ECig
v)
split_ecig i :: Int
i (SMa' n :: Int
n ecs :: ECig
ecs) = case Int -> ECig -> (ECig, ECig)
split_ecig Int
i ECig
ecs of (u :: ECig
u,v :: ECig
v) -> (Int -> ECig -> ECig
SMa' Int
n ECig
u, Int -> ECig -> ECig
SMa' Int
n ECig
v)
split_ecig i :: Int
i (HMa' n :: Int
n ecs :: ECig
ecs) = case Int -> ECig -> (ECig, ECig)
split_ecig Int
i ECig
ecs of (u :: ECig
u,v :: ECig
v) -> (Int -> ECig -> ECig
HMa' Int
n ECig
u, Int -> ECig -> ECig
HMa' Int
n ECig
v)
split_ecig i :: Int
i (Pad' n :: Int
n ecs :: ECig
ecs) = case Int -> ECig -> (ECig, ECig)
split_ecig Int
i ECig
ecs of (u :: ECig
u,v :: ECig
v) -> (Int -> ECig -> ECig
Pad' Int
n ECig
u,        ECig
v)

split_ecig i :: Int
i (Mat' n :: Int
n ecs :: ECig
ecs)
    | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
n    = case Int -> ECig -> (ECig, ECig)
split_ecig (Int
iInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
n) ECig
ecs of (u :: ECig
u,v :: ECig
v) -> (Int -> ECig -> ECig
Mat' Int
n ECig
u, Int -> ECig -> ECig
SMa' Int
n ECig
v)
    | Bool
otherwise = (Int -> ECig -> ECig
Mat' Int
i (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ Int -> ECig -> ECig
SMa' (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
i) (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ ECig -> ECig
mask_all ECig
ecs, Int -> ECig -> ECig
SMa' Int
i (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ Int -> ECig -> ECig
Mat' (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
i) ECig
ecs)

split_ecig i :: Int
i (Rep' x :: Nucleotides
x ecs :: ECig
ecs) = case Int -> ECig -> (ECig, ECig)
split_ecig (Int
iInt -> Int -> Int
forall a. Num a => a -> a -> a
-1) ECig
ecs of (u :: ECig
u,v :: ECig
v) -> (Nucleotides -> ECig -> ECig
Rep' Nucleotides
x ECig
u, Int -> ECig -> ECig
SMa' 1 ECig
v)
split_ecig i :: Int
i (Del' x :: Nucleotides
x ecs :: ECig
ecs) = case Int -> ECig -> (ECig, ECig)
split_ecig (Int
iInt -> Int -> Int
forall a. Num a => a -> a -> a
-1) ECig
ecs of (u :: ECig
u,v :: ECig
v) -> (Nucleotides -> ECig -> ECig
Del' Nucleotides
x ECig
u,        ECig
v)

split_ecig i :: Int
i (Nop' n :: Int
n ecs :: ECig
ecs)
    | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
n    = case Int -> ECig -> (ECig, ECig)
split_ecig (Int
iInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
n) ECig
ecs of (u :: ECig
u,v :: ECig
v) -> (Int -> ECig -> ECig
Nop' Int
n ECig
u,        ECig
v)
    | Bool
otherwise = (Int -> ECig -> ECig
Nop' Int
i (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ ECig -> ECig
mask_all ECig
ecs, Int -> ECig -> ECig
Nop' (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
i) ECig
ecs)

mask_all :: ECig -> ECig
mask_all :: ECig -> ECig
mask_all      WithMD = ECig
WithMD
mask_all   WithoutMD = ECig
WithoutMD
mask_all (Nop' _ ec :: ECig
ec) =          ECig -> ECig
mask_all ECig
ec
mask_all (HMa' _ ec :: ECig
ec) =          ECig -> ECig
mask_all ECig
ec
mask_all (Pad' _ ec :: ECig
ec) =          ECig -> ECig
mask_all ECig
ec
mask_all (Del' _ ec :: ECig
ec) =          ECig -> ECig
mask_all ECig
ec
mask_all (Rep' _ ec :: ECig
ec) = Int -> ECig -> ECig
SMa' 1 (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ ECig -> ECig
mask_all ECig
ec
mask_all (Mat' n :: Int
n ec :: ECig
ec) = Int -> ECig -> ECig
SMa' Int
n (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ ECig -> ECig
mask_all ECig
ec
mask_all (Ins' n :: Int
n ec :: ECig
ec) = Int -> ECig -> ECig
SMa' Int
n (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ ECig -> ECig
mask_all ECig
ec
mask_all (SMa' n :: Int
n ec :: ECig
ec) = Int -> ECig -> ECig
SMa' Int
n (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ ECig -> ECig
mask_all ECig
ec

-- | Extended CIGAR.  This subsumes both the CIGAR string and the
-- optional MD field.  If we have MD on input, we generate it on output,
-- too.  And in between, we break everything into /very small/
-- operations.

data ECig = WithMD                      -- terminate, do generate MD field
          | WithoutMD                   -- terminate, don't bother with MD
          | Mat' Int ECig
          | Rep' Nucleotides ECig
          | Ins' Int ECig
          | Del' Nucleotides ECig
          | Nop' Int ECig
          | SMa' Int ECig
          | HMa' Int ECig
          | Pad' Int ECig


toECig :: W.Vector Cigar -> [MdOp] -> ECig
toECig :: Vector Cigar -> [MdOp] -> ECig
toECig = [Cigar] -> [MdOp] -> ECig
go ([Cigar] -> [MdOp] -> ECig)
-> (Vector Cigar -> [Cigar]) -> Vector Cigar -> [MdOp] -> ECig
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Vector Cigar -> [Cigar]
forall a. Storable a => Vector a -> [a]
W.toList
  where
    go :: [Cigar] -> [MdOp] -> ECig
go        cs :: [Cigar]
cs  (MdNum  0:mds :: [MdOp]
mds) = [Cigar] -> [MdOp] -> ECig
go [Cigar]
cs [MdOp]
mds
    go        cs :: [Cigar]
cs  (MdDel []:mds :: [MdOp]
mds) = [Cigar] -> [MdOp] -> ECig
go [Cigar]
cs [MdOp]
mds
    go (_:*0 :cs :: [Cigar]
cs)           mds :: [MdOp]
mds  = [Cigar] -> [MdOp] -> ECig
go [Cigar]
cs [MdOp]
mds
    go [        ] [            ] = ECig
WithMD               -- all was fine to the very end
    go [        ]              _ = ECig
WithoutMD            -- here it wasn't fine

    go (Mat :* n :: Int
n : cs :: [Cigar]
cs) (MdRep x :: Nucleotides
x:mds :: [MdOp]
mds)   = Nucleotides -> ECig -> ECig
Rep'   Nucleotides
x   (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ [Cigar] -> [MdOp] -> ECig
go     (CigOp
Mat CigOp -> Int -> Cigar
:* (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-1) Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
: [Cigar]
cs)             [MdOp]
mds
    go (Mat :* n :: Int
n : cs :: [Cigar]
cs) (MdNum m :: Int
m:mds :: [MdOp]
mds)
       | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
m                         = Int -> ECig -> ECig
Mat'   Int
n   (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ [Cigar] -> [MdOp] -> ECig
go                     [Cigar]
cs (Int -> MdOp
MdNum (Int
mInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
n)MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
:[MdOp]
mds)
       | Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
m                         = Int -> ECig -> ECig
Mat'   Int
m   (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ [Cigar] -> [MdOp] -> ECig
go     (CigOp
Mat CigOp -> Int -> Cigar
:* (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
m) Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
: [Cigar]
cs)             [MdOp]
mds
       | Bool
otherwise                     = Int -> ECig -> ECig
Mat'   Int
n   (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ [Cigar] -> [MdOp] -> ECig
go                     [Cigar]
cs              [MdOp]
mds
    go (Mat :* n :: Int
n : cs :: [Cigar]
cs)            _    = Int -> ECig -> ECig
Mat'   Int
n   (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ [Cigar] -> ECig
go'                    [Cigar]
cs

    go (Ins :* n :: Int
n : cs :: [Cigar]
cs)               mds :: [MdOp]
mds  = Int -> ECig -> ECig
Ins'   Int
n   (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ [Cigar] -> [MdOp] -> ECig
go                  [Cigar]
cs              [MdOp]
mds
    go (Del :* n :: Int
n : cs :: [Cigar]
cs) (MdDel (x :: Nucleotides
x:xs :: [Nucleotides]
xs):mds :: [MdOp]
mds) = Nucleotides -> ECig -> ECig
Del'   Nucleotides
x   (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ [Cigar] -> [MdOp] -> ECig
go  (CigOp
Del CigOp -> Int -> Cigar
:* (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-1) Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
: [Cigar]
cs) ([Nucleotides] -> MdOp
MdDel [Nucleotides]
xsMdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
:[MdOp]
mds)
    go (Del :* n :: Int
n : cs :: [Cigar]
cs)                 _  = Nucleotides -> ECig -> ECig
Del' Nucleotides
nucsN (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ [Cigar] -> ECig
go' (CigOp
Del CigOp -> Int -> Cigar
:* (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-1) Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
: [Cigar]
cs)

    go (Nop :* n :: Int
n : cs :: [Cigar]
cs) mds :: [MdOp]
mds = Int -> ECig -> ECig
Nop' Int
n (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ [Cigar] -> [MdOp] -> ECig
go [Cigar]
cs [MdOp]
mds
    go (SMa :* n :: Int
n : cs :: [Cigar]
cs) mds :: [MdOp]
mds = Int -> ECig -> ECig
SMa' Int
n (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ [Cigar] -> [MdOp] -> ECig
go [Cigar]
cs [MdOp]
mds
    go (HMa :* n :: Int
n : cs :: [Cigar]
cs) mds :: [MdOp]
mds = Int -> ECig -> ECig
HMa' Int
n (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ [Cigar] -> [MdOp] -> ECig
go [Cigar]
cs [MdOp]
mds
    go (Pad :* n :: Int
n : cs :: [Cigar]
cs) mds :: [MdOp]
mds = Int -> ECig -> ECig
Pad' Int
n (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ [Cigar] -> [MdOp] -> ECig
go [Cigar]
cs [MdOp]
mds

    -- We jump here once the MD fiels ran out early or was messed up.
    -- We no longer bother with it (this also happens if the MD isn't
    -- present to begin with).
    go' :: [Cigar] -> ECig
go' (_ :* 0 : cs :: [Cigar]
cs)   = [Cigar] -> ECig
go' [Cigar]
cs
    go' [           ]   = ECig
WithoutMD                        -- we didn't have MD or it was broken

    go' (Mat :* n :: Int
n : cs :: [Cigar]
cs) = Int -> ECig -> ECig
Mat'   Int
n   (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ [Cigar] -> ECig
go'                 [Cigar]
cs
    go' (Ins :* n :: Int
n : cs :: [Cigar]
cs) = Int -> ECig -> ECig
Ins'   Int
n   (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ [Cigar] -> ECig
go'                 [Cigar]
cs
    go' (Del :* n :: Int
n : cs :: [Cigar]
cs) = Nucleotides -> ECig -> ECig
Del' Nucleotides
nucsN (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ [Cigar] -> ECig
go' (CigOp
Del CigOp -> Int -> Cigar
:* (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-1) Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
: [Cigar]
cs)

    go' (Nop :* n :: Int
n : cs :: [Cigar]
cs) = Int -> ECig -> ECig
Nop'   Int
n   (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ [Cigar] -> ECig
go' [Cigar]
cs
    go' (SMa :* n :: Int
n : cs :: [Cigar]
cs) = Int -> ECig -> ECig
SMa'   Int
n   (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ [Cigar] -> ECig
go' [Cigar]
cs
    go' (HMa :* n :: Int
n : cs :: [Cigar]
cs) = Int -> ECig -> ECig
HMa'   Int
n   (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ [Cigar] -> ECig
go' [Cigar]
cs
    go' (Pad :* n :: Int
n : cs :: [Cigar]
cs) = Int -> ECig -> ECig
Pad'   Int
n   (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ [Cigar] -> ECig
go' [Cigar]
cs


-- We normalize matches, deletions and soft masks, because these are the
-- operations we generate.  Everything else is either already normalized
-- or nobody really cares anyway.
toCigar :: ECig -> W.Vector Cigar
toCigar :: ECig -> Vector Cigar
toCigar = [Cigar] -> Vector Cigar
forall (v :: * -> *) a. Vector v a => [a] -> v a
V.fromList ([Cigar] -> Vector Cigar)
-> (ECig -> [Cigar]) -> ECig -> Vector Cigar
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. ECig -> [Cigar]
go
  where
    go :: ECig -> [Cigar]
go       WithMD = []
    go    WithoutMD = []

    go (Ins' n :: Int
n ecs :: ECig
ecs) = CigOp
Ins CigOp -> Int -> Cigar
:* Int
n Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
: ECig -> [Cigar]
go ECig
ecs
    go (Nop' n :: Int
n ecs :: ECig
ecs) = CigOp
Nop CigOp -> Int -> Cigar
:* Int
n Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
: ECig -> [Cigar]
go ECig
ecs
    go (HMa' n :: Int
n ecs :: ECig
ecs) = CigOp
HMa CigOp -> Int -> Cigar
:* Int
n Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
: ECig -> [Cigar]
go ECig
ecs
    go (Pad' n :: Int
n ecs :: ECig
ecs) = CigOp
Pad CigOp -> Int -> Cigar
:* Int
n Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
: ECig -> [Cigar]
go ECig
ecs
    go (SMa' n :: Int
n ecs :: ECig
ecs) = Int -> ECig -> [Cigar]
go_sma Int
n ECig
ecs
    go (Mat' n :: Int
n ecs :: ECig
ecs) = Int -> ECig -> [Cigar]
go_mat Int
n ECig
ecs
    go (Rep' _ ecs :: ECig
ecs) = Int -> ECig -> [Cigar]
go_mat 1 ECig
ecs
    go (Del' _ ecs :: ECig
ecs) = Int -> ECig -> [Cigar]
go_del 1 ECig
ecs

    go_sma :: Int -> ECig -> [Cigar]
go_sma !Int
n (SMa' m :: Int
m ecs :: ECig
ecs) = Int -> ECig -> [Cigar]
go_sma (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
m) ECig
ecs
    go_sma !Int
n         ecs :: ECig
ecs  = CigOp
SMa CigOp -> Int -> Cigar
:* Int
n Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
: ECig -> [Cigar]
go ECig
ecs

    go_mat :: Int -> ECig -> [Cigar]
go_mat !Int
n (Mat' m :: Int
m ecs :: ECig
ecs) = Int -> ECig -> [Cigar]
go_mat (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
m) ECig
ecs
    go_mat !Int
n (Rep' _ ecs :: ECig
ecs) = Int -> ECig -> [Cigar]
go_mat (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) ECig
ecs
    go_mat !Int
n         ecs :: ECig
ecs  = CigOp
Mat CigOp -> Int -> Cigar
:* Int
n Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
: ECig -> [Cigar]
go ECig
ecs

    go_del :: Int -> ECig -> [Cigar]
go_del !Int
n (Del' _ ecs :: ECig
ecs) = Int -> ECig -> [Cigar]
go_del (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) ECig
ecs
    go_del !Int
n         ecs :: ECig
ecs  = CigOp
Del CigOp -> Int -> Cigar
:* Int
n Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
: ECig -> [Cigar]
go ECig
ecs



-- | Create an MD field from an extended CIGAR and place it in a record.
-- We build it piecemeal (in 'go'), call out to 'addNum', 'addRep',
-- 'addDel' to make sure the operations are not generated in a
-- degenerate manner, and finally check if we're even supposed to create
-- an MD field.
setMD :: BamRec -> ECig -> BamRec
setMD :: BamRec -> ECig -> BamRec
setMD b :: BamRec
b ec :: ECig
ec = case ECig -> Maybe [MdOp]
go ECig
ec of
    Just md :: [MdOp]
md -> BamRec
b { b_exts :: Extensions
b_exts = BamKey -> Ext -> Extensions -> Extensions
updateE "MD" (Bytes -> Ext
Text (Bytes -> Ext) -> Bytes -> Ext
forall a b. (a -> b) -> a -> b
$ [MdOp] -> Bytes
showMd [MdOp]
md) (BamRec -> Extensions
b_exts BamRec
b) }
    Nothing -> BamRec
b { b_exts :: Extensions
b_exts = BamKey -> Extensions -> Extensions
deleteE "MD"                    (BamRec -> Extensions
b_exts BamRec
b) }
  where
    go :: ECig -> Maybe [MdOp]
go  WithMD      = [MdOp] -> Maybe [MdOp]
forall a. a -> Maybe a
Just []
    go  WithoutMD   = Maybe [MdOp]
forall a. Maybe a
Nothing

    go (Ins' _ ecs :: ECig
ecs) = ECig -> Maybe [MdOp]
go ECig
ecs
    go (Nop' _ ecs :: ECig
ecs) = ECig -> Maybe [MdOp]
go ECig
ecs
    go (SMa' _ ecs :: ECig
ecs) = ECig -> Maybe [MdOp]
go ECig
ecs
    go (HMa' _ ecs :: ECig
ecs) = ECig -> Maybe [MdOp]
go ECig
ecs
    go (Pad' _ ecs :: ECig
ecs) = ECig -> Maybe [MdOp]
go ECig
ecs
    go (Mat' n :: Int
n ecs :: ECig
ecs) = (if Int
n Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
==  0 then Maybe [MdOp] -> Maybe [MdOp]
forall k (cat :: k -> k -> *) (a :: k). Category cat => cat a a
id else ([MdOp] -> [MdOp]) -> Maybe [MdOp] -> Maybe [MdOp]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (Int -> [MdOp] -> [MdOp]
addNum Int
n)) (Maybe [MdOp] -> Maybe [MdOp]) -> Maybe [MdOp] -> Maybe [MdOp]
forall a b. (a -> b) -> a -> b
$ ECig -> Maybe [MdOp]
go ECig
ecs
    go (Rep' x :: Nucleotides
x ecs :: ECig
ecs) = (if Nucleotides -> Bool
isGap Nucleotides
x then Maybe [MdOp] -> Maybe [MdOp]
forall k (cat :: k -> k -> *) (a :: k). Category cat => cat a a
id else ([MdOp] -> [MdOp]) -> Maybe [MdOp] -> Maybe [MdOp]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (Nucleotides -> [MdOp] -> [MdOp]
addRep Nucleotides
x)) (Maybe [MdOp] -> Maybe [MdOp]) -> Maybe [MdOp] -> Maybe [MdOp]
forall a b. (a -> b) -> a -> b
$ ECig -> Maybe [MdOp]
go ECig
ecs
    go (Del' x :: Nucleotides
x ecs :: ECig
ecs) = (if Nucleotides -> Bool
isGap Nucleotides
x then Maybe [MdOp] -> Maybe [MdOp]
forall k (cat :: k -> k -> *) (a :: k). Category cat => cat a a
id else ([MdOp] -> [MdOp]) -> Maybe [MdOp] -> Maybe [MdOp]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (Nucleotides -> [MdOp] -> [MdOp]
addDel Nucleotides
x)) (Maybe [MdOp] -> Maybe [MdOp]) -> Maybe [MdOp] -> Maybe [MdOp]
forall a b. (a -> b) -> a -> b
$ ECig -> Maybe [MdOp]
go ECig
ecs

    addNum :: Int -> [MdOp] -> [MdOp]
addNum n :: Int
n (MdNum m :: Int
m : mds :: [MdOp]
mds) = Int -> MdOp
MdNum (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
m) MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
: [MdOp]
mds
    addNum n :: Int
n            mds :: [MdOp]
mds  = Int -> MdOp
MdNum   Int
n   MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
: [MdOp]
mds

    addRep :: Nucleotides -> [MdOp] -> [MdOp]
addRep x :: Nucleotides
x            mds :: [MdOp]
mds  = Nucleotides -> MdOp
MdRep   Nucleotides
x   MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
: [MdOp]
mds

    addDel :: Nucleotides -> [MdOp] -> [MdOp]
addDel x :: Nucleotides
x (MdDel y :: [Nucleotides]
y : mds :: [MdOp]
mds) = [Nucleotides] -> MdOp
MdDel (Nucleotides
xNucleotides -> [Nucleotides] -> [Nucleotides]
forall a. a -> [a] -> [a]
:[Nucleotides]
y) MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
: [MdOp]
mds
    addDel x :: Nucleotides
x            mds :: [MdOp]
mds  = [Nucleotides] -> MdOp
MdDel  [Nucleotides
x]  MdOp -> [MdOp] -> [MdOp]
forall a. a -> [a] -> [a]
: [MdOp]
mds