-- | Trimming and fusing of reads as found in BAM files.
--
-- This API is remarkably ugly because the core loop is implemented in
-- C.  This requires the adapters to be in storable vectors, and since
-- they shouldn't be constantly copied around, the ugly 'withADSeqs'
-- function is needed.  The performance gain seems to be worth it,
-- though.

module Bio.Bam.Trim
        ( trim_3
        , trim_3'
        , trim_low_quality
        , AD_Seqs
        , withADSeqs
        , default_fwd_adapters
        , default_rev_adapters
        , find_merge
        , mergeBam
        , find_trim
        , trimBam
        , merged_seq
        , merged_qual
        ) where

import Bio.Bam.Header
import Bio.Bam.Rec
import Bio.Bam.Rmdup               ( ECig(..), setMD, toECig )
import Bio.Prelude
import Control.Monad.Trans.Control ( MonadBaseControl, control )
import Foreign.C.Types             ( CInt(..) )
import Foreign.Marshal.Array       ( allocaArray )

import qualified Data.ByteString                        as B
import qualified Data.Vector.Generic                    as V
import qualified Data.Vector.Storable                   as W

-- | Trims from the 3' end of a sequence.
-- @trim_3' p b@ trims the 3' end of the sequence in @b@ at the
-- earliest position such that @p@ evaluates to true on every suffix
-- that was trimmed off.  Note that the 3' end may be the beginning of
-- the sequence if it happens to be stored in reverse-complemented form.
-- Also note that trimming from the 3' end may not make sense for reads
-- that were constructed by merging paired end data (but we cannot take
-- care of that here).  Further note that trimming may break dependent
-- information, notably the "mate" information and many optional fields.
-- Since the intention is to trim based on quality scores, reads without
-- qualities are passed along unchanged.

trim_3' :: ([Nucleotides] -> [Qual] -> Bool) -> BamRec -> BamRec
trim_3' :: ([Nucleotides] -> [Qual] -> Bool) -> BamRec -> BamRec
trim_3' p :: [Nucleotides] -> [Qual] -> Bool
p b :: BamRec
b = case BamRec -> Maybe (Vector Qual)
b_qual BamRec
b of
    Nothing                         ->  BamRec
b
    Just qs :: Vector Qual
qs | BamRec -> Int
b_flag BamRec
b Int -> Int -> Bool
forall a. Bits a => a -> Int -> Bool
`testBit` 4  ->  Int -> BamRec -> BamRec
trim_3 Int
len_rev BamRec
b
            | Bool
otherwise             ->  Int -> BamRec -> BamRec
trim_3 Int
len_fwd BamRec
b
      where
        len_fwd :: Int
len_fwd = Int -> Int -> Int
forall a. Num a => a -> a -> a
subtract 1 (Int -> Int)
-> ([([Nucleotides], [Qual])] -> Int)
-> [([Nucleotides], [Qual])]
-> Int
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. [([Nucleotides], [Qual])] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length ([([Nucleotides], [Qual])] -> Int)
-> ([([Nucleotides], [Qual])] -> [([Nucleotides], [Qual])])
-> [([Nucleotides], [Qual])]
-> Int
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. (([Nucleotides], [Qual]) -> Bool)
-> [([Nucleotides], [Qual])] -> [([Nucleotides], [Qual])]
forall a. (a -> Bool) -> [a] -> [a]
takeWhile (([Nucleotides] -> [Qual] -> Bool)
-> ([Nucleotides], [Qual]) -> Bool
forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry [Nucleotides] -> [Qual] -> Bool
p) ([([Nucleotides], [Qual])] -> Int)
-> [([Nucleotides], [Qual])] -> Int
forall a b. (a -> b) -> a -> b
$
                      [[Nucleotides]] -> [[Qual]] -> [([Nucleotides], [Qual])]
forall a b. [a] -> [b] -> [(a, b)]
zip ([Nucleotides] -> [[Nucleotides]]
forall a. [a] -> [[a]]
inits ([Nucleotides] -> [[Nucleotides]])
-> (Vector_Nucs_half Nucleotides -> [Nucleotides])
-> Vector_Nucs_half Nucleotides
-> [[Nucleotides]]
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. [Nucleotides] -> [Nucleotides]
forall a. [a] -> [a]
reverse ([Nucleotides] -> [Nucleotides])
-> (Vector_Nucs_half Nucleotides -> [Nucleotides])
-> Vector_Nucs_half Nucleotides
-> [Nucleotides]
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Vector_Nucs_half Nucleotides -> [Nucleotides]
forall (v :: * -> *) a. Vector v a => v a -> [a]
V.toList (Vector_Nucs_half Nucleotides -> [[Nucleotides]])
-> Vector_Nucs_half Nucleotides -> [[Nucleotides]]
forall a b. (a -> b) -> a -> b
$ BamRec -> Vector_Nucs_half Nucleotides
b_seq BamRec
b)
                          ([Qual] -> [[Qual]]
forall a. [a] -> [[a]]
inits ([Qual] -> [[Qual]])
-> (Vector Qual -> [Qual]) -> Vector Qual -> [[Qual]]
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. [Qual] -> [Qual]
forall a. [a] -> [a]
reverse ([Qual] -> [Qual])
-> (Vector Qual -> [Qual]) -> Vector Qual -> [Qual]
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Vector Qual -> [Qual]
forall (v :: * -> *) a. Vector v a => v a -> [a]
V.toList (Vector Qual -> [[Qual]]) -> Vector Qual -> [[Qual]]
forall a b. (a -> b) -> a -> b
$ Vector Qual
qs)

        len_rev :: Int
len_rev = Int -> Int -> Int
forall a. Num a => a -> a -> a
subtract 1 (Int -> Int)
-> ([([Nucleotides], [Qual])] -> Int)
-> [([Nucleotides], [Qual])]
-> Int
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. [([Nucleotides], [Qual])] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length ([([Nucleotides], [Qual])] -> Int)
-> ([([Nucleotides], [Qual])] -> [([Nucleotides], [Qual])])
-> [([Nucleotides], [Qual])]
-> Int
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. (([Nucleotides], [Qual]) -> Bool)
-> [([Nucleotides], [Qual])] -> [([Nucleotides], [Qual])]
forall a. (a -> Bool) -> [a] -> [a]
takeWhile (([Nucleotides] -> [Qual] -> Bool)
-> ([Nucleotides], [Qual]) -> Bool
forall a b c. (a -> b -> c) -> (a, b) -> c
uncurry [Nucleotides] -> [Qual] -> Bool
p) ([([Nucleotides], [Qual])] -> Int)
-> [([Nucleotides], [Qual])] -> Int
forall a b. (a -> b) -> a -> b
$
                      [[Nucleotides]] -> [[Qual]] -> [([Nucleotides], [Qual])]
forall a b. [a] -> [b] -> [(a, b)]
zip ([Nucleotides] -> [[Nucleotides]]
forall a. [a] -> [[a]]
inits ([Nucleotides] -> [[Nucleotides]])
-> (Vector_Nucs_half Nucleotides -> [Nucleotides])
-> Vector_Nucs_half Nucleotides
-> [[Nucleotides]]
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Vector_Nucs_half Nucleotides -> [Nucleotides]
forall (v :: * -> *) a. Vector v a => v a -> [a]
V.toList (Vector_Nucs_half Nucleotides -> [[Nucleotides]])
-> Vector_Nucs_half Nucleotides -> [[Nucleotides]]
forall a b. (a -> b) -> a -> b
$ BamRec -> Vector_Nucs_half Nucleotides
b_seq  BamRec
b)
                          ([Qual] -> [[Qual]]
forall a. [a] -> [[a]]
inits ([Qual] -> [[Qual]])
-> (Vector Qual -> [Qual]) -> Vector Qual -> [[Qual]]
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Vector Qual -> [Qual]
forall (v :: * -> *) a. Vector v a => v a -> [a]
V.toList (Vector Qual -> [[Qual]]) -> Vector Qual -> [[Qual]]
forall a b. (a -> b) -> a -> b
$ Vector Qual
qs)


trim_3 :: Int -> BamRec -> BamRec
trim_3 :: Int -> BamRec -> BamRec
trim_3 l :: Int
l b :: BamRec
b | BamRec -> Int
b_flag BamRec
b Int -> Int -> Bool
forall a. Bits a => a -> Int -> Bool
`testBit` 4 = BamRec
trim_rev
           | Bool
otherwise            = BamRec
trim_fwd
  where
    trim_fwd :: BamRec
trim_fwd = let (_, cigar' :: Vector Cigar
cigar') = Vector Cigar -> Int -> (Int, Vector Cigar)
forall (v :: * -> *).
Vector v Cigar =>
v Cigar -> Int -> (Int, v Cigar)
trim_back_cigar (BamRec -> Vector Cigar
b_cigar BamRec
b) Int
l
                   c :: BamRec
c = (ECig -> ECig) -> BamRec -> BamRec
modMd (Int -> ECig -> ECig
takeECig (Vector_Nucs_half Nucleotides -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
V.length (BamRec -> Vector_Nucs_half Nucleotides
b_seq  BamRec
b) Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
l)) BamRec
b
               in BamRec
c { b_seq :: Vector_Nucs_half Nucleotides
b_seq   = Int -> Vector_Nucs_half Nucleotides -> Vector_Nucs_half Nucleotides
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.take (Vector_Nucs_half Nucleotides -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
V.length (BamRec -> Vector_Nucs_half Nucleotides
b_seq BamRec
c) Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
l)  (Vector_Nucs_half Nucleotides -> Vector_Nucs_half Nucleotides)
-> Vector_Nucs_half Nucleotides -> Vector_Nucs_half Nucleotides
forall a b. (a -> b) -> a -> b
$  BamRec -> Vector_Nucs_half Nucleotides
b_seq  BamRec
c
                    , b_qual :: Maybe (Vector Qual)
b_qual  = Int -> Vector Qual -> Vector Qual
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.take (Vector_Nucs_half Nucleotides -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
V.length (BamRec -> Vector_Nucs_half Nucleotides
b_seq BamRec
c) Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
l) (Vector Qual -> Vector Qual)
-> Maybe (Vector Qual) -> Maybe (Vector Qual)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> BamRec -> Maybe (Vector Qual)
b_qual BamRec
c
                    , b_cigar :: Vector Cigar
b_cigar = Vector Cigar
cigar'
                    , b_exts :: Extensions
b_exts  = ((BamKey, Ext) -> (BamKey, Ext)) -> Extensions -> Extensions
forall a b. (a -> b) -> [a] -> [b]
map (\(k :: BamKey
k,e :: Ext
e) -> case Ext
e of
                                        Text t :: Bytes
t | BamKey
k BamKey -> [BamKey] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [BamKey]
trim_set
                                          -> (BamKey
k, Bytes -> Ext
Text (Int -> Bytes -> Bytes
B.take (Bytes -> Int
B.length Bytes
t Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
l) Bytes
t))
                                        _ -> (BamKey
k,Ext
e)
                                    ) (BamRec -> Extensions
b_exts BamRec
c) }

    trim_rev :: BamRec
trim_rev = let (off :: Int
off, cigar' :: Vector Cigar
cigar') = Vector Cigar -> Int -> (Int, Vector Cigar)
forall (v :: * -> *).
Vector v Cigar =>
v Cigar -> Int -> (Int, v Cigar)
trim_fwd_cigar (BamRec -> Vector Cigar
b_cigar BamRec
b) Int
l
                   c :: BamRec
c = (ECig -> ECig) -> BamRec -> BamRec
modMd (Int -> ECig -> ECig
dropECig Int
l) BamRec
b
               in BamRec
c { b_seq :: Vector_Nucs_half Nucleotides
b_seq   = Int -> Vector_Nucs_half Nucleotides -> Vector_Nucs_half Nucleotides
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.drop Int
l  (Vector_Nucs_half Nucleotides -> Vector_Nucs_half Nucleotides)
-> Vector_Nucs_half Nucleotides -> Vector_Nucs_half Nucleotides
forall a b. (a -> b) -> a -> b
$  BamRec -> Vector_Nucs_half Nucleotides
b_seq  BamRec
c
                    , b_qual :: Maybe (Vector Qual)
b_qual  = Int -> Vector Qual -> Vector Qual
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.drop Int
l (Vector Qual -> Vector Qual)
-> Maybe (Vector Qual) -> Maybe (Vector Qual)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> BamRec -> Maybe (Vector Qual)
b_qual BamRec
c
                    , b_pos :: Int
b_pos   = BamRec -> Int
b_pos BamRec
c Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
off
                    , b_cigar :: Vector Cigar
b_cigar = Vector Cigar
cigar'
                    , b_exts :: Extensions
b_exts  = ((BamKey, Ext) -> (BamKey, Ext)) -> Extensions -> Extensions
forall a b. (a -> b) -> [a] -> [b]
map (\(k :: BamKey
k,e :: Ext
e) -> case Ext
e of
                                        Text t :: Bytes
t | BamKey
k BamKey -> [BamKey] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [BamKey]
trim_set
                                          -> (BamKey
k, Bytes -> Ext
Text (Int -> Bytes -> Bytes
B.drop Int
l Bytes
t))
                                        _ -> (BamKey
k,Ext
e)
                                    ) (BamRec -> Extensions
b_exts BamRec
c) }

    trim_set :: [BamKey]
trim_set = ["BQ","CQ","CS","E2","OQ","U2"]

    modMd :: (ECig -> ECig) -> BamRec -> BamRec
    modMd :: (ECig -> ECig) -> BamRec -> BamRec
modMd f :: ECig -> ECig
f br :: BamRec
br = BamRec -> ([MdOp] -> BamRec) -> Maybe [MdOp] -> BamRec
forall b a. b -> (a -> b) -> Maybe a -> b
maybe BamRec
br (BamRec -> ECig -> BamRec
setMD BamRec
br (ECig -> BamRec) -> ([MdOp] -> ECig) -> [MdOp] -> BamRec
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. ECig -> ECig
f (ECig -> ECig) -> ([MdOp] -> ECig) -> [MdOp] -> ECig
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Vector Cigar -> [MdOp] -> ECig
toECig (BamRec -> Vector Cigar
b_cigar BamRec
br)) (BamRec -> Maybe [MdOp]
getMd BamRec
br)

    endOf :: ECig -> ECig
    endOf :: ECig -> ECig
endOf  WithMD     = ECig
WithMD
    endOf  WithoutMD  = ECig
WithoutMD
    endOf (Mat' _ es :: ECig
es) = ECig -> ECig
endOf ECig
es
    endOf (Ins' _ es :: ECig
es) = ECig -> ECig
endOf ECig
es
    endOf (SMa' _ es :: ECig
es) = ECig -> ECig
endOf ECig
es
    endOf (Rep' _ es :: ECig
es) = ECig -> ECig
endOf ECig
es
    endOf (Del' _ es :: ECig
es) = ECig -> ECig
endOf ECig
es
    endOf (Nop' _ es :: ECig
es) = ECig -> ECig
endOf ECig
es
    endOf (HMa' _ es :: ECig
es) = ECig -> ECig
endOf ECig
es
    endOf (Pad' _ es :: ECig
es) = ECig -> ECig
endOf ECig
es

    takeECig :: Int -> ECig -> ECig
    takeECig :: Int -> ECig -> ECig
takeECig 0  es :: ECig
es          = ECig -> ECig
endOf ECig
es
    takeECig _  WithMD      = ECig
WithMD
    takeECig _  WithoutMD   = ECig
WithoutMD
    takeECig n :: Int
n (Mat' m :: Int
m  es :: ECig
es) = Int -> ECig -> ECig
Mat' Int
n  (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ if Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
m then Int -> ECig -> ECig
takeECig (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
m) ECig
es else ECig
WithMD
    takeECig n :: Int
n (Ins' m :: Int
m  es :: ECig
es) = Int -> ECig -> ECig
Ins' Int
n  (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ if Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
m then Int -> ECig -> ECig
takeECig (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
m) ECig
es else ECig
WithMD
    takeECig n :: Int
n (SMa' m :: Int
m  es :: ECig
es) = Int -> ECig -> ECig
SMa' Int
n  (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ if Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
m then Int -> ECig -> ECig
takeECig (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
m) ECig
es else ECig
WithMD
    takeECig n :: Int
n (Rep' ns :: Nucleotides
ns es :: ECig
es) = Nucleotides -> ECig -> ECig
Rep' Nucleotides
ns (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ Int -> ECig -> ECig
takeECig (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-1) ECig
es
    takeECig n :: Int
n (Del' ns :: Nucleotides
ns es :: ECig
es) = Nucleotides -> ECig -> ECig
Del' Nucleotides
ns (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ Int -> ECig -> ECig
takeECig Int
n ECig
es
    takeECig n :: Int
n (Nop' m :: Int
m  es :: ECig
es) = Int -> ECig -> ECig
Nop' Int
m  (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ Int -> ECig -> ECig
takeECig Int
n ECig
es
    takeECig n :: Int
n (HMa' m :: Int
m  es :: ECig
es) = Int -> ECig -> ECig
HMa' Int
m  (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ Int -> ECig -> ECig
takeECig Int
n ECig
es
    takeECig n :: Int
n (Pad' m :: Int
m  es :: ECig
es) = Int -> ECig -> ECig
Pad' Int
m  (ECig -> ECig) -> ECig -> ECig
forall a b. (a -> b) -> a -> b
$ Int -> ECig -> ECig
takeECig Int
n ECig
es

    dropECig :: Int -> ECig -> ECig
    dropECig :: Int -> ECig -> ECig
dropECig 0  es :: ECig
es         = ECig
es
    dropECig _  WithMD     = ECig
WithMD
    dropECig _  WithoutMD  = ECig
WithoutMD
    dropECig n :: Int
n (Mat' m :: Int
m es :: ECig
es) = if Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
m then Int -> ECig -> ECig
dropECig (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
m) ECig
es else Int -> ECig -> ECig
Mat' Int
n ECig
WithMD
    dropECig n :: Int
n (Ins' m :: Int
m es :: ECig
es) = if Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
m then Int -> ECig -> ECig
dropECig (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
m) ECig
es else Int -> ECig -> ECig
Ins' Int
n ECig
WithMD
    dropECig n :: Int
n (SMa' m :: Int
m es :: ECig
es) = if Int
n Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
m then Int -> ECig -> ECig
dropECig (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
m) ECig
es else Int -> ECig -> ECig
SMa' Int
n ECig
WithMD
    dropECig n :: Int
n (Rep' _ es :: ECig
es) = Int -> ECig -> ECig
dropECig (Int
nInt -> Int -> Int
forall a. Num a => a -> a -> a
-1) ECig
es
    dropECig n :: Int
n (Del' _ es :: ECig
es) = Int -> ECig -> ECig
dropECig Int
n ECig
es
    dropECig n :: Int
n (Nop' _ es :: ECig
es) = Int -> ECig -> ECig
dropECig Int
n ECig
es
    dropECig n :: Int
n (HMa' _ es :: ECig
es) = Int -> ECig -> ECig
dropECig Int
n ECig
es
    dropECig n :: Int
n (Pad' _ es :: ECig
es) = Int -> ECig -> ECig
dropECig Int
n ECig
es


trim_back_cigar, trim_fwd_cigar :: V.Vector v Cigar => v Cigar -> Int -> ( Int, v Cigar )
trim_back_cigar :: v Cigar -> Int -> (Int, v Cigar)
trim_back_cigar c :: v Cigar
c l :: Int
l = (Int
o, [Cigar] -> v Cigar
forall (v :: * -> *) a. Vector v a => [a] -> v a
V.fromList ([Cigar] -> v Cigar) -> [Cigar] -> v Cigar
forall a b. (a -> b) -> a -> b
$ [Cigar] -> [Cigar]
forall a. [a] -> [a]
reverse [Cigar]
c') where (o :: Int
o,c' :: [Cigar]
c') = (Int, [Cigar]) -> (Int, [Cigar])
sanitize_cigar ((Int, [Cigar]) -> (Int, [Cigar]))
-> ([Cigar] -> (Int, [Cigar])) -> [Cigar] -> (Int, [Cigar])
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. Int -> [Cigar] -> (Int, [Cigar])
trim_cigar Int
l ([Cigar] -> (Int, [Cigar])) -> [Cigar] -> (Int, [Cigar])
forall a b. (a -> b) -> a -> b
$ [Cigar] -> [Cigar]
forall a. [a] -> [a]
reverse ([Cigar] -> [Cigar]) -> [Cigar] -> [Cigar]
forall a b. (a -> b) -> a -> b
$ v Cigar -> [Cigar]
forall (v :: * -> *) a. Vector v a => v a -> [a]
V.toList v Cigar
c
trim_fwd_cigar :: v Cigar -> Int -> (Int, v Cigar)
trim_fwd_cigar  c :: v Cigar
c l :: Int
l = (Int
o, [Cigar] -> v Cigar
forall (v :: * -> *) a. Vector v a => [a] -> v a
V.fromList           [Cigar]
c') where (o :: Int
o,c' :: [Cigar]
c') = (Int, [Cigar]) -> (Int, [Cigar])
sanitize_cigar ((Int, [Cigar]) -> (Int, [Cigar]))
-> (Int, [Cigar]) -> (Int, [Cigar])
forall a b. (a -> b) -> a -> b
$ Int -> [Cigar] -> (Int, [Cigar])
trim_cigar Int
l ([Cigar] -> (Int, [Cigar])) -> [Cigar] -> (Int, [Cigar])
forall a b. (a -> b) -> a -> b
$ v Cigar -> [Cigar]
forall (v :: * -> *) a. Vector v a => v a -> [a]
V.toList v Cigar
c

sanitize_cigar :: (Int, [Cigar]) -> (Int, [Cigar])
sanitize_cigar :: (Int, [Cigar]) -> (Int, [Cigar])
sanitize_cigar (o :: Int
o, [        ])                          = (Int
o, [])
sanitize_cigar (o :: Int
o, (op :: CigOp
op:*l :: Int
l):xs :: [Cigar]
xs) | CigOp
op CigOp -> CigOp -> Bool
forall a. Eq a => a -> a -> Bool
== CigOp
Pad              = (Int, [Cigar]) -> (Int, [Cigar])
sanitize_cigar (Int
o,[Cigar]
xs)         -- del P
                               | CigOp
op CigOp -> CigOp -> Bool
forall a. Eq a => a -> a -> Bool
== CigOp
Del Bool -> Bool -> Bool
|| CigOp
op CigOp -> CigOp -> Bool
forall a. Eq a => a -> a -> Bool
== CigOp
Nop = (Int, [Cigar]) -> (Int, [Cigar])
sanitize_cigar (Int
o Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
l, [Cigar]
xs)    -- adjust D,N
                               | CigOp
op CigOp -> CigOp -> Bool
forall a. Eq a => a -> a -> Bool
== CigOp
Ins              = (Int
o, (CigOp
SMa CigOp -> Int -> Cigar
:* Int
l)Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
:[Cigar]
xs)            -- I --> S
                               | Bool
otherwise              = (Int
o, (CigOp
op CigOp -> Int -> Cigar
:* Int
l)Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
:[Cigar]
xs)             -- rest is fine

trim_cigar :: Int -> [Cigar] -> (Int, [Cigar])
trim_cigar :: Int -> [Cigar] -> (Int, [Cigar])
trim_cigar 0 cs :: [Cigar]
cs = (0, [Cigar]
cs)
trim_cigar _ [] = (0, [])
trim_cigar l :: Int
l ((op :: CigOp
op:*ll :: Int
ll):cs :: [Cigar]
cs) | CigOp -> Bool
bad_op CigOp
op = let (o :: Int
o,cs' :: [Cigar]
cs') = Int -> [Cigar] -> (Int, [Cigar])
trim_cigar Int
l [Cigar]
cs in (Int
o Int -> Int -> Int
forall a. Num a => a -> a -> a
+ CigOp -> Int -> Int
forall a. Num a => CigOp -> a -> a
reflen CigOp
op Int
ll, [Cigar]
cs')
                           | Bool
otherwise = case Int
l Int -> Int -> Ordering
forall a. Ord a => a -> a -> Ordering
`compare` Int
ll of
    LT -> (CigOp -> Int -> Int
forall a. Num a => CigOp -> a -> a
reflen CigOp
op  Int
l, (CigOp
op CigOp -> Int -> Cigar
:* (Int
llInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
l))Cigar -> [Cigar] -> [Cigar]
forall a. a -> [a] -> [a]
:[Cigar]
cs)
    EQ -> (CigOp -> Int -> Int
forall a. Num a => CigOp -> a -> a
reflen CigOp
op Int
ll,                [Cigar]
cs)
    GT -> let (o :: Int
o,cs' :: [Cigar]
cs') = Int -> [Cigar] -> (Int, [Cigar])
trim_cigar (Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
ll) [Cigar]
cs in (Int
o Int -> Int -> Int
forall a. Num a => a -> a -> a
+ CigOp -> Int -> Int
forall a. Num a => CigOp -> a -> a
reflen CigOp
op Int
ll, [Cigar]
cs')

  where
    reflen :: CigOp -> a -> a
reflen op' :: CigOp
op' = if CigOp -> Bool
ref_op CigOp
op' then a -> a
forall k (cat :: k -> k -> *) (a :: k). Category cat => cat a a
id else a -> a -> a
forall a b. a -> b -> a
const 0
    bad_op :: CigOp -> Bool
bad_op o :: CigOp
o = CigOp
o CigOp -> CigOp -> Bool
forall a. Eq a => a -> a -> Bool
/= CigOp
Mat Bool -> Bool -> Bool
&& CigOp
o CigOp -> CigOp -> Bool
forall a. Eq a => a -> a -> Bool
/= CigOp
Ins Bool -> Bool -> Bool
&& CigOp
o CigOp -> CigOp -> Bool
forall a. Eq a => a -> a -> Bool
/= CigOp
SMa
    ref_op :: CigOp -> Bool
ref_op o :: CigOp
o = CigOp
o CigOp -> CigOp -> Bool
forall a. Eq a => a -> a -> Bool
== CigOp
Mat Bool -> Bool -> Bool
|| CigOp
o CigOp -> CigOp -> Bool
forall a. Eq a => a -> a -> Bool
== CigOp
Del


-- | Trim predicate to get rid of low quality sequence.
-- @trim_low_quality q ns qs@ evaluates to true if all qualities in @qs@
-- are smaller (i.e. worse) than @q@.
trim_low_quality :: Qual -> a -> [Qual] -> Bool
trim_low_quality :: Qual -> a -> [Qual] -> Bool
trim_low_quality q :: Qual
q = ([Qual] -> Bool) -> a -> [Qual] -> Bool
forall a b. a -> b -> a
const (([Qual] -> Bool) -> a -> [Qual] -> Bool)
-> ([Qual] -> Bool) -> a -> [Qual] -> Bool
forall a b. (a -> b) -> a -> b
$ (Qual -> Bool) -> [Qual] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
all (Qual -> Qual -> Bool
forall a. Ord a => a -> a -> Bool
< Qual
q)


-- | Finds the merge point.  Input is list of forward adapters, list of
-- reverse adapters, sequence1, quality1, sequence2, quality2; output is
-- merge point and two qualities (YM, YN).
find_merge :: AD_Seqs -> AD_Seqs
           -> W.Vector Nucleotides -> W.Vector Qual
           -> W.Vector Nucleotides -> W.Vector Qual
           -> IO (Int, Int, Int)
find_merge :: AD_Seqs
-> AD_Seqs
-> Vector Nucleotides
-> Vector Qual
-> Vector Nucleotides
-> Vector Qual
-> IO (Int, Int, Int)
find_merge pads1 :: AD_Seqs
pads1 pads2 :: AD_Seqs
pads2 r1 :: Vector Nucleotides
r1 q1 :: Vector Qual
q1 r2 :: Vector Nucleotides
r2 q2 :: Vector Qual
q2 =
        Vector Nucleotides
-> Vector Qual
-> (FW_Seq -> IO (Int, Int, Int))
-> IO (Int, Int, Int)
forall r.
Vector Nucleotides -> Vector Qual -> (FW_Seq -> IO r) -> IO r
with_fw_seq Vector Nucleotides
r1 Vector Qual
q1 ((FW_Seq -> IO (Int, Int, Int)) -> IO (Int, Int, Int))
-> (FW_Seq -> IO (Int, Int, Int)) -> IO (Int, Int, Int)
forall a b. (a -> b) -> a -> b
$ \pr1 :: FW_Seq
pr1 ->
        Vector Nucleotides
-> Vector Qual
-> (FW_Seq -> IO (Int, Int, Int))
-> IO (Int, Int, Int)
forall r.
Vector Nucleotides -> Vector Qual -> (FW_Seq -> IO r) -> IO r
with_fw_seq Vector Nucleotides
r2 Vector Qual
q2 ((FW_Seq -> IO (Int, Int, Int)) -> IO (Int, Int, Int))
-> (FW_Seq -> IO (Int, Int, Int)) -> IO (Int, Int, Int)
forall a b. (a -> b) -> a -> b
$ \pr2 :: FW_Seq
pr2 ->
        Vector Nucleotides
-> Vector Qual
-> (RC_Seq -> IO (Int, Int, Int))
-> IO (Int, Int, Int)
forall r.
Vector Nucleotides -> Vector Qual -> (RC_Seq -> IO r) -> IO r
with_rc_seq Vector Nucleotides
r2 Vector Qual
q2 ((RC_Seq -> IO (Int, Int, Int)) -> IO (Int, Int, Int))
-> (RC_Seq -> IO (Int, Int, Int)) -> IO (Int, Int, Int)
forall a b. (a -> b) -> a -> b
$ \prv2 :: RC_Seq
prv2 -> do
            AD_Seqs
-> AD_Seqs -> FW_Seq -> FW_Seq -> RC_Seq -> IO (Int, Int, Int)
min_merge_score AD_Seqs
pads1 AD_Seqs
pads2 FW_Seq
pr1 FW_Seq
pr2 RC_Seq
prv2

-- | Overlap-merging of read pairs.  We shall compute the likelihood
-- for every possible overlap, then select the most likely one (unless it
-- looks completely random), compute a quality from the second best
-- merge, then merge and clamp the quality accordingly.
-- (We could try looking for chimaera after completing the merge, if
-- only we knew which ones to expect?)
--
-- Two reads go in, with two adapter lists.  We return 'Nothing' if all
-- merges looked mostly random.  Else we return the two original reads,
-- flagged as 'eflagVestigial' *and* the merged version, flagged as
-- 'eflagMerged' and optionally 'eflagTrimmed'.  All reads contain the
-- computed qualities (in YM and YN), which we also return.
--
-- The merging automatically limits quality scores some of the time.  We
-- additionally impose a hard limit of 63 to avoid difficulties
-- representing the result, and even that is ridiculous.  Sane people
-- would further limit the returned quality!  (In practice, map quality
-- later imposes a limit anyway, so no worries...)

mergeBam :: Int -> Int
         -> AD_Seqs -> AD_Seqs
         -> BamRec -> BamRec -> IO [BamRec]
mergeBam :: Int -> Int -> AD_Seqs -> AD_Seqs -> BamRec -> BamRec -> IO [BamRec]
mergeBam lowq :: Int
lowq highq :: Int
highq ads1 :: AD_Seqs
ads1 ads2 :: AD_Seqs
ads2 r1 :: BamRec
r1 r2 :: BamRec
r2 = do
    let len_r1 :: Int
len_r1    = Vector_Nucs_half Nucleotides -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
V.length  (Vector_Nucs_half Nucleotides -> Int)
-> Vector_Nucs_half Nucleotides -> Int
forall a b. (a -> b) -> a -> b
$ BamRec -> Vector_Nucs_half Nucleotides
b_seq  BamRec
r1
        len_r2 :: Int
len_r2    = Vector_Nucs_half Nucleotides -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
V.length  (Vector_Nucs_half Nucleotides -> Int)
-> Vector_Nucs_half Nucleotides -> Int
forall a b. (a -> b) -> a -> b
$ BamRec -> Vector_Nucs_half Nucleotides
b_seq  BamRec
r2

        b_seq_r1 :: Vector Nucleotides
b_seq_r1  = Vector_Nucs_half Nucleotides -> Vector Nucleotides
forall (v :: * -> *) a (w :: * -> *).
(Vector v a, Vector w a) =>
v a -> w a
V.convert (Vector_Nucs_half Nucleotides -> Vector Nucleotides)
-> Vector_Nucs_half Nucleotides -> Vector Nucleotides
forall a b. (a -> b) -> a -> b
$ BamRec -> Vector_Nucs_half Nucleotides
b_seq  BamRec
r1
        b_seq_r2 :: Vector Nucleotides
b_seq_r2  = Vector_Nucs_half Nucleotides -> Vector Nucleotides
forall (v :: * -> *) a (w :: * -> *).
(Vector v a, Vector w a) =>
v a -> w a
V.convert (Vector_Nucs_half Nucleotides -> Vector Nucleotides)
-> Vector_Nucs_half Nucleotides -> Vector Nucleotides
forall a b. (a -> b) -> a -> b
$ BamRec -> Vector_Nucs_half Nucleotides
b_seq  BamRec
r2
        b_qual_r1 :: Vector Qual
b_qual_r1 = Vector Qual -> Maybe (Vector Qual) -> Vector Qual
forall a. a -> Maybe a -> a
fromMaybe ((Nucleotides -> Qual) -> Vector Nucleotides -> Vector Qual
forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
V.map (Qual -> Nucleotides -> Qual
forall a b. a -> b -> a
const (Word8 -> Qual
Q 23)) Vector Nucleotides
b_seq_r1) (BamRec -> Maybe (Vector Qual)
b_qual BamRec
r1)
        b_qual_r2 :: Vector Qual
b_qual_r2 = Vector Qual -> Maybe (Vector Qual) -> Vector Qual
forall a. a -> Maybe a -> a
fromMaybe ((Nucleotides -> Qual) -> Vector Nucleotides -> Vector Qual
forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
V.map (Qual -> Nucleotides -> Qual
forall a b. a -> b -> a
const (Word8 -> Qual
Q 23)) Vector Nucleotides
b_seq_r2) (BamRec -> Maybe (Vector Qual)
b_qual BamRec
r2)

    (mlen :: Int
mlen, qual1 :: Int
qual1, qual2 :: Int
qual2) <- AD_Seqs
-> AD_Seqs
-> Vector Nucleotides
-> Vector Qual
-> Vector Nucleotides
-> Vector Qual
-> IO (Int, Int, Int)
find_merge AD_Seqs
ads1 AD_Seqs
ads2 Vector Nucleotides
b_seq_r1 Vector Qual
b_qual_r1 Vector Nucleotides
b_seq_r2 Vector Qual
b_qual_r2

    let flag_alternative :: BamRec -> BamRec
flag_alternative br :: BamRec
br = BamRec
br { b_exts :: Extensions
b_exts = BamKey -> Ext -> Extensions -> Extensions
updateE "FF" (Int -> Ext
Int (Int -> Ext) -> Int -> Ext
forall a b. (a -> b) -> a -> b
$ Int -> BamKey -> BamRec -> Int
extAsInt 0 "FF" BamRec
br Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
eflagAlternative) (Extensions -> Extensions) -> Extensions -> Extensions
forall a b. (a -> b) -> a -> b
$ BamRec -> Extensions
b_exts BamRec
br }
        store_quals :: BamRec -> BamRec
store_quals      br :: BamRec
br = BamRec
br { b_exts :: Extensions
b_exts = BamKey -> Ext -> Extensions -> Extensions
updateE "YM" (Int -> Ext
Int Int
qual1) (Extensions -> Extensions) -> Extensions -> Extensions
forall a b. (a -> b) -> a -> b
$ BamKey -> Ext -> Extensions -> Extensions
updateE "YN" (Int -> Ext
Int Int
qual2) (Extensions -> Extensions) -> Extensions -> Extensions
forall a b. (a -> b) -> a -> b
$ BamRec -> Extensions
b_exts BamRec
br }
        pair_flags :: Int
pair_flags = Int
flagPairedInt -> Int -> Int
forall a. Bits a => a -> a -> a
.|.Int
flagProperlyPairedInt -> Int -> Int
forall a. Bits a => a -> a -> a
.|.Int
flagMateUnmappedInt -> Int -> Int
forall a. Bits a => a -> a -> a
.|.Int
flagMateReversedInt -> Int -> Int
forall a. Bits a => a -> a -> a
.|.Int
flagFirstMateInt -> Int -> Int
forall a. Bits a => a -> a -> a
.|.Int
flagSecondMate

        r1' :: BamRec
r1' = BamRec -> BamRec
store_quals BamRec
r1
        r2' :: BamRec
r2' = BamRec -> BamRec
store_quals BamRec
r2
        rm :: BamRec
rm  = BamRec -> BamRec
store_quals (BamRec -> BamRec) -> BamRec -> BamRec
forall a b. (a -> b) -> a -> b
$ Int -> Word8 -> BamRec
merged_read Int
mlen (Int -> Word8
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int -> Word8) -> Int -> Word8
forall a b. (a -> b) -> a -> b
$ Int -> Int -> Int
forall a. Ord a => a -> a -> a
min 63 Int
qual1)

        merged_read :: Int -> Word8 -> BamRec
merged_read l :: Int
l qmax :: Word8
qmax =
            BamRec
nullBamRec
                { b_qname :: Bytes
b_qname = BamRec -> Bytes
b_qname BamRec
r1
                , b_flag :: Int
b_flag  = Int
flagUnmapped Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int -> Int
forall a. Bits a => a -> a
complement Int
pair_flags Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. BamRec -> Int
b_flag BamRec
r1
                , b_seq :: Vector_Nucs_half Nucleotides
b_seq   = Vector Nucleotides -> Vector_Nucs_half Nucleotides
forall (v :: * -> *) a (w :: * -> *).
(Vector v a, Vector w a) =>
v a -> w a
V.convert (Vector Nucleotides -> Vector_Nucs_half Nucleotides)
-> Vector Nucleotides -> Vector_Nucs_half Nucleotides
forall a b. (a -> b) -> a -> b
$  Int
-> Vector Nucleotides
-> Vector Qual
-> Vector Nucleotides
-> Vector Qual
-> Vector Nucleotides
forall (v :: * -> *).
(Vector v Nucleotides, Vector v Qual) =>
Int
-> v Nucleotides
-> v Qual
-> v Nucleotides
-> v Qual
-> v Nucleotides
merged_seq Int
l Vector Nucleotides
b_seq_r1 Vector Qual
b_qual_r1 Vector Nucleotides
b_seq_r2 Vector Qual
b_qual_r2
                , b_qual :: Maybe (Vector Qual)
b_qual  = Vector Qual -> Maybe (Vector Qual)
forall a. a -> Maybe a
Just (Vector Qual -> Maybe (Vector Qual))
-> Vector Qual -> Maybe (Vector Qual)
forall a b. (a -> b) -> a -> b
$ Word8
-> Int
-> Vector Nucleotides
-> Vector Qual
-> Vector Nucleotides
-> Vector Qual
-> Vector Qual
forall (v :: * -> *).
(Vector v Nucleotides, Vector v Qual) =>
Word8
-> Int
-> v Nucleotides
-> v Qual
-> v Nucleotides
-> v Qual
-> v Qual
merged_qual Word8
qmax Int
l Vector Nucleotides
b_seq_r1 Vector Qual
b_qual_r1 Vector Nucleotides
b_seq_r2 Vector Qual
b_qual_r2
                , b_exts :: Extensions
b_exts  = let ff :: Int
ff = if Int
l Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
len_r1 then Int
eflagTrimmed else 0
                            in BamKey -> Ext -> Extensions -> Extensions
updateE "FF" (Int -> Ext
Int (Int -> Ext) -> Int -> Ext
forall a b. (a -> b) -> a -> b
$ Int -> BamKey -> BamRec -> Int
extAsInt 0 "FF" BamRec
r1 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
eflagMerged Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
ff) (Extensions -> Extensions) -> Extensions -> Extensions
forall a b. (a -> b) -> a -> b
$ BamRec -> Extensions
b_exts BamRec
r1 }

    [BamRec] -> IO [BamRec]
forall (m :: * -> *) a. Monad m => a -> m a
return ([BamRec] -> IO [BamRec]) -> [BamRec] -> IO [BamRec]
forall a b. (a -> b) -> a -> b
$ case () of
        _ | Vector_Nucs_half Nucleotides -> Bool
forall (v :: * -> *) a. Vector v a => v a -> Bool
V.null (BamRec -> Vector_Nucs_half Nucleotides
b_seq BamRec
r1) Bool -> Bool -> Bool
&& Vector_Nucs_half Nucleotides -> Bool
forall (v :: * -> *) a. Vector v a => v a -> Bool
V.null (BamRec -> Vector_Nucs_half Nucleotides
b_seq BamRec
r2) -> [              ]
          | Int
qual1 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
lowq Bool -> Bool -> Bool
|| Int
mlen Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< 0               -> [ BamRec
r1', BamRec
r2'     ]
          | Int
qual1 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
highq Bool -> Bool -> Bool
&& Int
mlen Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== 0            -> [              ]
          | Int
qual1 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
highq                         -> [           BamRec
rm ]
          | Int
mlen Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
len_r1Int -> Int -> Int
forall a. Num a => a -> a -> a
-20 Bool -> Bool -> Bool
|| Int
mlen Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
len_r2Int -> Int -> Int
forall a. Num a => a -> a -> a
-20   -> [           BamRec
rm ]
          | Bool
otherwise         -> (BamRec -> BamRec) -> [BamRec] -> [BamRec]
forall a b. (a -> b) -> [a] -> [b]
map BamRec -> BamRec
flag_alternative [ BamRec
r1', BamRec
r2', BamRec
rm ]

{-# INLINE merged_seq #-}
merged_seq :: (V.Vector v Nucleotides, V.Vector v Qual)
           => Int -> v Nucleotides -> v Qual -> v Nucleotides -> v Qual -> v Nucleotides
merged_seq :: Int
-> v Nucleotides
-> v Qual
-> v Nucleotides
-> v Qual
-> v Nucleotides
merged_seq l :: Int
l b_seq_r1 :: v Nucleotides
b_seq_r1 b_qual_r1 :: v Qual
b_qual_r1 b_seq_r2 :: v Nucleotides
b_seq_r2 b_qual_r2 :: v Qual
b_qual_r2 = [v Nucleotides] -> v Nucleotides
forall (v :: * -> *) a. Vector v a => [v a] -> v a
V.concat
        [ Int -> v Nucleotides -> v Nucleotides
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.take (Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
len_r2) v Nucleotides
b_seq_r1
        , (Nucleotides -> Qual -> Nucleotides -> Qual -> Nucleotides)
-> v Nucleotides
-> v Qual
-> v Nucleotides
-> v Qual
-> v Nucleotides
forall (v :: * -> *) a b c d e.
(Vector v a, Vector v b, Vector v c, Vector v d, Vector v e) =>
(a -> b -> c -> d -> e) -> v a -> v b -> v c -> v d -> v e
V.zipWith4 Nucleotides -> Qual -> Nucleotides -> Qual -> Nucleotides
zz          (Int -> v Nucleotides -> v Nucleotides
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.take Int
l (v Nucleotides -> v Nucleotides) -> v Nucleotides -> v Nucleotides
forall a b. (a -> b) -> a -> b
$ Int -> v Nucleotides -> v Nucleotides
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.drop (Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
len_r2) v Nucleotides
b_seq_r1)
                                 (Int -> v Qual -> v Qual
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.take Int
l (v Qual -> v Qual) -> v Qual -> v Qual
forall a b. (a -> b) -> a -> b
$ Int -> v Qual -> v Qual
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.drop (Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
len_r2) v Qual
b_qual_r1)
                     (v Nucleotides -> v Nucleotides
forall (v :: * -> *) a. Vector v a => v a -> v a
V.reverse (v Nucleotides -> v Nucleotides) -> v Nucleotides -> v Nucleotides
forall a b. (a -> b) -> a -> b
$ Int -> v Nucleotides -> v Nucleotides
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.take Int
l (v Nucleotides -> v Nucleotides) -> v Nucleotides -> v Nucleotides
forall a b. (a -> b) -> a -> b
$ Int -> v Nucleotides -> v Nucleotides
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.drop (Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
len_r1) v Nucleotides
b_seq_r2)
                     (v Qual -> v Qual
forall (v :: * -> *) a. Vector v a => v a -> v a
V.reverse (v Qual -> v Qual) -> v Qual -> v Qual
forall a b. (a -> b) -> a -> b
$ Int -> v Qual -> v Qual
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.take Int
l (v Qual -> v Qual) -> v Qual -> v Qual
forall a b. (a -> b) -> a -> b
$ Int -> v Qual -> v Qual
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.drop (Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
len_r1) v Qual
b_qual_r2)
        , v Nucleotides -> v Nucleotides
forall (v :: * -> *) a. Vector v a => v a -> v a
V.reverse (v Nucleotides -> v Nucleotides) -> v Nucleotides -> v Nucleotides
forall a b. (a -> b) -> a -> b
$ Int -> v Nucleotides -> v Nucleotides
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.take (Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
len_r1) v Nucleotides
b_seq_r2 ]
  where
    len_r1 :: Int
len_r1 = v Qual -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
V.length v Qual
b_qual_r1
    len_r2 :: Int
len_r2 = v Qual -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
V.length v Qual
b_qual_r2
    zz :: Nucleotides -> Qual -> Nucleotides -> Qual -> Nucleotides
zz !Nucleotides
n1 (Q !Word8
q1) !Nucleotides
n2 (Q !Word8
q2) | Nucleotides
n1 Nucleotides -> Nucleotides -> Bool
forall a. Eq a => a -> a -> Bool
== Nucleotides -> Nucleotides
compls Nucleotides
n2 =        Nucleotides
n1
                               | Word8
q1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
> Word8
q2         =        Nucleotides
n1
                               | Bool
otherwise       = Nucleotides -> Nucleotides
compls Nucleotides
n2

{-# INLINE merged_qual #-}
merged_qual :: (V.Vector v Nucleotides, V.Vector v Qual)
            => Word8 -> Int -> v Nucleotides -> v Qual -> v Nucleotides -> v Qual -> v Qual
merged_qual :: Word8
-> Int
-> v Nucleotides
-> v Qual
-> v Nucleotides
-> v Qual
-> v Qual
merged_qual qmax :: Word8
qmax l :: Int
l b_seq_r1 :: v Nucleotides
b_seq_r1 b_qual_r1 :: v Qual
b_qual_r1 b_seq_r2 :: v Nucleotides
b_seq_r2 b_qual_r2 :: v Qual
b_qual_r2 = [v Qual] -> v Qual
forall (v :: * -> *) a. Vector v a => [v a] -> v a
V.concat
        [ Int -> v Qual -> v Qual
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.take (Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
len_r2) v Qual
b_qual_r1
        , (Nucleotides -> Qual -> Nucleotides -> Qual -> Qual)
-> v Nucleotides -> v Qual -> v Nucleotides -> v Qual -> v Qual
forall (v :: * -> *) a b c d e.
(Vector v a, Vector v b, Vector v c, Vector v d, Vector v e) =>
(a -> b -> c -> d -> e) -> v a -> v b -> v c -> v d -> v e
V.zipWith4 Nucleotides -> Qual -> Nucleotides -> Qual -> Qual
zz           (Int -> v Nucleotides -> v Nucleotides
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.take Int
l (v Nucleotides -> v Nucleotides) -> v Nucleotides -> v Nucleotides
forall a b. (a -> b) -> a -> b
$ Int -> v Nucleotides -> v Nucleotides
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.drop (Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
len_r2) v Nucleotides
b_seq_r1)
                                  (Int -> v Qual -> v Qual
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.take Int
l (v Qual -> v Qual) -> v Qual -> v Qual
forall a b. (a -> b) -> a -> b
$ Int -> v Qual -> v Qual
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.drop (Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
len_r2) v Qual
b_qual_r1)
                      (v Nucleotides -> v Nucleotides
forall (v :: * -> *) a. Vector v a => v a -> v a
V.reverse (v Nucleotides -> v Nucleotides) -> v Nucleotides -> v Nucleotides
forall a b. (a -> b) -> a -> b
$ Int -> v Nucleotides -> v Nucleotides
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.take Int
l (v Nucleotides -> v Nucleotides) -> v Nucleotides -> v Nucleotides
forall a b. (a -> b) -> a -> b
$ Int -> v Nucleotides -> v Nucleotides
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.drop (Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
len_r1) v Nucleotides
b_seq_r2)
                      (v Qual -> v Qual
forall (v :: * -> *) a. Vector v a => v a -> v a
V.reverse (v Qual -> v Qual) -> v Qual -> v Qual
forall a b. (a -> b) -> a -> b
$ Int -> v Qual -> v Qual
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.take Int
l (v Qual -> v Qual) -> v Qual -> v Qual
forall a b. (a -> b) -> a -> b
$ Int -> v Qual -> v Qual
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.drop (Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
len_r1) v Qual
b_qual_r2)
        , v Qual -> v Qual
forall (v :: * -> *) a. Vector v a => v a -> v a
V.reverse (v Qual -> v Qual) -> v Qual -> v Qual
forall a b. (a -> b) -> a -> b
$ Int -> v Qual -> v Qual
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.take (Int
l Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
len_r1) v Qual
b_qual_r2 ]
  where
    len_r1 :: Int
len_r1 = v Qual -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
V.length v Qual
b_qual_r1
    len_r2 :: Int
len_r2 = v Qual -> Int
forall (v :: * -> *) a. Vector v a => v a -> Int
V.length v Qual
b_qual_r2
    zz :: Nucleotides -> Qual -> Nucleotides -> Qual -> Qual
zz !Nucleotides
n1 (Q !Word8
q1) !Nucleotides
n2 (Q !Word8
q2) | Nucleotides
n1 Nucleotides -> Nucleotides -> Bool
forall a. Eq a => a -> a -> Bool
== Nucleotides -> Nucleotides
compls Nucleotides
n2 = Word8 -> Qual
Q (Word8 -> Qual) -> Word8 -> Qual
forall a b. (a -> b) -> a -> b
$ Word8 -> Word8 -> Word8
forall a. Ord a => a -> a -> a
min Word8
qmax (Word8
q1 Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
+ Word8
q2)
                               | Word8
q1 Word8 -> Word8 -> Bool
forall a. Ord a => a -> a -> Bool
> Word8
q2         = Word8 -> Qual
Q (Word8 -> Qual) -> Word8 -> Qual
forall a b. (a -> b) -> a -> b
$           Word8
q1 Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
- Word8
q2
                               | Bool
otherwise       = Word8 -> Qual
Q (Word8 -> Qual) -> Word8 -> Qual
forall a b. (a -> b) -> a -> b
$           Word8
q2 Word8 -> Word8 -> Word8
forall a. Num a => a -> a -> a
- Word8
q1



-- | Finds the trimming point.  Input is list of forward adapters,
-- sequence, quality; output is trim point and two qualities (YM, YN).
find_trim :: AD_Seqs
          -> W.Vector Nucleotides -> W.Vector Qual
          -> IO (Int, Int, Int)
find_trim :: AD_Seqs -> Vector Nucleotides -> Vector Qual -> IO (Int, Int, Int)
find_trim pads1 :: AD_Seqs
pads1 r1 :: Vector Nucleotides
r1 q1 :: Vector Qual
q1 =
        [Vector Nucleotides]
-> (AD_Seqs -> IO (Int, Int, Int)) -> IO (Int, Int, Int)
forall (m :: * -> *) r.
MonadBaseControl IO m =>
[Vector Nucleotides] -> (AD_Seqs -> m r) -> m r
withADSeqs [Vector Nucleotides
forall a. Storable a => Vector a
W.empty]              ((AD_Seqs -> IO (Int, Int, Int)) -> IO (Int, Int, Int))
-> (AD_Seqs -> IO (Int, Int, Int)) -> IO (Int, Int, Int)
forall a b. (a -> b) -> a -> b
$ \pads2 :: AD_Seqs
pads2 ->
        Vector Nucleotides
-> Vector Qual
-> (FW_Seq -> IO (Int, Int, Int))
-> IO (Int, Int, Int)
forall r.
Vector Nucleotides -> Vector Qual -> (FW_Seq -> IO r) -> IO r
with_fw_seq Vector Nucleotides
r1 Vector Qual
q1                   ((FW_Seq -> IO (Int, Int, Int)) -> IO (Int, Int, Int))
-> (FW_Seq -> IO (Int, Int, Int)) -> IO (Int, Int, Int)
forall a b. (a -> b) -> a -> b
$ \pr1 :: FW_Seq
pr1 ->
        AD_Seqs
-> AD_Seqs -> FW_Seq -> FW_Seq -> RC_Seq -> IO (Int, Int, Int)
min_merge_score AD_Seqs
pads1 AD_Seqs
pads2 FW_Seq
pr1 (Ptr Nucleotides -> Ptr Qual -> Int -> FW_Seq
FW_Seq Ptr Nucleotides
forall a. Ptr a
nullPtr Ptr Qual
forall a. Ptr a
nullPtr 0) (Ptr Nucleotides -> Ptr Qual -> Int -> RC_Seq
RC_Seq Ptr Nucleotides
forall a. Ptr a
nullPtr Ptr Qual
forall a. Ptr a
nullPtr 0)

-- | Trimming for a single read:  we need one adapter only (the one coming
-- /after/ the read), here provided as a list of options, and then we
-- merge with an empty second read.  Results in up to two reads (the
-- original, possibly flagged, and the trimmed one, definitely flagged,
-- and two qualities).
trimBam :: Int -> Int -> AD_Seqs -> BamRec -> IO [BamRec]
trimBam :: Int -> Int -> AD_Seqs -> BamRec -> IO [BamRec]
trimBam lowq :: Int
lowq highq :: Int
highq ads1 :: AD_Seqs
ads1 r1 :: BamRec
r1 = do
    let b_seq_r1 :: Vector Nucleotides
b_seq_r1 = Vector_Nucs_half Nucleotides -> Vector Nucleotides
forall (v :: * -> *) a (w :: * -> *).
(Vector v a, Vector w a) =>
v a -> w a
V.convert (Vector_Nucs_half Nucleotides -> Vector Nucleotides)
-> Vector_Nucs_half Nucleotides -> Vector Nucleotides
forall a b. (a -> b) -> a -> b
$ BamRec -> Vector_Nucs_half Nucleotides
b_seq BamRec
r1
    (mlen :: Int
mlen, qual1 :: Int
qual1, qual2 :: Int
qual2) <- AD_Seqs -> Vector Nucleotides -> Vector Qual -> IO (Int, Int, Int)
find_trim AD_Seqs
ads1 Vector Nucleotides
b_seq_r1 (Vector Qual -> IO (Int, Int, Int))
-> Vector Qual -> IO (Int, Int, Int)
forall a b. (a -> b) -> a -> b
$
                            Vector Qual -> Maybe (Vector Qual) -> Vector Qual
forall a. a -> Maybe a -> a
fromMaybe ((Nucleotides -> Qual) -> Vector Nucleotides -> Vector Qual
forall (v :: * -> *) a b.
(Vector v a, Vector v b) =>
(a -> b) -> v a -> v b
V.map (Qual -> Nucleotides -> Qual
forall a b. a -> b -> a
const (Word8 -> Qual
Q 23)) Vector Nucleotides
b_seq_r1) (BamRec -> Maybe (Vector Qual)
b_qual BamRec
r1)

    let flag_alternative :: BamRec -> BamRec
flag_alternative br :: BamRec
br = BamRec
br { b_exts :: Extensions
b_exts = BamKey -> Ext -> Extensions -> Extensions
updateE "FF" (Int -> Ext
Int (Int -> Ext) -> Int -> Ext
forall a b. (a -> b) -> a -> b
$ Int -> BamKey -> BamRec -> Int
extAsInt 0 "FF" BamRec
br Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
eflagAlternative) (Extensions -> Extensions) -> Extensions -> Extensions
forall a b. (a -> b) -> a -> b
$ BamRec -> Extensions
b_exts BamRec
br }
        store_quals :: BamRec -> BamRec
store_quals      br :: BamRec
br = BamRec
br { b_exts :: Extensions
b_exts = BamKey -> Ext -> Extensions -> Extensions
updateE "YM" (Int -> Ext
Int Int
qual1) (Extensions -> Extensions) -> Extensions -> Extensions
forall a b. (a -> b) -> a -> b
$ BamKey -> Ext -> Extensions -> Extensions
updateE "YN" (Int -> Ext
Int Int
qual2) (Extensions -> Extensions) -> Extensions -> Extensions
forall a b. (a -> b) -> a -> b
$ BamRec -> Extensions
b_exts BamRec
br }

        r1' :: BamRec
r1'  = BamRec -> BamRec
store_quals BamRec
r1
        r1t :: BamRec
r1t  = BamRec -> BamRec
store_quals (BamRec -> BamRec) -> BamRec -> BamRec
forall a b. (a -> b) -> a -> b
$ Int -> BamRec
trimmed_read Int
mlen

        trimmed_read :: Int -> BamRec
trimmed_read l :: Int
l = BamRec
nullBamRec {
                b_qname :: Bytes
b_qname = BamRec -> Bytes
b_qname BamRec
r1,
                b_flag :: Int
b_flag  = Int
flagUnmapped Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. BamRec -> Int
b_flag BamRec
r1,
                b_seq :: Vector_Nucs_half Nucleotides
b_seq   = Int -> Vector_Nucs_half Nucleotides -> Vector_Nucs_half Nucleotides
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.take Int
l  (Vector_Nucs_half Nucleotides -> Vector_Nucs_half Nucleotides)
-> Vector_Nucs_half Nucleotides -> Vector_Nucs_half Nucleotides
forall a b. (a -> b) -> a -> b
$  BamRec -> Vector_Nucs_half Nucleotides
b_seq  BamRec
r1,
                b_qual :: Maybe (Vector Qual)
b_qual  = Int -> Vector Qual -> Vector Qual
forall (v :: * -> *) a. Vector v a => Int -> v a -> v a
V.take Int
l (Vector Qual -> Vector Qual)
-> Maybe (Vector Qual) -> Maybe (Vector Qual)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> BamRec -> Maybe (Vector Qual)
b_qual BamRec
r1,
                b_exts :: Extensions
b_exts  = BamKey -> Ext -> Extensions -> Extensions
updateE "FF" (Int -> Ext
Int (Int -> Ext) -> Int -> Ext
forall a b. (a -> b) -> a -> b
$ Int -> BamKey -> BamRec -> Int
extAsInt 0 "FF" BamRec
r1 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. Int
eflagTrimmed) (Extensions -> Extensions) -> Extensions -> Extensions
forall a b. (a -> b) -> a -> b
$ BamRec -> Extensions
b_exts BamRec
r1 }

    [BamRec] -> IO [BamRec]
forall (m :: * -> *) a. Monad m => a -> m a
return ([BamRec] -> IO [BamRec]) -> [BamRec] -> IO [BamRec]
forall a b. (a -> b) -> a -> b
$ case () of
        _ | Vector_Nucs_half Nucleotides -> Bool
forall (v :: * -> *) a. Vector v a => v a -> Bool
V.null (BamRec -> Vector_Nucs_half Nucleotides
b_seq BamRec
r1)              -> [          ]
          | Int
mlen Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== 0 Bool -> Bool -> Bool
&& Int
qual1 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
highq    -> [          ]
          | Int
qual1 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
lowq Bool -> Bool -> Bool
|| Int
mlen Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< 0       -> [ BamRec
r1'      ]
          | Int
qual1 Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
highq                 -> [      BamRec
r1t ]
          | Bool
otherwise -> (BamRec -> BamRec) -> [BamRec] -> [BamRec]
forall a b. (a -> b) -> [a] -> [b]
map BamRec -> BamRec
flag_alternative [ BamRec
r1', BamRec
r1t ]


-- | For merging, we don't need the complete adapters (length around 70!),
-- only a sufficient prefix.  Taking only the more-or-less constant
-- part (length around 30), there aren't all that many different
-- adapters in the world.  To deal with pretty much every library, we
-- only need the following forward adapters, which will be the default
-- (defined here in the direction they would be sequenced in):  Genomic
-- R2, Multiplex R2, Fraft P7.

default_fwd_adapters :: [W.Vector Nucleotides]
default_fwd_adapters :: [Vector Nucleotides]
default_fwd_adapters = ([Char] -> Vector Nucleotides) -> [[Char]] -> [Vector Nucleotides]
forall a b. (a -> b) -> [a] -> [b]
map ([Nucleotides] -> Vector Nucleotides
forall a. Storable a => [a] -> Vector a
W.fromList ([Nucleotides] -> Vector Nucleotides)
-> ([Char] -> [Nucleotides]) -> [Char] -> Vector Nucleotides
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. (Word8 -> Nucleotides) -> [Word8] -> [Nucleotides]
forall a b. (a -> b) -> [a] -> [b]
map Word8 -> Nucleotides
toNucleotides ([Word8] -> [Nucleotides])
-> ([Char] -> [Word8]) -> [Char] -> [Nucleotides]
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. (Char -> Word8) -> [Char] -> [Word8]
forall a b. (a -> b) -> [a] -> [b]
map Char -> Word8
c2w)
         [ {- Genomic R2   -}  "AGATCGGAAGAGCGGTTCAG"
         , {- Multiplex R2 -}  "AGATCGGAAGAGCACACGTC"
         , {- Graft P7     -}  "AGATCGGAAGAGCTCGTATG" ]

-- | Like 'default_rev_adapters', these are the few adapters needed for
-- the reverse read (defined in the direction they would be sequenced in
-- as part of the second read):  Genomic R1, CL 72.

default_rev_adapters :: [W.Vector Nucleotides]
default_rev_adapters :: [Vector Nucleotides]
default_rev_adapters = ([Char] -> Vector Nucleotides) -> [[Char]] -> [Vector Nucleotides]
forall a b. (a -> b) -> [a] -> [b]
map ([Nucleotides] -> Vector Nucleotides
forall a. Storable a => [a] -> Vector a
W.fromList ([Nucleotides] -> Vector Nucleotides)
-> ([Char] -> [Nucleotides]) -> [Char] -> Vector Nucleotides
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. (Word8 -> Nucleotides) -> [Word8] -> [Nucleotides]
forall a b. (a -> b) -> [a] -> [b]
map Word8 -> Nucleotides
toNucleotides ([Word8] -> [Nucleotides])
-> ([Char] -> [Word8]) -> [Char] -> [Nucleotides]
forall k (cat :: k -> k -> *) (b :: k) (c :: k) (a :: k).
Category cat =>
cat b c -> cat a b -> cat a c
. (Char -> Word8) -> [Char] -> [Word8]
forall a b. (a -> b) -> [a] -> [b]
map Char -> Word8
c2w)
         [ {- Genomic_R1   -}  "AGATCGGAAGAGCGTCGTGT"
         , {- CL72         -}  "GGAAGAGCGTCGTGTAGGGA" ]

-- We need to compute the likelihood of a read pair given an assumed
-- insert length.  The likelihood of the first read is the likelihood of
-- a match with the adapter where it overlaps the 3' adapter, elsewhere
-- it's 1/4 per position.  The likelihood of the second read is the
-- likelihood of a match with the adapter where it overlaps the adapter,
-- the likehood of a read-read match where it overlaps read one, 1/4 per
-- position elsewhere.  (Yes, this ignores base composition.  It doesn't
-- matter enough.)

min_merge_score
    :: AD_Seqs              -- 3' adapters as they appear in the first read
    -> AD_Seqs              -- 5' adapters as they appear in the second read
    -> FW_Seq               -- first read, prepped
    -> FW_Seq               -- second read, qual, prepped
    -> RC_Seq               -- second read, qual, reversed and prepped
    -> IO (Int,Int,Int)     -- best length, min score, 2nd min score
min_merge_score :: AD_Seqs
-> AD_Seqs -> FW_Seq -> FW_Seq -> RC_Seq -> IO (Int, Int, Int)
min_merge_score (AD_Seqs !Ptr (Ptr Nucleotides)
p_fwd_ads !Ptr CInt
p_fwd_lns !Int
n_fwd_ads) (AD_Seqs !Ptr (Ptr Nucleotides)
p_rev_ads !Ptr CInt
p_rev_lns !Int
n_rev_ads)
            (FW_Seq !Ptr Nucleotides
p_rd1 !Ptr Qual
p_qs1 !Int
l1) (FW_Seq !Ptr Nucleotides
p_rd2 !Ptr Qual
p_qs2 !Int
l2) (RC_Seq !Ptr Nucleotides
p_rrd2 !Ptr Qual
p_rqs2 _) =
    Int -> (Ptr CInt -> IO (Int, Int, Int)) -> IO (Int, Int, Int)
forall a b. Storable a => Int -> (Ptr a -> IO b) -> IO b
allocaArray 2 ((Ptr CInt -> IO (Int, Int, Int)) -> IO (Int, Int, Int))
-> (Ptr CInt -> IO (Int, Int, Int)) -> IO (Int, Int, Int)
forall a b. (a -> b) -> a -> b
$ \pmins :: Ptr CInt
pmins ->
        (Int -> Int -> Int -> (Int, Int, Int))
-> IO Int -> IO Int -> IO Int -> IO (Int, Int, Int)
forall (m :: * -> *) a1 a2 a3 r.
Monad m =>
(a1 -> a2 -> a3 -> r) -> m a1 -> m a2 -> m a3 -> m r
liftM3 (,,)
               (CInt -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (CInt -> Int) -> IO CInt -> IO Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$>
                Ptr (Ptr Nucleotides)
-> Ptr CInt
-> CInt
-> Ptr (Ptr Nucleotides)
-> Ptr CInt
-> CInt
-> Ptr Nucleotides
-> Ptr Qual
-> CInt
-> Ptr Nucleotides
-> Ptr Qual
-> CInt
-> Ptr Nucleotides
-> Ptr Qual
-> Ptr CInt
-> IO CInt
prim_merge_score Ptr (Ptr Nucleotides)
p_fwd_ads Ptr CInt
p_fwd_lns (Int -> CInt
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n_fwd_ads)
                                 Ptr (Ptr Nucleotides)
p_rev_ads Ptr CInt
p_rev_lns (Int -> CInt
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
n_rev_ads)
                                 Ptr Nucleotides
p_rd1 Ptr Qual
p_qs1 (Int -> CInt
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
l1)
                                 Ptr Nucleotides
p_rd2 Ptr Qual
p_qs2 (Int -> CInt
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
l2)
                                 Ptr Nucleotides
p_rrd2 Ptr Qual
p_rqs2 Ptr CInt
pmins)
               (CInt -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (CInt -> Int) -> IO CInt -> IO Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Ptr CInt -> Int -> IO CInt
forall a. Storable a => Ptr a -> Int -> IO a
peekElemOff Ptr CInt
pmins 0)
               (CInt -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (CInt -> Int) -> IO CInt -> IO Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Ptr CInt -> Int -> IO CInt
forall a. Storable a => Ptr a -> Int -> IO a
peekElemOff Ptr CInt
pmins 1)

foreign import ccall unsafe "prim_merge_score"
    prim_merge_score :: Ptr (Ptr Nucleotides) -> Ptr CInt -> CInt
                     -> Ptr (Ptr Nucleotides) -> Ptr CInt -> CInt
                     -> Ptr Nucleotides -> Ptr Qual -> CInt
                     -> Ptr Nucleotides -> Ptr Qual -> CInt
                     -> Ptr Nucleotides -> Ptr Qual
                     -> Ptr CInt -> IO CInt



data AD_Seqs = AD_Seqs !(Ptr (Ptr Nucleotides)) !(Ptr CInt) !Int
data FW_Seq  = FW_Seq !(Ptr Nucleotides) !(Ptr Qual) !Int
data RC_Seq  = RC_Seq !(Ptr Nucleotides) !(Ptr Qual) !Int

-- Maybe pad with something suitable?
withADSeqs :: MonadBaseControl IO m => [W.Vector Nucleotides] -> (AD_Seqs -> m r) -> m r
withADSeqs :: [Vector Nucleotides] -> (AD_Seqs -> m r) -> m r
withADSeqs ads0 :: [Vector Nucleotides]
ads0 k :: AD_Seqs -> m r
k =
    (RunInBase m IO -> IO (StM m r)) -> m r
forall (b :: * -> *) (m :: * -> *) a.
MonadBaseControl b m =>
(RunInBase m b -> b (StM m a)) -> m a
control                                                 ((RunInBase m IO -> IO (StM m r)) -> m r)
-> (RunInBase m IO -> IO (StM m r)) -> m r
forall a b. (a -> b) -> a -> b
$ \run_io :: RunInBase m IO
run_io ->
    Int -> (Ptr (Ptr Nucleotides) -> IO (StM m r)) -> IO (StM m r)
forall a b. Storable a => Int -> (Ptr a -> IO b) -> IO b
allocaArray ([Vector Nucleotides] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Vector Nucleotides]
ads0)                               ((Ptr (Ptr Nucleotides) -> IO (StM m r)) -> IO (StM m r))
-> (Ptr (Ptr Nucleotides) -> IO (StM m r)) -> IO (StM m r)
forall a b. (a -> b) -> a -> b
$ \pps :: Ptr (Ptr Nucleotides)
pps ->
    Int -> (Ptr CInt -> IO (StM m r)) -> IO (StM m r)
forall a b. Storable a => Int -> (Ptr a -> IO b) -> IO b
allocaArray ([Vector Nucleotides] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [Vector Nucleotides]
ads0)                               ((Ptr CInt -> IO (StM m r)) -> IO (StM m r))
-> (Ptr CInt -> IO (StM m r)) -> IO (StM m r)
forall a b. (a -> b) -> a -> b
$ \pls :: Ptr CInt
pls ->
    let go :: Int -> [Vector Nucleotides] -> IO (StM m r)
go !Int
n [    ] = m r -> IO (StM m r)
RunInBase m IO
run_io (AD_Seqs -> m r
k (AD_Seqs -> m r) -> AD_Seqs -> m r
forall a b. (a -> b) -> a -> b
$! Ptr (Ptr Nucleotides) -> Ptr CInt -> Int -> AD_Seqs
AD_Seqs Ptr (Ptr Nucleotides)
pps Ptr CInt
pls Int
n)
        go !Int
n (v :: Vector Nucleotides
v:vs :: [Vector Nucleotides]
vs) = Vector Nucleotides
-> (Ptr Nucleotides -> IO (StM m r)) -> IO (StM m r)
forall a b. Storable a => Vector a -> (Ptr a -> IO b) -> IO b
W.unsafeWith Vector Nucleotides
v ((Ptr Nucleotides -> IO (StM m r)) -> IO (StM m r))
-> (Ptr Nucleotides -> IO (StM m r)) -> IO (StM m r)
forall a b. (a -> b) -> a -> b
$ \pa :: Ptr Nucleotides
pa -> do
                       Ptr (Ptr Nucleotides) -> Int -> Ptr Nucleotides -> IO ()
forall a. Storable a => Ptr a -> Int -> a -> IO ()
pokeElemOff Ptr (Ptr Nucleotides)
pps Int
n Ptr Nucleotides
pa
                       Ptr CInt -> Int -> CInt -> IO ()
forall a. Storable a => Ptr a -> Int -> a -> IO ()
pokeElemOff Ptr CInt
pls Int
n (Int -> CInt
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Vector Nucleotides -> Int
forall a. Storable a => Vector a -> Int
W.length Vector Nucleotides
v))
                       Int -> [Vector Nucleotides] -> IO (StM m r)
go (Int -> Int
forall a. Enum a => a -> a
succ Int
n) [Vector Nucleotides]
vs
    in Int -> [Vector Nucleotides] -> IO (StM m r)
go 0 [Vector Nucleotides]
ads0

-- Maybe pad with something suitable?
with_fw_seq :: W.Vector Nucleotides -> W.Vector Qual -> (FW_Seq -> IO r) -> IO r
with_fw_seq :: Vector Nucleotides -> Vector Qual -> (FW_Seq -> IO r) -> IO r
with_fw_seq ns :: Vector Nucleotides
ns qs :: Vector Qual
qs k :: FW_Seq -> IO r
k
    | Vector Nucleotides -> Int
forall a. Storable a => Vector a -> Int
W.length Vector Nucleotides
ns Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Vector Qual -> Int
forall a. Storable a => Vector a -> Int
W.length Vector Qual
qs
        = Vector Nucleotides -> (Ptr Nucleotides -> IO r) -> IO r
forall a b. Storable a => Vector a -> (Ptr a -> IO b) -> IO b
W.unsafeWith Vector Nucleotides
ns    ((Ptr Nucleotides -> IO r) -> IO r)
-> (Ptr Nucleotides -> IO r) -> IO r
forall a b. (a -> b) -> a -> b
$ \p_ns :: Ptr Nucleotides
p_ns ->
          Vector Qual -> (Ptr Qual -> IO r) -> IO r
forall a b. Storable a => Vector a -> (Ptr a -> IO b) -> IO b
W.unsafeWith Vector Qual
qs    ((Ptr Qual -> IO r) -> IO r) -> (Ptr Qual -> IO r) -> IO r
forall a b. (a -> b) -> a -> b
$ \p_qs :: Ptr Qual
p_qs ->
          FW_Seq -> IO r
k (Ptr Nucleotides -> Ptr Qual -> Int -> FW_Seq
FW_Seq Ptr Nucleotides
p_ns Ptr Qual
p_qs (Int -> FW_Seq) -> Int -> FW_Seq
forall a b. (a -> b) -> a -> b
$ Vector Nucleotides -> Int
forall a. Storable a => Vector a -> Int
W.length Vector Nucleotides
ns)
    | Bool
otherwise
        = LengthMismatch -> IO r
forall e a. Exception e => e -> IO a
throwIO (LengthMismatch -> IO r) -> LengthMismatch -> IO r
forall a b. (a -> b) -> a -> b
$ Bytes -> LengthMismatch
LengthMismatch "forward adapter"
{-# INLINE with_fw_seq #-}

-- Maybe pad with something suitable?
with_rc_seq :: W.Vector Nucleotides -> W.Vector Qual -> (RC_Seq -> IO r) -> IO r
with_rc_seq :: Vector Nucleotides -> Vector Qual -> (RC_Seq -> IO r) -> IO r
with_rc_seq ns :: Vector Nucleotides
ns qs :: Vector Qual
qs k :: RC_Seq -> IO r
k
    | Vector Nucleotides -> Int
forall a. Storable a => Vector a -> Int
W.length Vector Nucleotides
ns Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Vector Qual -> Int
forall a. Storable a => Vector a -> Int
W.length Vector Qual
qs
        = Vector Nucleotides -> (Ptr Nucleotides -> IO r) -> IO r
forall a b. Storable a => Vector a -> (Ptr a -> IO b) -> IO b
W.unsafeWith (Vector Nucleotides -> Vector Nucleotides
forall a. Storable a => Vector a -> Vector a
W.reverse (Vector Nucleotides -> Vector Nucleotides)
-> Vector Nucleotides -> Vector Nucleotides
forall a b. (a -> b) -> a -> b
$ (Nucleotides -> Nucleotides)
-> Vector Nucleotides -> Vector Nucleotides
forall a b.
(Storable a, Storable b) =>
(a -> b) -> Vector a -> Vector b
W.map Nucleotides -> Nucleotides
compls Vector Nucleotides
ns)    ((Ptr Nucleotides -> IO r) -> IO r)
-> (Ptr Nucleotides -> IO r) -> IO r
forall a b. (a -> b) -> a -> b
$ \p_rns :: Ptr Nucleotides
p_rns ->
          Vector Qual -> (Ptr Qual -> IO r) -> IO r
forall a b. Storable a => Vector a -> (Ptr a -> IO b) -> IO b
W.unsafeWith (Vector Qual -> Vector Qual
forall a. Storable a => Vector a -> Vector a
W.reverse Vector Qual
qs)                   ((Ptr Qual -> IO r) -> IO r) -> (Ptr Qual -> IO r) -> IO r
forall a b. (a -> b) -> a -> b
$ \p_rqs :: Ptr Qual
p_rqs ->
          RC_Seq -> IO r
k (Ptr Nucleotides -> Ptr Qual -> Int -> RC_Seq
RC_Seq Ptr Nucleotides
p_rns Ptr Qual
p_rqs (Int -> RC_Seq) -> Int -> RC_Seq
forall a b. (a -> b) -> a -> b
$ Vector Nucleotides -> Int
forall a. Storable a => Vector a -> Int
W.length Vector Nucleotides
ns)
    | Bool
otherwise
        = LengthMismatch -> IO r
forall e a. Exception e => e -> IO a
throwIO (LengthMismatch -> IO r) -> LengthMismatch -> IO r
forall a b. (a -> b) -> a -> b
$ Bytes -> LengthMismatch
LengthMismatch "reverse adapter"
{-# INLINE with_rc_seq #-}