module Bio.ABI.Clean
( Cleanable (..)
, Thresholds (..)
, defaultThresholds
) where
import Control.Monad (guard, join)
import qualified Data.Vector as V
import Bio.Sequence (mean, meanInRange)
import qualified Bio.Sequence as S (drop, length, reverse, tail, take)
import Bio.Sequence.Basecalled (BasecalledSequence, BasecalledSequenceWithRawData (..))
class Cleanable a where
clean :: a -> Maybe a
clean = Thresholds -> a -> Maybe a
forall a. Cleanable a => Thresholds -> a -> Maybe a
cleanWith Thresholds
defaultThresholds
cleanWith :: Thresholds -> a -> Maybe a
data Thresholds
= Thresholds
{ Thresholds -> Int
frameSize :: Int
, Thresholds -> Double
edgeThreshold :: Double
, Thresholds -> Double
innerThreshold :: Double
}
deriving (Thresholds -> Thresholds -> Bool
(Thresholds -> Thresholds -> Bool)
-> (Thresholds -> Thresholds -> Bool) -> Eq Thresholds
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Thresholds -> Thresholds -> Bool
$c/= :: Thresholds -> Thresholds -> Bool
== :: Thresholds -> Thresholds -> Bool
$c== :: Thresholds -> Thresholds -> Bool
Eq, Int -> Thresholds -> ShowS
[Thresholds] -> ShowS
Thresholds -> String
(Int -> Thresholds -> ShowS)
-> (Thresholds -> String)
-> ([Thresholds] -> ShowS)
-> Show Thresholds
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Thresholds] -> ShowS
$cshowList :: [Thresholds] -> ShowS
show :: Thresholds -> String
$cshow :: Thresholds -> String
showsPrec :: Int -> Thresholds -> ShowS
$cshowsPrec :: Int -> Thresholds -> ShowS
Show)
defaultThresholds :: Thresholds
defaultThresholds :: Thresholds
defaultThresholds = Int -> Double -> Double -> Thresholds
Thresholds Int
10 Double
20 Double
30
instance Cleanable BasecalledSequence where
cleanWith :: Thresholds -> BasecalledSequence -> Maybe BasecalledSequence
cleanWith Thresholds
thr BasecalledSequence
input = do
BasecalledSequence
cut <- Maybe BasecalledSequence
fromBoth
Bool -> Maybe ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Maybe ()) -> Bool -> Maybe ()
forall a b. (a -> b) -> a -> b
$ Thresholds -> BasecalledSequence -> Bool
checkInner Thresholds
thr BasecalledSequence
cut
BasecalledSequence -> Maybe BasecalledSequence
forall (m :: * -> *) a. Monad m => a -> m a
return BasecalledSequence
cut
where
fromLeft :: Maybe BasecalledSequence
fromLeft = Thresholds -> BasecalledSequence -> Maybe BasecalledSequence
doCutEdge Thresholds
thr BasecalledSequence
input
fromBoth :: Maybe BasecalledSequence
fromBoth = (BasecalledSequence -> BasecalledSequence)
-> Maybe BasecalledSequence -> Maybe BasecalledSequence
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap BasecalledSequence -> BasecalledSequence
forall s. IsSequence s => s -> s
S.reverse
(Maybe BasecalledSequence -> Maybe BasecalledSequence)
-> (Maybe (Maybe BasecalledSequence) -> Maybe BasecalledSequence)
-> Maybe (Maybe BasecalledSequence)
-> Maybe BasecalledSequence
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Maybe (Maybe BasecalledSequence) -> Maybe BasecalledSequence
forall (m :: * -> *) a. Monad m => m (m a) -> m a
join
(Maybe (Maybe BasecalledSequence) -> Maybe BasecalledSequence)
-> Maybe (Maybe BasecalledSequence) -> Maybe BasecalledSequence
forall a b. (a -> b) -> a -> b
$ Thresholds -> BasecalledSequence -> Maybe BasecalledSequence
doCutEdge Thresholds
thr
(BasecalledSequence -> Maybe BasecalledSequence)
-> (BasecalledSequence -> BasecalledSequence)
-> BasecalledSequence
-> Maybe BasecalledSequence
forall b c a. (b -> c) -> (a -> b) -> a -> c
. BasecalledSequence -> BasecalledSequence
forall s. IsSequence s => s -> s
S.reverse
(BasecalledSequence -> Maybe BasecalledSequence)
-> Maybe BasecalledSequence -> Maybe (Maybe BasecalledSequence)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Maybe BasecalledSequence
fromLeft
instance Cleanable BasecalledSequenceWithRawData where
cleanWith :: Thresholds
-> BasecalledSequenceWithRawData
-> Maybe BasecalledSequenceWithRawData
cleanWith Thresholds
thr input :: BasecalledSequenceWithRawData
input@BasecalledSequenceWithRawData{Vector Int
Vector Int16
BasecalledSequence
bsPeakLocations :: BasecalledSequenceWithRawData -> Vector Int
bsRawC :: BasecalledSequenceWithRawData -> Vector Int16
bsRawT :: BasecalledSequenceWithRawData -> Vector Int16
bsRawA :: BasecalledSequenceWithRawData -> Vector Int16
bsRawG :: BasecalledSequenceWithRawData -> Vector Int16
bsSequence :: BasecalledSequenceWithRawData -> BasecalledSequence
bsPeakLocations :: Vector Int
bsRawC :: Vector Int16
bsRawT :: Vector Int16
bsRawA :: Vector Int16
bsRawG :: Vector Int16
bsSequence :: BasecalledSequence
..} = do
Int
toDropLeft <- Thresholds -> BasecalledSequence -> Maybe Int
cutEdge Thresholds
thr BasecalledSequence
bsSequence
let leftDroppedSequ :: BasecalledSequence
leftDroppedSequ = Int -> BasecalledSequence -> BasecalledSequence
forall s. ContainsNoMarking s => Int -> s -> s
S.drop Int
toDropLeft BasecalledSequence
bsSequence
let leftDroppedPloc :: Vector Int
leftDroppedPloc = Int -> Vector Int -> Vector Int
forall a. Int -> Vector a -> Vector a
V.drop Int
toDropLeft Vector Int
bsPeakLocations
Int
toDropRight <- Thresholds -> BasecalledSequence -> Maybe Int
cutEdge Thresholds
thr (BasecalledSequence -> Maybe Int)
-> BasecalledSequence -> Maybe Int
forall a b. (a -> b) -> a -> b
$ BasecalledSequence -> BasecalledSequence
forall s. IsSequence s => s -> s
S.reverse BasecalledSequence
leftDroppedSequ
let rightDroppedSequ :: BasecalledSequence
rightDroppedSequ = Int -> BasecalledSequence -> BasecalledSequence
forall s. ContainsNoMarking s => Int -> s -> s
S.take (BasecalledSequence -> Int
forall s. IsSequence s => s -> Int
S.length BasecalledSequence
leftDroppedSequ Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
toDropRight) BasecalledSequence
leftDroppedSequ
let rightDroppedPloc :: Vector Int
rightDroppedPloc = Int -> Vector Int -> Vector Int
forall a. Int -> Vector a -> Vector a
V.take (Vector Int -> Int
forall a. Vector a -> Int
V.length Vector Int
leftDroppedPloc Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
toDropRight) Vector Int
leftDroppedPloc
Bool -> Maybe ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Maybe ()) -> Bool -> Maybe ()
forall a b. (a -> b) -> a -> b
$ Thresholds -> BasecalledSequence -> Bool
checkInner Thresholds
thr BasecalledSequence
rightDroppedSequ
BasecalledSequenceWithRawData
-> Maybe BasecalledSequenceWithRawData
forall (m :: * -> *) a. Monad m => a -> m a
return BasecalledSequenceWithRawData
input { bsSequence :: BasecalledSequence
bsSequence = BasecalledSequence
rightDroppedSequ, bsPeakLocations :: Vector Int
bsPeakLocations = Vector Int
rightDroppedPloc }
checkInner :: Thresholds -> BasecalledSequence -> Bool
checkInner :: Thresholds -> BasecalledSequence -> Bool
checkInner Thresholds{Double
Int
innerThreshold :: Double
edgeThreshold :: Double
frameSize :: Int
innerThreshold :: Thresholds -> Double
edgeThreshold :: Thresholds -> Double
frameSize :: Thresholds -> Int
..} = (Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
> Double
innerThreshold) (Double -> Bool)
-> (BasecalledSequence -> Double) -> BasecalledSequence -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. BasecalledSequence -> Double
forall s. ContainsWeight s => s -> Double
mean
doCutEdge :: Thresholds -> BasecalledSequence -> Maybe BasecalledSequence
doCutEdge :: Thresholds -> BasecalledSequence -> Maybe BasecalledSequence
doCutEdge Thresholds
t BasecalledSequence
sequ = do
Int
toDrop <- Thresholds -> BasecalledSequence -> Maybe Int
cutEdge Thresholds
t BasecalledSequence
sequ
BasecalledSequence -> Maybe BasecalledSequence
forall (m :: * -> *) a. Monad m => a -> m a
return (BasecalledSequence -> Maybe BasecalledSequence)
-> BasecalledSequence -> Maybe BasecalledSequence
forall a b. (a -> b) -> a -> b
$ Int -> BasecalledSequence -> BasecalledSequence
forall s. ContainsNoMarking s => Int -> s -> s
S.drop Int
toDrop BasecalledSequence
sequ
cutEdge :: Thresholds -> BasecalledSequence -> Maybe Int
cutEdge :: Thresholds -> BasecalledSequence -> Maybe Int
cutEdge t :: Thresholds
t@Thresholds{Double
Int
innerThreshold :: Double
edgeThreshold :: Double
frameSize :: Int
innerThreshold :: Thresholds -> Double
edgeThreshold :: Thresholds -> Double
frameSize :: Thresholds -> Int
..} BasecalledSequence
sequ | BasecalledSequence -> Int
forall s. IsSequence s => s -> Int
S.length BasecalledSequence
sequ Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
frameSize = Int -> Maybe Int
forall a. a -> Maybe a
Just Int
0
| Double
meanInR Double -> Double -> Bool
forall a. Ord a => a -> a -> Bool
< Double
edgeThreshold Bool -> Bool -> Bool
&& BasecalledSequence -> Int
forall s. IsSequence s => s -> Int
S.length BasecalledSequence
sequ Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
1 = (Int
1Int -> Int -> Int
forall a. Num a => a -> a -> a
+) (Int -> Int) -> Maybe Int -> Maybe Int
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Thresholds -> BasecalledSequence -> Maybe Int
cutEdge Thresholds
t (BasecalledSequence -> BasecalledSequence
forall s. ContainsNoMarking s => s -> s
S.tail BasecalledSequence
sequ)
| BasecalledSequence -> Int
forall s. IsSequence s => s -> Int
S.length BasecalledSequence
sequ Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
frameSize = Int -> Maybe Int
forall a. a -> Maybe a
Just Int
frameSize
| Bool
otherwise = Maybe Int
forall a. Maybe a
Nothing
where
meanInR :: Double
meanInR = BasecalledSequence -> RangeInclusive -> Double
forall s. ContainsWeight s => s -> RangeInclusive -> Double
meanInRange BasecalledSequence
sequ (Int
0, Int
frameSize Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1)