module Bio.Sequence.SFF_filters where
import Bio.Sequence.SFF (ReadBlock(..), ReadHeader(..), flowToBasePos, flowgram)
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Lazy.Char8 as BL
import Data.List (tails)
type DiscardFilter = ReadBlock -> Bool
filter_dots, filter_mixed, filter_key, filter_empty :: DiscardFilter
filter_empty rb = num_bases (read_header rb) >= 5
filter_key rb = ("TCAG"==) $ take 4 $ BL.unpack $ bases rb
filter_dots rb = let dots = BL.length $ BL.filter (=='N') $ bases rb
in fromIntegral dots / fromIntegral (BL.length $ bases rb) < (0.05 :: Double)
filter_mixed rb = let fs = dropWhile (<50) . reverse . flowgram $ rb
fl = dlength fs
in and [ (dlength (filter (>50) fs) / fl) > 0.7
, (dlength (filter (<45) fs) / fl) > 0.3
, (dlength (filter (>75) fs) / fl) > 0.3
, (dlength (filter (\f -> f<=75 && f>=45) fs) / fl) < 0.2
]
filter_length :: Int -> DiscardFilter
filter_length n rb = length (flowgram rb) >= n
type TrimFilter = ReadBlock -> ReadBlock
filter_sigint, filter_qual20 :: TrimFilter
filter_sigint rb = clipFlows rb (sigint rb)
sigint :: ReadBlock -> Int
sigint rb = let bs = scanl (\(n,m,_) f -> if f >= 50 && f <= 70 then (n+1,m+1,f) else (n,m+1,f)) (0,0,0) $ flowgram rb
in length $ reverse
$ dropWhile (\(_,_,f) -> f<=70)
$ dropWhile (\(n,m,_)->n `div` (m*1000) > (30::Int)) $ reverse bs
filter_qual20 rs = clipSeq rs $ qual20 rs
qual20 :: ReadBlock -> Int
qual20 rs = (fromIntegral $ num_bases $ read_header rs)
(length . takeWhile (<20) . map (avg . take 10) . tails . reverse . B.unpack $ quality rs)
dlength :: [a] -> Double
dlength = fromIntegral . length
avg :: Integral a => [a] -> Double
avg xs = fromIntegral (sum xs) / dlength xs
clipFlows :: ReadBlock -> Int -> ReadBlock
clipFlows rb n = clipSeq rb (flowToBasePos rb n)
clipSeq :: ReadBlock -> Int -> ReadBlock
clipSeq rb n' = let n = fromIntegral n'
rh = read_header rb
in if clip_qual_right rh <= n then rb else rb { read_header = rh {clip_qual_right = n }}