module Bio.Sequence.SFF_filters where
import Bio.Sequence.SFF (ReadBlock(..), ReadHeader(..)
, flowToBasePos, flowgram, cumulative_index)
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString as SB
import qualified Data.ByteString.Lazy.Char8 as BL
import Data.List (tails)
import Data.Char (toUpper)
type DiscardFilter = ReadBlock -> Bool
discard_empty :: DiscardFilter
discard_empty rb = num_bases (read_header rb) >= 5
discard_key :: String -> DiscardFilter
discard_key key rb = (map toUpper key==) $ take (length key) $ BL.unpack $ bases rb
discard_dots :: Double -> DiscardFilter
discard_dots p rb = let dotcount = SB.length $ SB.filter (>3) $ flow_index rb
in fromIntegral dotcount / fromIntegral (BL.length $ bases rb) < p
&& last (cumulative_index rb) >= 84
discard_mixed :: DiscardFilter
discard_mixed rb = let fs = dropWhile (<50) . reverse . flowgram $ rb
fl = dlength fs
in and
[ (dlength (filter (>50) fs) / fl) < 0.7
, (dlength (filter (<45) fs) / fl) > 0.3
, (dlength (filter (>75) fs) / fl) > 0.3
, (dlength (filter (\f -> f<=75 && f>=45) fs) / fl) < 0.2
]
discard_length :: Int -> DiscardFilter
discard_length n rb = length (flowgram rb) >= n
type TrimFilter = ReadBlock -> ReadBlock
trim_sigint :: TrimFilter
trim_sigint rb = clipSeq rb (sigint rb)
sigint :: ReadBlock -> Int
sigint rb = let bs = drop 1 $ scanl (\(n,m,_) f -> if f >= 50 && f <= 70 then (n+1,m+1,f) else (n,m+1,f)) (0,0,0) $ flowgram rb
xs = dropWhile (\(_,_,f) -> f<=70)
$ dropWhile (\(n,m,_)->(1000*n) `div` m > (30::Int))
$ reverse bs
in case xs of [] -> error "no sequence left?"
((_,m,_):_) -> flowToBasePos rb m
trim_primer :: String -> TrimFilter
trim_primer s rb = clipSeq rb (find_primer s rb)
find_primer :: String -> ReadBlock -> Int
find_primer s rb = go (num_bases (read_header rb) 10)
where go i | i <= 5 = fromIntegral (num_bases $ read_header rb)
| match i = fromIntegral i
| otherwise = go (i1)
match j = s' `B.isPrefixOf` B.drop (fromIntegral j) (bases rb)
s' = BL.pack $ map toUpper $ take 14 s
trim_qual20 :: Int -> TrimFilter
trim_qual20 w rs = clipSeq rs $ qual20 w rs
qual20 :: Int -> ReadBlock -> Int
qual20 w rs = (fromIntegral $ num_bases $ read_header rs)
(length . takeWhile (<20) . map (avg . take w) . tails . reverse . B.unpack $ quality rs)
dlength :: [a] -> Double
dlength = fromIntegral . length
avg :: Integral a => [a] -> Double
avg xs = sum (map fromIntegral xs) / dlength xs
clipFlows :: ReadBlock -> Int -> ReadBlock
clipFlows rb n = clipSeq rb (flowToBasePos rb n)
clipSeq :: ReadBlock -> Int -> ReadBlock
clipSeq rb n' = let n = fromIntegral n'
rh = read_header rb
in if clip_qual_right rh <= n then rb else rb { read_header = rh {clip_qual_right = n }}
flx_linker = "GTTGGAACCGAAAGGGTTTGAATTCAAACCCTTTCGGTTCCAAC"
ti_linker = "TCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACG"
rna_adapter = "ggcgggcgatgtctcgtctgagcgggctggcaaggc"
rna_adapter2 = "ttcgcagtgagtgacaggctagtagctgagcgggctggcaaggc"
rna_adapter3 = "gacggggcggatgtctcgtctgagcgggcgtggcaaggc"
rapid_adapter = "agtcgtggaggcaaggcacacagggatagg"
ti_adapter_b = "ctgagactgccaaggcacacagggggatagg"