{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Bio.Seq
(
Basic
, IUPAC
, Ext
, DNA
, RNA
, Peptide
, BioSeq'(..)
, BioSeq(..)
, rc
, gcContent
, nucleotideFreq
) where
import qualified Data.ByteString.Char8 as B
import Data.Char8 (toUpper)
import qualified Data.HashMap.Strict as M
import qualified Data.HashSet as S
import Data.Proxy (Proxy (..))
import Prelude hiding (length)
data Basic
data IUPAC
data Ext
newtype DNA alphabet = DNA B.ByteString
newtype RNA alphabet = RNA B.ByteString
newtype Peptide alphabet = Peptide B.ByteString
instance Show (DNA a) where
show (DNA s) = B.unpack s
instance Semigroup (DNA a) where
(<>) (DNA x) (DNA y) = DNA (x <> y)
instance Monoid (DNA a) where
mempty = DNA B.empty
mconcat dnas = DNA . B.concat . map toBS $ dnas
class BioSeq' s where
toBS :: s a -> B.ByteString
unsafeFromBS :: B.ByteString -> s a
slice :: Int -> Int -> s a -> s a
length :: s a -> Int
length = B.length . toBS
{-# MINIMAL toBS, slice, unsafeFromBS #-}
instance BioSeq' DNA where
toBS (DNA s) = s
unsafeFromBS = DNA
slice i l (DNA s) = DNA . B.take l . B.drop i $ s
instance BioSeq' RNA where
toBS (RNA s) = s
unsafeFromBS = RNA
slice i l (RNA s) = RNA . B.take l . B.drop i $ s
instance BioSeq' Peptide where
toBS (Peptide s) = s
unsafeFromBS = Peptide
slice i l (Peptide s) = Peptide . B.take l . B.drop i $ s
class BioSeq' seq => BioSeq seq alphabet where
alphabet :: Proxy (seq alphabet) -> S.HashSet Char
fromBS :: B.ByteString -> Either String (seq alphabet)
fromBS input = case B.mapAccumL fun Nothing input of
(Nothing, r) -> Right $ unsafeFromBS r
(Just e, _) -> Left $ "Bio.Seq.fromBS: unknown character: " ++ [e]
where
fun (Just e) x = (Just e, x)
fun Nothing x = let x' = toUpper x
in if x' `S.member` alphabet (Proxy :: Proxy (seq alphabet))
then (Nothing, x')
else (Just x', x')
{-# MINIMAL alphabet #-}
instance BioSeq DNA Basic where
alphabet _ = S.fromList "ACGT"
instance BioSeq DNA IUPAC where
alphabet _ = S.fromList "ACGTNVHDBMKWSYR"
instance BioSeq DNA Ext where
alphabet _ = undefined
instance BioSeq RNA Basic where
alphabet _ = S.fromList "ACGU"
rc :: DNA alphabet -> DNA alphabet
rc (DNA s) = DNA . B.map f . B.reverse $ s
where
f x = case x of
'A' -> 'T'
'C' -> 'G'
'G' -> 'C'
'T' -> 'A'
_ -> x
gcContent :: DNA alphabet -> Double
gcContent = (\(a,b) -> a / fromIntegral b) . B.foldl' f (0.0,0::Int) . toBS
where
f (!x,!n) c =
let x' = case c of
'A' -> x
'C' -> x + 1
'G' -> x + 1
'T' -> x
'H' -> x + 0.25
'D' -> x + 0.25
'V' -> x + 0.75
'B' -> x + 0.75
'S' -> x + 1
'W' -> x
_ -> x + 0.5
in (x', n+1)
nucleotideFreq :: forall a . BioSeq DNA a => DNA a -> M.HashMap Char Int
nucleotideFreq dna = B.foldl' f m0 . toBS $ dna
where
m0 = M.fromList . zip (S.toList $ alphabet (Proxy :: Proxy (DNA a))) . repeat $ 0
f m x = M.adjust (+1) x m
{-# INLINE nucleotideFreq #-}