{-# LANGUAGE CPP #-}
module Bio.Base(
Nucleotide(..), Nucleotides(..),
Qual(..), toQual, fromQual, fromQualRaised, probToQual,
Prob'(..), Prob, toProb, fromProb, qualToProb, pow,
Pair(..),
Word8,
nucA, nucC, nucG, nucT,
nucsA, nucsC, nucsG, nucsT, nucsN, gap,
toNucleotide, toNucleotides, nucToNucs,
showNucleotide, showNucleotides,
isGap,
isBase,
isProperBase,
properBases,
compl, compls,
Position(..),
shiftPosition,
p_is_reverse,
Range(..),
shiftRange,
reverseRange,
extendRange,
insideRange,
wrapRange
) where
import BasePrelude
#if MIN_VERSION_base(4,9,0)
hiding ( log1pexp, log1mexp, (<>) )
#else
hiding ( (<>) )
#endif
import Bio.Util.Numeric ( log1pexp, log1mexp )
import Data.ByteString.Internal ( c2w )
import Data.Semigroup ( Semigroup(..) )
import qualified Data.ByteString.Char8 as C
import qualified Data.Vector.Unboxed as U
newtype Nucleotide = N { unN :: Word8 } deriving ( Eq, Ord, Enum, Ix, Storable )
instance Bounded Nucleotide where
minBound = N 0
maxBound = N 3
newtype Nucleotides = Ns { unNs :: Word8 } deriving ( Eq, Ord, Enum, Ix, Storable )
instance Bounded Nucleotides where
minBound = Ns 0
maxBound = Ns 15
instance Semigroup Nucleotides where Ns a <> Ns b = Ns (a .|. b)
instance Monoid Nucleotides where mempty = Ns 0 ; mappend = (<>)
nucToNucs :: Nucleotide -> Nucleotides
nucToNucs (N x) = Ns $ 1 `shiftL` fromIntegral (x .&. 3)
newtype Qual = Q { unQ :: Word8 } deriving ( Eq, Ord, Storable, Bounded )
instance Show Qual where
showsPrec p (Q q) = (:) 'q' . showsPrec p q
toQual :: (Floating a, RealFrac a) => a -> Qual
toQual a = Q $ round (-10 * log a / log 10)
fromQual :: Qual -> Double
fromQual (Q q) = 10 ** (- fromIntegral q / 10)
fromQualRaised :: Double -> Qual -> Double
fromQualRaised k (Q q) = 10 ** (- k * fromIntegral q / 10)
newtype Prob' a = Pr { unPr :: a } deriving ( Eq, Ord, Storable )
type Prob = Prob' Double
instance RealFloat a => Show (Prob' a) where
showsPrec _ (Pr p) = (:) 'q' . showFFloat (Just 1) q
where q = - 10 * p / log 10
instance (Floating a, Ord a) => Num (Prob' a) where
{-# INLINE fromInteger #-}
fromInteger a = Pr (log (fromInteger a))
{-# INLINE (+) #-}
Pr x + Pr y = Pr $ if x >= y then x + log1pexp (y-x) else y + log1pexp (x-y)
{-# INLINE (-) #-}
Pr x - Pr y = Pr $ if x >= y then x + log1mexp (y-x) else error "no negative error probabilities"
{-# INLINE (*) #-}
Pr a * Pr b = Pr $ a + b
{-# INLINE negate #-}
negate _ = Pr $ error "no negative error probabilities"
{-# INLINE abs #-}
abs x = x
{-# INLINE signum #-}
signum _ = Pr 0
instance (Floating a, Fractional a, Ord a) => Fractional (Prob' a) where
fromRational a = Pr (log (fromRational a))
Pr a / Pr b = Pr (a - b)
recip (Pr a) = Pr (negate a)
infixr 8 `pow`
pow :: Num a => Prob' a -> a -> Prob' a
pow (Pr a) e = Pr $ a * e
toProb :: Floating a => a -> Prob' a
toProb p = Pr (log p)
fromProb :: Floating a => Prob' a -> a
fromProb (Pr q) = exp q
qualToProb :: Floating a => Qual -> Prob' a
qualToProb (Q q) = Pr (- log 10 * fromIntegral q / 10)
probToQual :: (Floating a, RealFrac a) => Prob' a -> Qual
probToQual (Pr p) = Q (round (- 10 * p / log 10))
nucA, nucC, nucG, nucT :: Nucleotide
nucA = N 0
nucC = N 1
nucG = N 2
nucT = N 3
gap, nucsA, nucsC, nucsG, nucsT, nucsN :: Nucleotides
gap = Ns 0
nucsA = Ns 1
nucsC = Ns 2
nucsG = Ns 4
nucsT = Ns 8
nucsN = Ns 15
data Position = Pos {
p_seq :: {-# UNPACK #-} !C.ByteString,
p_start :: {-# UNPACK #-} !Int
} deriving (Show, Eq, Ord)
p_is_reverse :: Position -> Bool
p_is_reverse = (< 0) . p_start
data Range = Range {
r_pos :: {-# UNPACK #-} !Position,
r_length :: {-# UNPACK #-} !Int
} deriving (Show, Eq, Ord)
toNucleotide :: Word8 -> Nucleotide
toNucleotide x | x .|. 32 == c2w 'a' = nucA
| x .|. 32 == c2w 'c' = nucC
| x .|. 32 == c2w 'g' = nucG
| otherwise = nucT
{-# INLINABLE toNucleotide #-}
toNucleotides :: Word8 -> Nucleotides
toNucleotides x | x .|. 32 == c2w 'a' = nucsA
| x .|. 32 == c2w 'c' = nucsC
| x .|. 32 == c2w 'g' = nucsG
| x .|. 32 == c2w 't' = nucsT
| x .|. 32 == c2w 'n' = nucsN
| x .|. 32 == c2w 'u' = nucsT
| x .|. 32 == c2w 'b' = nucsC <> nucsG <> nucsT
| x .|. 32 == c2w 'd' = nucsA <> nucsG <> nucsT
| x .|. 32 == c2w 'h' = nucsA <> nucsC <> nucsT
| x .|. 32 == c2w 'v' = nucsA <> nucsC <> nucsG
| x .|. 32 == c2w 'w' = nucsA <> nucsT
| x .|. 32 == c2w 's' = nucsC <> nucsG
| x .|. 32 == c2w 'm' = nucsA <> nucsC
| x .|. 32 == c2w 'k' = nucsG <> nucsT
| x .|. 32 == c2w 'y' = nucsC <> nucsT
| x .|. 32 == c2w 'r' = nucsA <> nucsG
| otherwise = gap
{-# INLINABLE toNucleotides #-}
isBase :: Nucleotides -> Bool
isBase (Ns n) = n /= 0
isProperBase :: Nucleotides -> Bool
isProperBase x = x == nucsA || x == nucsC || x == nucsG || x == nucsT
properBases :: [ Nucleotides ]
properBases = [ nucsA, nucsC, nucsG, nucsT ]
isGap :: Nucleotides -> Bool
isGap x = x == gap
{-# INLINE showNucleotide #-}
showNucleotide :: Nucleotide -> Char
showNucleotide (N x) = C.index "ACGT" $ fromIntegral $ x .&. 3
{-# INLINE showNucleotides #-}
showNucleotides :: Nucleotides -> Char
showNucleotides (Ns x) = C.index "-ACMGRSVTWYHKDBN" $ fromIntegral $ x .&. 15
instance Show Nucleotide where
show x = [ showNucleotide x ]
showList l = (map showNucleotide l ++)
instance Read Nucleotide where
readsPrec _ ('a':cs) = [(nucA, cs)]
readsPrec _ ('A':cs) = [(nucA, cs)]
readsPrec _ ('c':cs) = [(nucC, cs)]
readsPrec _ ('C':cs) = [(nucC, cs)]
readsPrec _ ('g':cs) = [(nucG, cs)]
readsPrec _ ('G':cs) = [(nucG, cs)]
readsPrec _ ('t':cs) = [(nucT, cs)]
readsPrec _ ('T':cs) = [(nucT, cs)]
readsPrec _ ('u':cs) = [(nucT, cs)]
readsPrec _ ('U':cs) = [(nucT, cs)]
readsPrec _ _ = [ ]
readList ('-':cs) = readList cs
readList (c:cs) | isSpace c = readList cs
| otherwise = case reads (c:cs) of
[] -> [ ([],c:cs) ]
xs -> [ (n:ns,r2) | (n,r1) <- xs, (ns,r2) <- readList r1 ]
readList [] = [([],[])]
instance Show Nucleotides where
show x = [ showNucleotides x ]
showList l = (map showNucleotides l ++)
instance Read Nucleotides where
readsPrec _ (c:cs) = [(toNucleotides (c2w c), cs)]
readsPrec _ [ ] = []
readList s = let (hd,tl) = span (\c -> isAlpha c || isSpace c || '-' == c) s
in [(map (toNucleotides . c2w) $ filter (not . isSpace) hd, tl)]
{-# INLINE compl #-}
compl :: Nucleotide -> Nucleotide
compl (N n) = N $ n `xor` 3
{-# INLINE compls #-}
compls :: Nucleotides -> Nucleotides
compls (Ns x) = Ns $ ar `U.unsafeIndex` fromIntegral (x .&. 15)
where
!ar = U.fromListN 16 [ 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 ]
shiftPosition :: Int -> Position -> Position
shiftPosition a p = p { p_start = p_start p + a }
shiftRange :: Int -> Range -> Range
shiftRange a r = r { r_pos = shiftPosition a (r_pos r) }
reverseRange :: Range -> Range
reverseRange (Range (Pos sq pos) len) = Range (Pos sq (-pos-len)) len
extendRange :: Int -> Range -> Range
extendRange a r = r { r_length = r_length r + a }
insideRange :: Range -> Range -> Range
insideRange r1@(Range (Pos _ start1) length1) r2@(Range (Pos sq start2) length2)
| start2 < 0 = reverseRange (insideRange r1 (reverseRange r2))
| start1 <= length2 = Range (Pos sq (start2 + start1)) (min length1 (length2 - start1))
| otherwise = Range (Pos sq (start2 + length2)) 0
wrapRange :: Int -> Range -> Range
wrapRange n (Range (Pos sq s) l) = Range (Pos sq (s `mod` n)) l
data Pair a b = !a :!: !b deriving(Eq, Ord, Show, Read, Bounded, Ix)
infixl 2 :!: