{-# LANGUAGE CPP #-}
-- | Common data types used everywhere. This module is a collection of
-- very basic "bioinformatics" data types that are simple, but don't
-- make sense to define over and over.
module Bio.Base(
Nucleotide(..), Nucleotides(..),
Qual(..), toQual, fromQual, fromQualRaised, probToQual,
Prob'(..), Prob, toProb, fromProb, qualToProb, pow,
Pair(..),
Word8,
nucA, nucC, nucG, nucT,
nucsA, nucsC, nucsG, nucsT, nucsN, gap,
toNucleotide, toNucleotides, nucToNucs,
showNucleotide, showNucleotides,
isGap,
isBase,
isProperBase,
properBases,
compl, compls,
Position(..),
shiftPosition,
p_is_reverse,
Range(..),
shiftRange,
reverseRange,
extendRange,
insideRange,
wrapRange
) where
import BasePrelude
#if MIN_VERSION_base(4,9,0)
hiding ( log1pexp, log1mexp )
#endif
import Bio.Util.Numeric ( log1pexp, log1mexp )
import qualified Data.ByteString.Char8 as C
import qualified Data.Vector.Unboxed as U
-- | A nucleotide base. We only represent A,C,G,T. The contained
-- 'Word8' ist guaranteed to be 0..3.
newtype Nucleotide = N { unN :: Word8 } deriving ( Eq, Ord, Enum, Ix, Storable )
instance Bounded Nucleotide where
minBound = N 0
maxBound = N 3
-- | A nucleotide base in an alignment.
-- Experience says we're dealing with Ns and gaps all the type, so
-- purity be damned, they are included as if they were real bases.
--
-- To allow @Nucleotides@s to be unpacked and incorporated into
-- containers, we choose to represent them the same way as the BAM file
-- format: as a 4 bit wide field. Gaps are encoded as 0 where they
-- make sense, N is 15. The contained 'Word8' is guaranteed to be
-- 0..15.
newtype Nucleotides = Ns { unNs :: Word8 } deriving ( Eq, Ord, Enum, Ix, Storable )
instance Bounded Nucleotides where
minBound = Ns 0
maxBound = Ns 15
nucToNucs :: Nucleotide -> Nucleotides
nucToNucs (N x) = Ns $ 1 `shiftL` fromIntegral (x .&. 3)
-- | Qualities are stored in deciban, also known as the Phred scale. To
-- represent a value @p@, we store @-10 * log_10 p@. Operations work
-- directly on the \"Phred\" value, as the name suggests. The same goes
-- for the 'Ord' instance: greater quality means higher \"Phred\"
-- score, meand lower error probability.
newtype Qual = Q { unQ :: Word8 } deriving ( Eq, Ord, Storable, Bounded )
instance Show Qual where
showsPrec p (Q q) = (:) 'q' . showsPrec p q
toQual :: (Floating a, RealFrac a) => a -> Qual
toQual a = Q $ round (-10 * log a / log 10)
fromQual :: Qual -> Double
fromQual (Q q) = 10 ** (- fromIntegral q / 10)
fromQualRaised :: Double -> Qual -> Double
fromQualRaised k (Q q) = 10 ** (- k * fromIntegral q / 10)
-- | A positive floating point value stored in log domain. We store the
-- natural logarithm (makes computation easier), but allow conversions
-- to the familiar \"Phred\" scale used for 'Qual' values.
newtype Prob' a = Pr { unPr :: a } deriving ( Eq, Ord, Storable )
-- | Common way of using 'Prob''.
type Prob = Prob' Double
instance RealFloat a => Show (Prob' a) where
showsPrec _ (Pr p) = (:) 'q' . showFFloat (Just 1) q
where q = - 10 * p / log 10
instance (Floating a, Ord a) => Num (Prob' a) where
{-# INLINE fromInteger #-}
fromInteger a = Pr (log (fromInteger a))
{-# INLINE (+) #-}
Pr x + Pr y = Pr $ if x >= y then x + log1pexp (y-x) else y + log1pexp (x-y)
{-# INLINE (-) #-}
Pr x - Pr y = Pr $ if x >= y then x + log1mexp (y-x) else error "no negative error probabilities"
{-# INLINE (*) #-}
Pr a * Pr b = Pr $ a + b
{-# INLINE negate #-}
negate _ = Pr $ error "no negative error probabilities"
{-# INLINE abs #-}
abs x = x
{-# INLINE signum #-}
signum _ = Pr 0
instance (Floating a, Fractional a, Ord a) => Fractional (Prob' a) where
fromRational a = Pr (log (fromRational a))
Pr a / Pr b = Pr (a - b)
recip (Pr a) = Pr (negate a)
infixr 8 `pow`
pow :: Num a => Prob' a -> a -> Prob' a
pow (Pr a) e = Pr $ a * e
toProb :: Floating a => a -> Prob' a
toProb p = Pr (log p)
fromProb :: Floating a => Prob' a -> a
fromProb (Pr q) = exp q
qualToProb :: Floating a => Qual -> Prob' a
qualToProb (Q q) = Pr (- log 10 * fromIntegral q / 10)
probToQual :: (Floating a, RealFrac a) => Prob' a -> Qual
probToQual (Pr p) = Q (round (- 10 * p / log 10))
nucA, nucC, nucG, nucT :: Nucleotide
nucA = N 0
nucC = N 1
nucG = N 2
nucT = N 3
gap, nucsA, nucsC, nucsG, nucsT, nucsN :: Nucleotides
gap = Ns 0
nucsA = Ns 1
nucsC = Ns 2
nucsG = Ns 4
nucsT = Ns 8
nucsN = Ns 15
-- | Coordinates in a genome.
-- The position is zero-based, no questions about it. Think of the
-- position as pointing to the crack between two bases: looking forward
-- you see the next base to the right, looking in the reverse direction
-- you see the complement of the first base to the left.
--
-- To encode the strand, we (virtually) reverse-complement any sequence
-- and prepend it to the normal one. That way, reversed coordinates
-- have a negative sign and automatically make sense. Position 0 could
-- either be the beginning of the sequence or the end on the reverse
-- strand... that ambiguity shouldn't really matter.
data Position = Pos {
p_seq :: {-# UNPACK #-} !C.ByteString, -- ^ sequence (e.g. some chromosome)
p_start :: {-# UNPACK #-} !Int -- ^ offset, zero-based
} deriving (Show, Eq, Ord)
p_is_reverse :: Position -> Bool
p_is_reverse = (< 0) . p_start
-- | Ranges in genomes
-- We combine a position with a length. In 'Range pos len', 'pos' is
-- always the start of a stretch of length 'len'. Positions therefore
-- move in the opposite direction on the reverse strand. To get the
-- same stretch on the reverse strand, shift r_pos by r_length, then
-- reverse direction (or call reverseRange).
data Range = Range {
r_pos :: {-# UNPACK #-} !Position,
r_length :: {-# UNPACK #-} !Int
} deriving (Show, Eq, Ord)
-- | Converts a character into a 'Nucleotides'.
-- The usual codes for A,C,G,T and U are understood, '-' and '.' become
-- gaps and everything else is an N.
toNucleotide :: Char -> Nucleotide
toNucleotide c = if ord c < 128 then N (ar `U.unsafeIndex` ord c) else N 0
where
ar = U.replicate 128 0 U.//
( [ (ord x, n) | (x, N n) <- pairs ] ++
[ (ord (toUpper x), n) | (x, N n) <- pairs ] )
pairs = [ ('a', nucA), ('c', nucC), ('g', nucG), ('t', nucT) ]
-- | Converts a character into a 'Nucleotides'.
-- The usual codes for A,C,G,T and U are understood, '-' and '.' become
-- gaps and everything else is an N.
toNucleotides :: Char -> Nucleotides
toNucleotides c = if ord c < 128 then Ns (ar `U.unsafeIndex` ord c) else nucsN
where
ar = U.replicate 128 (unNs nucsN) U.//
( [ (ord x, n) | (x, Ns n) <- pairs ] ++
[ (ord (toUpper x), n) | (x, Ns n) <- pairs ] )
Ns a `plus` Ns b = Ns (a .|. b)
pairs = [ ('a', nucsA), ('c', nucsC), ('g', nucsG), ('t', nucsT),
('u', nucsT), ('-', gap), ('.', gap),
('b', nucsC `plus` nucsG `plus` nucsT),
('d', nucsA `plus` nucsG `plus` nucsT),
('h', nucsA `plus` nucsC `plus` nucsT),
('v', nucsA `plus` nucsC `plus` nucsG),
('k', nucsG `plus` nucsT),
('m', nucsA `plus` nucsC),
('s', nucsC `plus` nucsG),
('w', nucsA `plus` nucsT),
('r', nucsA `plus` nucsG),
('y', nucsC `plus` nucsT) ]
-- | Tests if a 'Nucleotides' is a base.
-- Returns 'True' for everything but gaps.
isBase :: Nucleotides -> Bool
isBase (Ns n) = n /= 0
-- | Tests if a 'Nucleotides' is a proper base.
-- Returns 'True' for A,C,G,T only.
isProperBase :: Nucleotides -> Bool
isProperBase x = x == nucsA || x == nucsC || x == nucsG || x == nucsT
properBases :: [ Nucleotides ]
properBases = [ nucsA, nucsC, nucsG, nucsT ]
-- | Tests if a 'Nucleotides' is a gap.
-- Returns true only for the gap.
isGap :: Nucleotides -> Bool
isGap x = x == gap
{-# INLINE showNucleotide #-}
showNucleotide :: Nucleotide -> Char
showNucleotide (N x) = C.index "ACGT" $ fromIntegral $ x .&. 3
{-# INLINE showNucleotides #-}
showNucleotides :: Nucleotides -> Char
showNucleotides (Ns x) = C.index "-ACMGRSVTWYHKDBN" $ fromIntegral $ x .&. 15
instance Show Nucleotide where
show x = [ showNucleotide x ]
showList l = (map showNucleotide l ++)
instance Read Nucleotide where
readsPrec _ ('a':cs) = [(nucA, cs)]
readsPrec _ ('A':cs) = [(nucA, cs)]
readsPrec _ ('c':cs) = [(nucC, cs)]
readsPrec _ ('C':cs) = [(nucC, cs)]
readsPrec _ ('g':cs) = [(nucG, cs)]
readsPrec _ ('G':cs) = [(nucG, cs)]
readsPrec _ ('t':cs) = [(nucT, cs)]
readsPrec _ ('T':cs) = [(nucT, cs)]
readsPrec _ ('u':cs) = [(nucT, cs)]
readsPrec _ ('U':cs) = [(nucT, cs)]
readsPrec _ _ = [ ]
readList ('-':cs) = readList cs
readList (c:cs) | isSpace c = readList cs
| otherwise = case reads (c:cs) of
[] -> [ ([],c:cs) ]
xs -> [ (n:ns,r2) | (n,r1) <- xs, (ns,r2) <- readList r1 ]
readList [] = [([],[])]
instance Show Nucleotides where
show x = [ showNucleotides x ]
showList l = (map showNucleotides l ++)
instance Read Nucleotides where
readsPrec _ (c:cs) = [(toNucleotides c, cs)]
readsPrec _ [ ] = []
readList s = let (hd,tl) = span (\c -> isAlpha c || isSpace c || '-' == c) s
in [(map toNucleotides $ filter (not . isSpace) hd, tl)]
-- | Complements a Nucleotides.
{-# INLINE compl #-}
compl :: Nucleotide -> Nucleotide
compl (N n) = N $ n `xor` 3
-- | Complements a Nucleotides.
{-# INLINE compls #-}
compls :: Nucleotides -> Nucleotides
compls (Ns x) = Ns $ ar `U.unsafeIndex` fromIntegral (x .&. 15)
where
!ar = U.fromListN 16 [ 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 ]
-- | Moves a @Position@. The position is moved forward according to the
-- strand, negative indexes move backward accordingly.
shiftPosition :: Int -> Position -> Position
shiftPosition a p = p { p_start = p_start p + a }
-- | Moves a @Range@. This is just @shiftPosition@ lifted.
shiftRange :: Int -> Range -> Range
shiftRange a r = r { r_pos = shiftPosition a (r_pos r) }
-- | Reverses a 'Range' to give the same @Range@ on the opposite strand.
reverseRange :: Range -> Range
reverseRange (Range (Pos sq pos) len) = Range (Pos sq (-pos-len)) len
-- | Extends a range. The length of the range is simply increased.
extendRange :: Int -> Range -> Range
extendRange a r = r { r_length = r_length r + a }
-- | Expands a subrange.
-- @(range1 `insideRange` range2)@ interprets @range1@ as a subrange of
-- @range2@ and computes its absolute coordinates. The sequence name of
-- @range1@ is ignored.
insideRange :: Range -> Range -> Range
insideRange r1@(Range (Pos _ start1) length1) r2@(Range (Pos sq start2) length2)
| start2 < 0 = reverseRange (insideRange r1 (reverseRange r2))
| start1 <= length2 = Range (Pos sq (start2 + start1)) (min length1 (length2 - start1))
| otherwise = Range (Pos sq (start2 + length2)) 0
-- | Wraps a range to a region. This simply normalizes the start
-- position to be in the interval '[0,n)', which only makes sense if the
-- @Range@ is to be mapped onto a circular genome. This works on both
-- strands and the strand information is retained.
wrapRange :: Int -> Range -> Range
wrapRange n (Range (Pos sq s) l) = Range (Pos sq (s `mod` n)) l
-- | A strict pair.
data Pair a b = !a :!: !b deriving(Eq, Ord, Show, Read, Bounded, Ix)
infixl 2 :!: