{-# LANGUAGE CPP #-}
-- | Common data types used everywhere.  This module is a collection of
-- very basic "bioinformatics" data types that are simple, but don't
-- make sense to define over and over.

module Bio.Base(
    Nucleotide(..), Nucleotides(..),
    Qual(..), toQual, fromQual, fromQualRaised, probToQual,
    Prob'(..), Prob, toProb, fromProb, qualToProb, pow,

    Pair(..),
    Word8,
    nucA, nucC, nucG, nucT,
    nucsA, nucsC, nucsG, nucsT, nucsN, gap,
    toNucleotide, toNucleotides, nucToNucs,
    showNucleotide, showNucleotides,
    isGap,
    isBase,
    isProperBase,
    properBases,
    compl, compls,

    Position(..),
    shiftPosition,
    p_is_reverse,

    Range(..),
    shiftRange,
    reverseRange,
    extendRange,
    insideRange,
    wrapRange
) where

import BasePrelude
#if MIN_VERSION_base(4,9,0)
                             hiding ( log1pexp, log1mexp, (<>) )
#else
                             hiding ( (<>) )
#endif
import Bio.Util.Numeric             ( log1pexp, log1mexp )

import Data.ByteString.Internal     ( c2w )
import Data.Semigroup               ( Semigroup(..) )
import qualified Data.ByteString.Char8 as C
import qualified Data.Vector.Unboxed   as U

-- | A nucleotide base.  We only represent A,C,G,T.  The contained
-- 'Word8' ist guaranteed to be 0..3.
newtype Nucleotide = N { unN :: Word8 } deriving ( Eq, Ord, Enum, Ix, Storable )

instance Bounded Nucleotide where
    minBound = N 0
    maxBound = N 3

-- | A nucleotide base in an alignment.
-- Experience says we're dealing with Ns and gaps all the type, so
-- purity be damned, they are included as if they were real bases.
--
-- To allow @Nucleotides@s to be unpacked and incorporated into
-- containers, we choose to represent them the same way as the BAM file
-- format:  as a 4 bit wide field.  Gaps are encoded as 0 where they
-- make sense, N is 15.  The contained 'Word8' is guaranteed to be
-- 0..15.

newtype Nucleotides = Ns { unNs :: Word8 } deriving ( Eq, Ord, Enum, Ix, Storable )

instance Bounded Nucleotides where
    minBound = Ns  0
    maxBound = Ns 15

instance Semigroup Nucleotides where Ns a <> Ns b = Ns (a .|. b)
instance Monoid Nucleotides where mempty = Ns 0 ; mappend = (<>)

nucToNucs :: Nucleotide -> Nucleotides
nucToNucs (N x) = Ns $ 1 `shiftL` fromIntegral (x .&. 3)

-- | Qualities are stored in deciban, also known as the Phred scale.  To
-- represent a value @p@, we store @-10 * log_10 p@.  Operations work
-- directly on the \"Phred\" value, as the name suggests.  The same goes
-- for the 'Ord' instance:  greater quality means higher \"Phred\"
-- score, which means lower error probability.

newtype Qual = Q { unQ :: Word8 } deriving ( Eq, Ord, Storable, Bounded )

instance Show Qual where
    showsPrec p (Q q) = (:) 'q' . showsPrec p q

toQual :: (Floating a, RealFrac a) => a -> Qual
toQual a = Q $ round (-10 * log a / log 10)

fromQual :: Qual -> Double
fromQual (Q q) = 10 ** (- fromIntegral q / 10)

fromQualRaised :: Double -> Qual -> Double
fromQualRaised k (Q q) = 10 ** (- k * fromIntegral q / 10)

-- | A positive floating point value stored in log domain.  We store the
-- natural logarithm (makes computation easier), but allow conversions
-- to the familiar \"Phred\" scale used for 'Qual' values.
newtype Prob' a = Pr { unPr :: a } deriving ( Eq, Ord, Storable )

-- | Common way of using 'Prob''.
type Prob = Prob' Double

instance RealFloat a => Show (Prob' a) where
    showsPrec _ (Pr p) = (:) 'q' . showFFloat (Just 1) q
      where q = - 10 * p / log 10

instance (Floating a, Ord a) => Num (Prob' a) where
    {-# INLINE fromInteger #-}
    fromInteger a = Pr (log (fromInteger a))
    {-# INLINE (+) #-}
    Pr x + Pr y = Pr $ if x >= y then x + log1pexp (y-x) else y + log1pexp (x-y)
    {-# INLINE (-) #-}
    Pr x - Pr y = Pr $ if x >= y then x + log1mexp (y-x) else error "no negative error probabilities"
    {-# INLINE (*) #-}
    Pr a * Pr b = Pr $ a + b
    {-# INLINE negate #-}
    negate    _ = Pr $ error "no negative error probabilities"
    {-# INLINE abs #-}
    abs       x = x
    {-# INLINE signum #-}
    signum    _ = Pr 0

instance (Floating a, Fractional a, Ord a) => Fractional (Prob' a) where
    fromRational a = Pr (log (fromRational a))
    Pr a  /  Pr b = Pr (a - b)
    recip  (Pr a) = Pr (negate a)

infixr 8 `pow`
pow :: Num a => Prob' a -> a -> Prob' a
pow (Pr a) e = Pr $ a * e


toProb :: Floating a => a -> Prob' a
toProb p = Pr (log p)

fromProb :: Floating a => Prob' a -> a
fromProb (Pr q) = exp q

qualToProb :: Floating a => Qual -> Prob' a
qualToProb (Q q) = Pr (- log 10 * fromIntegral q / 10)

probToQual :: (Floating a, RealFrac a) => Prob' a -> Qual
probToQual (Pr p) = Q (round (- 10 * p / log 10))

nucA, nucC, nucG, nucT :: Nucleotide
nucA = N 0
nucC = N 1
nucG = N 2
nucT = N 3

gap, nucsA, nucsC, nucsG, nucsT, nucsN :: Nucleotides
gap   = Ns 0
nucsA = Ns 1
nucsC = Ns 2
nucsG = Ns 4
nucsT = Ns 8
nucsN = Ns 15


-- | Coordinates in a genome.
-- The position is zero-based, no questions about it.  Think of the
-- position as pointing to the crack between two bases: looking forward
-- you see the next base to the right, looking in the reverse direction
-- you see the complement of the first base to the left.
--
-- To encode the strand, we (virtually) reverse-complement any sequence
-- and prepend it to the normal one.  That way, reversed coordinates
-- have a negative sign and automatically make sense.  Position 0 could
-- either be the beginning of the sequence or the end on the reverse
-- strand... that ambiguity shouldn't really matter.

data Position = Pos {
        p_seq   :: {-# UNPACK #-} !C.ByteString,    -- ^ sequence (e.g. some chromosome)
        p_start :: {-# UNPACK #-} !Int              -- ^ offset, zero-based
    } deriving (Show, Eq, Ord)

p_is_reverse :: Position -> Bool
p_is_reverse = (< 0) . p_start

-- | Ranges in genomes
-- We combine a position with a length.  In 'Range pos len', 'pos' is
-- always the start of a stretch of length 'len'.  Positions therefore
-- move in the opposite direction on the reverse strand.  To get the
-- same stretch on the reverse strand, shift r_pos by r_length, then
-- reverse direction (or call reverseRange).
data Range = Range {
        r_pos    :: {-# UNPACK #-} !Position,
        r_length :: {-# UNPACK #-} !Int
    } deriving (Show, Eq, Ord)


-- | Converts a character into a 'Nucleotide'.
-- The codes for A,C,G understood, everything else is a T.  (Error
-- detection is the caller's responsibility.
toNucleotide :: Word8 -> Nucleotide
toNucleotide x | x .|. 32 == c2w 'a'  =  nucA
               | x .|. 32 == c2w 'c'  =  nucC
               | x .|. 32 == c2w 'g'  =  nucG
               | otherwise            =  nucT
{-# INLINABLE toNucleotide #-}

-- | Converts a character into a 'Nucleotides'.
-- The usual codes for A,C,G,T and U are understood along with the IUPAC
-- ambiguity codes, '-' and '.' become gaps and everything else is an N.
toNucleotides :: Word8 -> Nucleotides
toNucleotides x | x .|. 32 == c2w 'a'  =  nucsA
                | x .|. 32 == c2w 'c'  =  nucsC
                | x .|. 32 == c2w 'g'  =  nucsG
                | x .|. 32 == c2w 't'  =  nucsT
                | x .|. 32 == c2w 'n'  =  nucsN
                | x .|. 32 == c2w 'u'  =  nucsT
                | x .|. 32 == c2w 'b'  =  nucsC <> nucsG <> nucsT
                | x .|. 32 == c2w 'd'  =  nucsA <> nucsG <> nucsT
                | x .|. 32 == c2w 'h'  =  nucsA <> nucsC <> nucsT
                | x .|. 32 == c2w 'v'  =  nucsA <> nucsC <> nucsG
                | x .|. 32 == c2w 'w'  =  nucsA <> nucsT
                | x .|. 32 == c2w 's'  =  nucsC <> nucsG
                | x .|. 32 == c2w 'm'  =  nucsA <> nucsC
                | x .|. 32 == c2w 'k'  =  nucsG <> nucsT
                | x .|. 32 == c2w 'y'  =  nucsC <> nucsT
                | x .|. 32 == c2w 'r'  =  nucsA <> nucsG
                | otherwise            =  gap
{-# INLINABLE toNucleotides #-}

-- | Tests if a 'Nucleotides' is a base.
-- Returns 'True' for everything but gaps.
isBase :: Nucleotides -> Bool
isBase (Ns n) = n /= 0

-- | Tests if a 'Nucleotides' is a proper base.
-- Returns 'True' for A,C,G,T only.
isProperBase :: Nucleotides -> Bool
isProperBase x = x == nucsA || x == nucsC || x == nucsG || x == nucsT

properBases :: [ Nucleotides ]
properBases = [ nucsA, nucsC, nucsG, nucsT ]

-- | Tests if a 'Nucleotides' is a gap.
-- Returns true only for the gap.
isGap :: Nucleotides -> Bool
isGap x = x == gap


{-# INLINE showNucleotide #-}
showNucleotide :: Nucleotide -> Char
showNucleotide (N x) = C.index "ACGT" $ fromIntegral $ x .&. 3

{-# INLINE showNucleotides #-}
showNucleotides :: Nucleotides -> Char
showNucleotides (Ns x) = C.index  "-ACMGRSVTWYHKDBN" $ fromIntegral $ x .&. 15

instance Show Nucleotide where
    show     x = [ showNucleotide x ]
    showList l = (map showNucleotide l ++)

instance Read Nucleotide where
    readsPrec _ ('a':cs) = [(nucA, cs)]
    readsPrec _ ('A':cs) = [(nucA, cs)]
    readsPrec _ ('c':cs) = [(nucC, cs)]
    readsPrec _ ('C':cs) = [(nucC, cs)]
    readsPrec _ ('g':cs) = [(nucG, cs)]
    readsPrec _ ('G':cs) = [(nucG, cs)]
    readsPrec _ ('t':cs) = [(nucT, cs)]
    readsPrec _ ('T':cs) = [(nucT, cs)]
    readsPrec _ ('u':cs) = [(nucT, cs)]
    readsPrec _ ('U':cs) = [(nucT, cs)]
    readsPrec _     _    = [          ]

    readList ('-':cs) = readList cs
    readList (c:cs) | isSpace c = readList cs
                    | otherwise = case reads (c:cs) of
                            [] -> [ ([],c:cs) ]
                            xs -> [ (n:ns,r2) | (n,r1) <- xs, (ns,r2) <- readList r1 ]
    readList [] = [([],[])]

instance Show Nucleotides where
    show     x = [ showNucleotides x ]
    showList l = (map showNucleotides l ++)

instance Read Nucleotides where
    readsPrec _ (c:cs) = [(toNucleotides (c2w c), cs)]
    readsPrec _ [    ] = []
    readList s = let (hd,tl) = span (\c -> isAlpha c || isSpace c || '-' == c) s
                 in [(map (toNucleotides . c2w) $ filter (not . isSpace) hd, tl)]

-- | Complements a Nucleotides.
{-# INLINE compl #-}
compl :: Nucleotide -> Nucleotide
compl (N n) = N $ n `xor` 3

-- | Complements a Nucleotides.
{-# INLINE compls #-}
compls :: Nucleotides -> Nucleotides
compls (Ns x) = Ns $ ar `U.unsafeIndex` fromIntegral (x .&. 15)
  where
    !ar = U.fromListN 16 [ 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 ]


-- | Moves a @Position@.  The position is moved forward according to the
-- strand, negative indexes move backward accordingly.
shiftPosition :: Int -> Position -> Position
shiftPosition a p = p { p_start = p_start p + a }

-- | Moves a @Range@.  This is just @shiftPosition@ lifted.
shiftRange :: Int -> Range -> Range
shiftRange a r = r { r_pos = shiftPosition a (r_pos r) }

-- | Reverses a 'Range' to give the same @Range@ on the opposite strand.
reverseRange :: Range -> Range
reverseRange (Range (Pos sq pos) len) = Range (Pos sq (-pos-len)) len

-- | Extends a range.  The length of the range is simply increased.
extendRange :: Int -> Range -> Range
extendRange a r = r { r_length = r_length r + a }

-- | Expands a subrange.
-- @(range1 `insideRange` range2)@ interprets @range1@ as a subrange of
-- @range2@ and computes its absolute coordinates.  The sequence name of
-- @range1@ is ignored.
insideRange :: Range -> Range -> Range
insideRange r1@(Range (Pos _ start1) length1) r2@(Range (Pos sq start2) length2)
    | start2 < 0         = reverseRange (insideRange r1 (reverseRange r2))
    | start1 <= length2  = Range (Pos sq (start2 + start1)) (min length1 (length2 - start1))
    | otherwise          = Range (Pos sq (start2 + length2)) 0


-- | Wraps a range to a region.  This simply normalizes the start
-- position to be in the interval '[0,n)', which only makes sense if the
-- @Range@ is to be mapped onto a circular genome.  This works on both
-- strands and the strand information is retained.
wrapRange :: Int -> Range -> Range
wrapRange n (Range (Pos sq s) l) = Range (Pos sq (s `mod` n)) l


-- | A strict pair.
data Pair a b = !a :!: !b deriving(Eq, Ord, Show, Read, Bounded, Ix)
infixl 2 :!: