{- |
Description :  State space of the boundary mutation model
Copyright   :  (c) Dominik Schrempf 2017
License     :  GPLv3

Maintainer  :  dominik.schrempf@gmail.com
Stability   :  unstable
Portability :  non-portable (not tested)

The boundary mutation model is a discrete-state, continuous-time Markov process
that allows mutations only when the population is monomorphic.

* Changelog

-}

module EvoMod.Data.BoundaryMutationModel.State
  ( -- * Types
    Allele
  , PopSize
  , AlleleCount
  , State(..)
  , showCounts
  , nFixed
    -- * Functions
  , setPopSize
  , fromIndexWith
  , toIndex
  , stateSpace
  , stateSpaceSize
  -- , rmStateToBMState
  , neighbors
  ) where

import           Data.List

import           Control.Lens

import           Numeric.SpecFunctions  (choose)

import           EvoMod.Data.Character
import           EvoMod.Data.Nucleotide
-- import qualified RateMatrix as RM
import           EvoMod.Tools           (allValues)

-- | Alleles are just nucleotides at the moment. However, I want to keep the
-- code such that it can be extended easily to codons or amino acids.
type Allele = Nucleotide
-- XXX: Change N to D and PopSize to Discretization.
-- | The population size; has to be larger than one, otherwise there be dragons.
type PopSize = Int
-- | The absolute frequency of an allele.
type AlleleCount = Int

-- | The number of alleles.
nAlleles :: Int
nAlleles = 1 + fromEnum (maxBound :: Allele)

-- | A boundary mutation model state is either a boundary state or a polymorphic
-- state. This also automatically defines a total order.
--
-- Another possibility would be:
-- @
--  data State = Bnd Allele | Ply AlleleCount Allele Allele
--  data StateComplete = StateComplete PopSize State
-- @
-- But then, I think it is more important that the information is kept in one,
-- at the cost of some overhead.
data State = Bnd { bndN :: PopSize
                 , bndA :: Allele }
           | Ply { plyN :: PopSize
                 , plyI :: AlleleCount
                 , plyA :: Allele
                 , plyB :: Allele }
           deriving (Read, Eq)

-- | String representation of 'State'; without surrounding brackets.
showCounts :: State -> String
showCounts (Bnd n a) = foldl1' (++) $ intersperse "," $ map toCounts allValues
  where toCounts b
          | a == b    = show n
          | otherwise = "0"
showCounts (Ply n i a b) = foldl1' (++) $ intersperse "," $ map toCounts allValues
  where toCounts c
          | c == a    = show i
          | c == b    = show (n-i)
          | otherwise = "0"

instance Show State where
  show s = "(" ++ showCounts s ++ ")"

-- | A total order on the boundary mutation model states. In general, Bnd < Ply.
-- Then, sorting happens according to the order population size, first allele,
-- second allele, allele count. It may be beneficial to reverse the allele count
-- order (i.e., make a polymorphic state with higher allele count show up before
-- a polymorphic state with lower allele count, this would move some polymorphic
-- states closer to their respective boundaries),
instance Ord State where
  Bnd {} <= Ply {}            = True
  Ply {} <= Bnd {}            = False
  s@(Bnd n a) <= t@(Bnd m b)
    | s == t                  = True
    | n /= m                  = n <= m
    | otherwise               = a <= b
  s@(Ply n i a b) <= t@(Ply m j c d)
    | s == t                  = True
    | n /= m                  = n <= m
    | a < c                   = True
    | a > c                   = False
    -- We can be sure that a  = c now.
    | b < d                   = True
    | b > d                   = False
    -- Now we can be sure that both nucleotides are the same.
    | otherwise               = i <= j

-- | Fixed population size when converting a 'State' to or from a number. In
-- this case, a fixed population size is necessary so that @toEnum . fromEnum ==
-- id@. When converting from a number to 'State', the population size has to be
-- given or assumed (see 'fromIndexWith') anyways. Especially when performing IO,
-- the same number should always correspond to the same 'State' (bijection).
-- 'nFixed' has been set such that the size of the state space is 256.
nFixed :: Int
nFixed = 43

-- | Set the population size of a 'State'; validity of resulting 'State' is checked.
setPopSize :: PopSize -> State -> Maybe State
setPopSize n s = if valid s' then Just s' else Nothing
  where s' = unsafeSetPopSize n s

-- | See 'setPopSize'. Does not check if resulting 'State' is valid.
unsafeSetPopSize :: Int -> State -> State
unsafeSetPopSize n (Bnd _ s)     = Bnd n s
unsafeSetPopSize n (Ply _ i a b) = Ply n i a b

-- | For a given population size 'PopSize', convert a number 'Int' to 'State'.
fromIndexWith :: PopSize -> Int -> State
fromIndexWith n i
  | i >= stateSpaceSize n = error $
    "Index " ++ show i ++ "out of bounds when population size is " ++ show n ++ "."
  | i < nAlleles = Bnd n (toEnum i)
  | otherwise = Ply n (i' - p^._1 + 1) (p^._2) (p^._3)
  where i' = i - nAlleles
        l = [ (enumCombination a b * (n-1), a, b)
            | a <- [minBound .. pred maxBound]
            , b <- [ succ a ..]]
        p = last $ takeWhile (\e -> e^._1 <= i') l

-- | Convert 'State' to a number 'Int' for the given population size 'PopSize'.
-- Back conversion can be done with 'fromIndexWith', with the same population size.
toIndex :: State -> Int
toIndex (Bnd _ a) = fromEnum a
-- We also have to shift the enumeration value by the number of boundary
-- states, which is 'nAlleles'.
toIndex (Ply n i a b) = nAlleles + enumCombination a b * (n-1) + i-1

-- | Enumeration only works when the population size is 'nFixed'. Only then,
-- @toEnum . fromEnum == id@ can be guaranteed. This is because @toEnum ::
-- State@ is only defined if the population size is known. See also
-- 'fromIndexWith', and 'toIndex', as well as, 'setPopSize'.
instance Enum State where
  fromEnum s = if getPopSize s /= nFixed
    then error $ "State is not enumerable: " ++ show s ++ "."
    else toIndex s
  toEnum = fromIndexWith nFixed

-- The formula is a little complicated. Sketch of derivation: Order the states
-- in the following way:
-- @
--  AC CG GT
--  AG CT
--  AT
-- @
-- The edge length of the triangle is @'nAlleles' - 1@. Use Gauss's triangle
-- equation @area=binom(length+1, 2)@ twice to count the number of combinations
-- up to a certain allele. E.g., up to, but excluding G:
-- @
--  AC CG
--  AG CT
--  AT
-- @
countCombinationsUpToAllele :: Allele -> Int
countCombinationsUpToAllele a = round $ nAlleles `choose` 2 - (nAlleles - fromEnum a) `choose` 2

-- See 'countCombinationsUpToAllele'. The @-1@ pops up because we start counting
-- from 0. For example, the enumeration value of @GT@ (with @fromEnum G = k = 2@
-- and @fromEnum T = 3@) is then @enumCombinationsUpToK 2 + (3-2)@.
enumCombination :: Allele -> Allele -> Int
enumCombination a b = countCombinationsUpToAllele a - 1 + (fromEnum b - fromEnum a)

-- | A fixed population size 'nFixed' is assumed.
instance Bounded State where
  minBound = Bnd nFixed minBound
  maxBound = Ply nFixed (nFixed-1) (pred maxBound) maxBound

-- XXX: I am not sure if I should instantiate 'Character' because writing Fasta
-- files with boundary mutation model states is not really promising anyways.
-- However, the 'toIndex' and 'fromIndexWith' function provide a convenient way to
-- map states to integers. This functionality is needed when working with
-- matrices.
-- | A fixed population size 'nFixed' is assumed.
instance Character State where
  fromWord = toEnum . fromEnum
  toWord = toEnum . fromEnum

valid :: State -> Bool
valid (Bnd n _)
  | n <= 1    = False
  | otherwise = True
valid (Ply n i a b)
  | n <= 1    = False
  | a >= b    = False
  | i <= 0    = False
  | i >= n    = False
  | otherwise = True

filterValidStates :: [State] -> [State]
filterValidStates = filter valid

getPopSize :: State -> PopSize
getPopSize (Bnd n _)     = n
getPopSize (Ply n _ _ _) = n

-- CCC: This function is a not very efficient. A better would be something like:
-- @
-- | Sorted list of all possible PoMo states for a specific population size.
stateSpace :: PopSize -> [State]
stateSpace n = map (fromIndexWith n) [0 .. stateSpaceSize n - 1]
-- stateSpace n
--   | n <= 1    = error "The population size has to be larger than one."
--   | otherwise = sort $ filterValidStates ( allBndStates ++ allPlyStates )
--   where allBndStates = [ Bnd n a |
--                          a <- [minBound .. maxBound] :: [Allele] ]
--         allPlyStates = [ Ply n i a b |
--                          i <- [0..n],
--                          a <- [minBound .. maxBound] :: [Allele],
--                          b <- [minBound .. maxBound] :: [Allele] ]

-- | The state space of the boundary mutation model for four alleles and a
-- population size N is 4 + 6*(N-1).
stateSpaceSize :: PopSize -> Int
stateSpaceSize n = k + k*(k-1) `div` 2 * (n-1)
  where k = nAlleles


-- -- This is a very convenient version of toIndex, but can we always guarantee
-- -- that the state space is sorted the same way?
-- -- | Convert a boundary state to its ID (integer). See also 'idToState'.
-- stateId :: State -> Maybe Int
-- stateId s = elemIndex s (stateSpace $ getPopSize s)

-- -- Same here.
-- -- | Convert an ID to a boundary state. See also 'stateID'.
-- idToState :: PopSize -> Int -> State
-- idToState n i = stateSpace n !! i

-- | Check if two states are connected. By definition, states are NOT connected
-- with themselves.
neighbors :: State -> State -> Bool
neighbors s t = s `elem` getNeighbors t

getNeighbors :: State -> [State]
getNeighbors (Bnd n a) = filterValidStates allNeighbors
  where allNeighbors = [ Ply n (n-1) a b |
                         b <- [minBound .. maxBound] :: [Allele] ]
                       ++
                       [ Ply n 1 b a |
                         b <- [minBound .. maxBound] :: [Allele] ]
getNeighbors (Ply n i a b)
  -- Careful when the population size is two, because then each polymorphic
  -- states has two boundary states as neighbors.
  | i == 1 && n == 2  = Bnd n a : [Bnd n b]
  | i == 1            = Bnd n b : [Ply n 2 a b]
  | i == (n-1)        = Bnd n a : [Ply n (n-2) a b]
  | otherwise         = Ply n (i+1) a b : [Ply n (i-1) a b]