{- | Description : State space of the boundary mutation model Copyright : (c) Dominik Schrempf 2017 License : GPLv3 Maintainer : dominik.schrempf@gmail.com Stability : unstable Portability : non-portable (not tested) The boundary mutation model is a discrete-state, continuous-time Markov process that allows mutations only when the population is monomorphic. * Changelog -} module EvoMod.Data.BoundaryMutationModel.State ( -- * Types Allele , PopSize , AlleleCount , State(..) , showCounts , nFixed -- * Functions , setPopSize , fromIndexWith , toIndex , stateSpace , stateSpaceSize -- , rmStateToBMState , neighbors ) where import Data.List import Control.Lens import Numeric.SpecFunctions (choose) import EvoMod.Data.Character import EvoMod.Data.Nucleotide -- import qualified RateMatrix as RM import EvoMod.Tools (allValues) -- | Alleles are just nucleotides at the moment. However, I want to keep the -- code such that it can be extended easily to codons or amino acids. type Allele = Nucleotide -- XXX: Change N to D and PopSize to Discretization. -- | The population size; has to be larger than one, otherwise there be dragons. type PopSize = Int -- | The absolute frequency of an allele. type AlleleCount = Int -- | The number of alleles. nAlleles :: Int nAlleles = 1 + fromEnum (maxBound :: Allele) -- | A boundary mutation model state is either a boundary state or a polymorphic -- state. This also automatically defines a total order. -- -- Another possibility would be: -- @ -- data State = Bnd Allele | Ply AlleleCount Allele Allele -- data StateComplete = StateComplete PopSize State -- @ -- But then, I think it is more important that the information is kept in one, -- at the cost of some overhead. data State = Bnd { bndN :: PopSize , bndA :: Allele } | Ply { plyN :: PopSize , plyI :: AlleleCount , plyA :: Allele , plyB :: Allele } deriving (Read, Eq) -- | String representation of 'State'; without surrounding brackets. showCounts :: State -> String showCounts (Bnd n a) = foldl1' (++) $ intersperse "," $ map toCounts allValues where toCounts b | a == b = show n | otherwise = "0" showCounts (Ply n i a b) = foldl1' (++) $ intersperse "," $ map toCounts allValues where toCounts c | c == a = show i | c == b = show (n-i) | otherwise = "0" instance Show State where show s = "(" ++ showCounts s ++ ")" -- | A total order on the boundary mutation model states. In general, Bnd < Ply. -- Then, sorting happens according to the order population size, first allele, -- second allele, allele count. It may be beneficial to reverse the allele count -- order (i.e., make a polymorphic state with higher allele count show up before -- a polymorphic state with lower allele count, this would move some polymorphic -- states closer to their respective boundaries), instance Ord State where Bnd {} <= Ply {} = True Ply {} <= Bnd {} = False s@(Bnd n a) <= t@(Bnd m b) | s == t = True | n /= m = n <= m | otherwise = a <= b s@(Ply n i a b) <= t@(Ply m j c d) | s == t = True | n /= m = n <= m | a < c = True | a > c = False -- We can be sure that a = c now. | b < d = True | b > d = False -- Now we can be sure that both nucleotides are the same. | otherwise = i <= j -- | Fixed population size when converting a 'State' to or from a number. In -- this case, a fixed population size is necessary so that @toEnum . fromEnum == -- id@. When converting from a number to 'State', the population size has to be -- given or assumed (see 'fromIndexWith') anyways. Especially when performing IO, -- the same number should always correspond to the same 'State' (bijection). -- 'nFixed' has been set such that the size of the state space is 256. nFixed :: Int nFixed = 43 -- | Set the population size of a 'State'; validity of resulting 'State' is checked. setPopSize :: PopSize -> State -> Maybe State setPopSize n s = if valid s' then Just s' else Nothing where s' = unsafeSetPopSize n s -- | See 'setPopSize'. Does not check if resulting 'State' is valid. unsafeSetPopSize :: Int -> State -> State unsafeSetPopSize n (Bnd _ s) = Bnd n s unsafeSetPopSize n (Ply _ i a b) = Ply n i a b -- | For a given population size 'PopSize', convert a number 'Int' to 'State'. fromIndexWith :: PopSize -> Int -> State fromIndexWith n i | i >= stateSpaceSize n = error $ "Index " ++ show i ++ "out of bounds when population size is " ++ show n ++ "." | i < nAlleles = Bnd n (toEnum i) | otherwise = Ply n (i' - p^._1 + 1) (p^._2) (p^._3) where i' = i - nAlleles l = [ (enumCombination a b * (n-1), a, b) | a <- [minBound .. pred maxBound] , b <- [ succ a ..]] p = last $ takeWhile (\e -> e^._1 <= i') l -- | Convert 'State' to a number 'Int' for the given population size 'PopSize'. -- Back conversion can be done with 'fromIndexWith', with the same population size. toIndex :: State -> Int toIndex (Bnd _ a) = fromEnum a -- We also have to shift the enumeration value by the number of boundary -- states, which is 'nAlleles'. toIndex (Ply n i a b) = nAlleles + enumCombination a b * (n-1) + i-1 -- | Enumeration only works when the population size is 'nFixed'. Only then, -- @toEnum . fromEnum == id@ can be guaranteed. This is because @toEnum :: -- State@ is only defined if the population size is known. See also -- 'fromIndexWith', and 'toIndex', as well as, 'setPopSize'. instance Enum State where fromEnum s = if getPopSize s /= nFixed then error $ "State is not enumerable: " ++ show s ++ "." else toIndex s toEnum = fromIndexWith nFixed -- The formula is a little complicated. Sketch of derivation: Order the states -- in the following way: -- @ -- AC CG GT -- AG CT -- AT -- @ -- The edge length of the triangle is @'nAlleles' - 1@. Use Gauss's triangle -- equation @area=binom(length+1, 2)@ twice to count the number of combinations -- up to a certain allele. E.g., up to, but excluding G: -- @ -- AC CG -- AG CT -- AT -- @ countCombinationsUpToAllele :: Allele -> Int countCombinationsUpToAllele a = round $ nAlleles `choose` 2 - (nAlleles - fromEnum a) `choose` 2 -- See 'countCombinationsUpToAllele'. The @-1@ pops up because we start counting -- from 0. For example, the enumeration value of @GT@ (with @fromEnum G = k = 2@ -- and @fromEnum T = 3@) is then @enumCombinationsUpToK 2 + (3-2)@. enumCombination :: Allele -> Allele -> Int enumCombination a b = countCombinationsUpToAllele a - 1 + (fromEnum b - fromEnum a) -- | A fixed population size 'nFixed' is assumed. instance Bounded State where minBound = Bnd nFixed minBound maxBound = Ply nFixed (nFixed-1) (pred maxBound) maxBound -- XXX: I am not sure if I should instantiate 'Character' because writing Fasta -- files with boundary mutation model states is not really promising anyways. -- However, the 'toIndex' and 'fromIndexWith' function provide a convenient way to -- map states to integers. This functionality is needed when working with -- matrices. -- | A fixed population size 'nFixed' is assumed. instance Character State where fromWord = toEnum . fromEnum toWord = toEnum . fromEnum valid :: State -> Bool valid (Bnd n _) | n <= 1 = False | otherwise = True valid (Ply n i a b) | n <= 1 = False | a >= b = False | i <= 0 = False | i >= n = False | otherwise = True filterValidStates :: [State] -> [State] filterValidStates = filter valid getPopSize :: State -> PopSize getPopSize (Bnd n _) = n getPopSize (Ply n _ _ _) = n -- CCC: This function is a not very efficient. A better would be something like: -- @ -- | Sorted list of all possible PoMo states for a specific population size. stateSpace :: PopSize -> [State] stateSpace n = map (fromIndexWith n) [0 .. stateSpaceSize n - 1] -- stateSpace n -- | n <= 1 = error "The population size has to be larger than one." -- | otherwise = sort $ filterValidStates ( allBndStates ++ allPlyStates ) -- where allBndStates = [ Bnd n a | -- a <- [minBound .. maxBound] :: [Allele] ] -- allPlyStates = [ Ply n i a b | -- i <- [0..n], -- a <- [minBound .. maxBound] :: [Allele], -- b <- [minBound .. maxBound] :: [Allele] ] -- | The state space of the boundary mutation model for four alleles and a -- population size N is 4 + 6*(N-1). stateSpaceSize :: PopSize -> Int stateSpaceSize n = k + k*(k-1) `div` 2 * (n-1) where k = nAlleles -- -- This is a very convenient version of toIndex, but can we always guarantee -- -- that the state space is sorted the same way? -- -- | Convert a boundary state to its ID (integer). See also 'idToState'. -- stateId :: State -> Maybe Int -- stateId s = elemIndex s (stateSpace $ getPopSize s) -- -- Same here. -- -- | Convert an ID to a boundary state. See also 'stateID'. -- idToState :: PopSize -> Int -> State -- idToState n i = stateSpace n !! i -- | Check if two states are connected. By definition, states are NOT connected -- with themselves. neighbors :: State -> State -> Bool neighbors s t = s `elem` getNeighbors t getNeighbors :: State -> [State] getNeighbors (Bnd n a) = filterValidStates allNeighbors where allNeighbors = [ Ply n (n-1) a b | b <- [minBound .. maxBound] :: [Allele] ] ++ [ Ply n 1 b a | b <- [minBound .. maxBound] :: [Allele] ] getNeighbors (Ply n i a b) -- Careful when the population size is two, because then each polymorphic -- states has two boundary states as neighbors. | i == 1 && n == 2 = Bnd n a : [Bnd n b] | i == 1 = Bnd n b : [Ply n 2 a b] | i == (n-1) = Bnd n a : [Ply n (n-2) a b] | otherwise = Ply n (i+1) a b : [Ply n (i-1) a b]