{- |
Module      :  ELynx.Import.MarkovProcess.SiteprofilesPhylobayes
Description :  Import site profiles in Phylobayes format
Copyright   :  (c) Dominik Schrempf 2019
License     :  GPL-3

Maintainer  :  dominik.schrempf@gmail.com
Stability   :  unstable
Portability :  portable

Creation date: Tue Jan 29 12:12:55 2019.

For now I just try to go with a huge empirical distribution mixture model. Let's
see if performance is good enough.

-}

module ELynx.Import.MarkovProcess.SiteprofilesPhylobayes
  ( Parser
  , EDMComponent
  , siteprofiles
  ) where

import           Control.Monad
import qualified Data.ByteString.Lazy.Char8        as L
import           Data.List                         (nub)
import qualified Data.Vector.Storable              as V
import           Data.Void
import           Text.Megaparsec
import           Text.Megaparsec.Byte
import           Text.Megaparsec.Byte.Lexer

import           ELynx.Data.MarkovProcess.EDMModel
import           ELynx.Tools.ByteString            (c2w)

-- | Shortcut.
type Parser = Parsec Void L.ByteString

-- | Parse stationary distributions from Phylobayes format.
siteprofiles :: Parser [EDMComponent]
siteprofiles = do
  _  <- headerLines
  cs <- many dataLine
  _  <- many newline *> eof
     <?> "phylobayes siteprofiles"
  let ls = map length cs
      nLs = length $ nub ls
  when (nLs /= 1) (error "The site profiles have a different number of entries.")
  return cs

horizontalSpace :: Parser ()
horizontalSpace = skipMany $ char (c2w ' ') <|> tab

line :: Parser ()
line = do
  _ <- many $ noneOf [c2w '\n']
  pure ()

-- For now, just ignore the header.
headerLines :: Parser ()
headerLines = do
  _ <- line
  _ <- many newline
    <?> "headerLine"
  pure ()

dataLine :: Parser EDMComponent
dataLine = do
  -- Ignore site number.
  _ <- decimal :: Parser Integer
  _ <- horizontalSpace
  -- Also ignore additional white space on line.
  vals <- float `sepEndBy1` horizontalSpace
  _ <- many newline
    <?> "dataLine"
  -- Set the weight to 1.0 for all sites.
  return (1.0, V.fromList vals)