{- |
Module      :  ELynx.Import.Tree.Newick
Description :  Import Newick trees
Copyright   :  (c) Dominik Schrempf 2019
License     :  GPL-3

Maintainer  :  dominik.schrempf@gmail.com
Stability   :  unstable
Portability :  portable

Creation date: Thu Jan 17 14:56:27 2019.

Code partly taken from Biobase.Newick.Import.

[Specifications](http://evolution.genetics.washington.edu/phylip/newicktree.html)

- In particular, no conversion from _ to (space) is done right now.

-}


module ELynx.Import.Tree.Newick
  ( Parser
  -- * Newick tree format
  , newick
  , oneNewick
  , manyNewick
  , forest
  , leaf
  , node
  , name
  , branchLength
  -- * Newick tree format with branch support as node names (e.g., used by IQ-TREE)
  , newickIqTree
  , oneNewickIqTree
  , manyNewickIqTree
  ) where

import qualified Data.ByteString.Lazy       as L
import           Data.Tree
import           Data.Void
import           Data.Word
import           Text.Megaparsec
import           Text.Megaparsec.Byte
import           Text.Megaparsec.Byte.Lexer (decimal, float)

import           ELynx.Data.Tree.PhyloTree
import           ELynx.Tools.ByteString     (c2w)

-- | Shortcut.
type Parser = Parsec Void L.ByteString

-- | Parse a single Newick tree. Also succeeds when more trees follow.
newick :: Parser (Tree (PhyloLabel L.ByteString))
newick = tree <* char (c2w ';') <?> "newick"

-- | Parse a single Newick tree. Fails when end of file is not reached.
oneNewick :: Parser (Tree (PhyloLabel L.ByteString))
oneNewick = newick <* space <* eof <?> "oneNewick"

-- | Parse many Newick trees until end of file.
manyNewick :: Parser [Tree (PhyloLabel L.ByteString)]
manyNewick = some (newick <* space) <* eof <?> "manyNewick"

tree :: Parser (Tree (PhyloLabel L.ByteString))
tree = space *> (branched <|> leaf) <?> "tree"

branched :: Parser (Tree (PhyloLabel L.ByteString))
branched = do
  f <- forest
  n <- node
    <?> "branched"
  return $ Node n f

-- | A 'forest' is a set of trees separated by @,@ and enclosed by parentheses.
forest :: Parser [Tree (PhyloLabel L.ByteString)]
forest = do
  _ <- char (c2w '(')
  f <- tree `sepBy1` char (c2w ',')
  _ <- char (c2w ')')
    <?> "forest"
  return f

branchSupport :: Parser (Maybe Double)
branchSupport = optional $ do
  _ <- try $ char (c2w '[')
  s <- try float <|> try decimalAsDouble
  _ <- try $ char (c2w ']')
  return s

-- | A 'leaf' is a 'node' without children.
leaf :: Parser (Tree (PhyloLabel L.ByteString))
leaf = do
  n <- node
    <?> "leaf"
  return $ Node n []

-- | A 'node' has a name and a 'branchLength'.
node :: Parser (PhyloLabel L.ByteString)
node = do
  n <- name
  b <- branchLength
  s <- branchSupport
    <?> "node"
  return $ PhyloLabel n s b

checkNameCharacter :: Word8 -> Bool
checkNameCharacter c = c `notElem` map c2w " :;()[],"

-- | A name can be any string of printable characters except blanks, colons,
-- semicolons, parentheses, and square brackets (and commas).
name :: Parser L.ByteString
name = L.pack <$> many (satisfy checkNameCharacter) <?> "name"

-- | Branch length.
branchLength :: Parser (Maybe Double)
branchLength = (optional $ char (c2w ':') *> branchLengthGiven) <?> "branchLength"

branchLengthGiven :: Parser Double
branchLengthGiven = try float <|> decimalAsDouble

decimalAsDouble :: Parser Double
decimalAsDouble = fromIntegral <$> (decimal :: Parser Int)

--------------------------------------------------------------------------------
-- IQ-TREE STUFF.

-- | IQ-TREE stores the branch support as node names after the closing bracket of a forest.
newickIqTree :: Parser (Tree (PhyloLabel L.ByteString))
newickIqTree = treeIqTree <* char (c2w ';') <?> "newickIqTree"

-- | IQ-TREE stores the branch support as node names after the closing bracket of a forest.
oneNewickIqTree :: Parser (Tree (PhyloLabel L.ByteString))
oneNewickIqTree = newickIqTree <* space <* eof <?> "oneNewickIqTree"

-- | IQ-TREE stores the branch support as node names after the closing bracket of a forest.
manyNewickIqTree :: Parser [Tree (PhyloLabel L.ByteString)]
manyNewickIqTree = some (newickIqTree <* space) <* eof <?> "manyNewickIqTree"

-- IQ-TREE stores the branch support as node names after the closing bracket of a forest.
treeIqTree :: Parser (Tree (PhyloLabel L.ByteString))
treeIqTree = space *> (branchedIqTree <|> leaf) <?> "treeIqTree"

-- IQ-TREE stores the branch support as node names after the closing bracket of a forest.
forestIqTree :: Parser [Tree (PhyloLabel L.ByteString)]
forestIqTree = do
  _ <- char (c2w '(')
  f <- treeIqTree `sepBy1` char (c2w ',')
  _ <- char (c2w ')')
    <?> "forestIqTree"
  return f

-- IQ-TREE stores the branch support as node names after the closing bracket of a forest.
branchedIqTree :: Parser (Tree (PhyloLabel L.ByteString))
branchedIqTree = do
  f <- forestIqTree
  s <- branchSupportIqTree
  n <- nodeIqTree
    <?> "branchedIqTree"
  let n' = n {brSup = s}
  return $ Node n' f

-- IQ-TREE stores the branch support as node names after the closing bracket of a forest.
branchSupportIqTree :: Parser (Maybe Double)
branchSupportIqTree = optional $ try float <|> try decimalAsDouble

-- IQ-TREE stores the branch support as node names after the closing bracket of a forest.
nodeIqTree :: Parser (PhyloLabel L.ByteString)
nodeIqTree = do
  n <- name
  b <- branchLength
    <?> "nodeIqTree"
  return $ PhyloLabel n Nothing b