{-# LANGUAGE ScopedTypeVariables, BangPatterns, ParallelListComp  #-}
{-# OPTIONS_GHC -fwarn-incomplete-patterns #-}
{-# OPTIONS_GHC -fwarn-unused-imports #-}

module Bio.Phylogeny.PhyBin.Parser
       (newick_parser, parseNewick, parseNewicks, parseNewickFiles, unitTests)
       where
import           Control.Exception  (evaluate, handle, SomeException)
import qualified Data.ByteString.Char8 as B
import           Data.Char          (isSpace)
import           Data.Map as M
import           Data.List as L
import           Text.Parsec
import           Text.Parsec.ByteString
import           Test.HUnit          ((~:),(~=?),Test,test,assertFailure)
import           System.FilePath (takeBaseName)

import           Bio.Phylogeny.PhyBin.CoreTypes 
import           Prelude as P

type NameHack = (String->String)

-- | Parse a bytestring into a NewickTree with branch lengths.  The
--   first argument is file from which the data came and is just for
--   better error messages.
-- 
-- If the single bytestring contains more than one tree, then a number is appended to
-- the tree names.
parseNewick :: LabelTable -> NameHack -> String -> B.ByteString -> (LabelTable, [NewickTree DefDecor])
parseNewick tbl0 name_hack file input = (lbls,trs)
  where 
  (lbls,trs) = 
     extractLabelTable tbl0 $ 
     runB file (many1$ newick_parser name_hack) $
     B.filter (not . isSpace) input

-- treeFiles <- acquireTreeFiles files
                     -- let fn f = do raw <- B.readFile f
                     --               let ls = map (`B.append` (B.pack ";")) $ 
                     --                        B.splitWith (== ';') raw
                     --               return (map (f,) ls)
                     -- trees0 <- concat <$> mapM fn treeFiles
                     -- FIXME: no name_hack here:
                     -- let (lbls, trees) = parseNewicks id trees0 
                     -- putStrLn$ "Read trees! "++show (length trees)
                     -- putStrLn$ "Taxa: "++show (pPrint lbls)
                     -- putStrLn$ "First tree: "++show (displayDefaultTree (head trees))

-- | Parse a list of trees, starting with an empty map of labels and accumulating a final map.
parseNewickFiles :: NameHack -> [String] -> IO (LabelTable, [FullTree DefDecor])
parseNewickFiles fn nms = do
  -- WARNING: Lazy-IO here.  We need to be careful about leaking file descriptors.
  bss <- mapM B.readFile nms
  return $ parseNewicks fn (zip nms bss)

-- | A version which takes in-memory trees as ByteStrings.
parseNewicks :: NameHack -> [(String,B.ByteString)] -> (LabelTable, [FullTree DefDecor])
parseNewicks name_hack pairs = (labtbl, fullTrs)
 where
   fullTrs = [ FullTree (tweakName file ind) labtbl tr
             | (file,tr) <- trs
             | ind <- [(0::Int)..] ]
   tweakName file ind = -- Here we do the renaming/numbering business:
     if addSuffix
     then (takeBaseName file)++"_"++show ind 
     else (takeBaseName file) 
   addSuffix = case trs of
                 []  -> False -- Should this be an error?
                 [_] -> False
                 _   -> True
   (labtbl, trs) = P.foldr fn (M.empty,[]) pairs
   fn (file,bstr) (!acc,!ls) =
     let (acc',trs) = parseNewick acc name_hack file bstr
         names      = if length trs > 1
                      then zipWith (\x y -> x++"_"++show y) (repeat file) [0..]
                      else [file]
     in (acc', (zip names trs) ++ ls)

runB :: Show a => String -> Parser a -> B.ByteString -> a
runB file p input = case (parse p "" input) of
	         Left err -> error ("parse error in file "++ show file ++" at "++ show err)
		 Right x  -> x

extractLabelTable :: LabelTable -> [TempTree] -> (LabelTable, [NewickTree DefDecor])
extractLabelTable tbl0 trs = (finMap,finTree)
 where
   flipped = M.fromList $ L.map (\(x,y)->(y,x)) $ M.toList tbl0
   -- (_,finMap,finTree) = loop2 (S.fromList (M.elems tbl0)) tbl0 tr
   (_,finMap,finTree) = loop1 flipped tbl0 trs

   -- Mere plumbing:
   loop1 seen acc [] = (seen, acc, [])
   loop1 seen acc (hd:tl) =
     let (seen2,acc2,hd') = loop2 seen acc hd 
         (seen3,acc3,tl') = loop1 seen2 acc2 tl in
     (seen3,acc3, hd':tl')

   loop2 seen acc (NTLeaf (d,Just nm) _)
     | M.member nm seen = (seen, acc, NTLeaf d (seen M.! nm))
     | otherwise = let nxt = M.size acc in
                   (M.insert nm nxt seen,
                    M.insert nxt nm acc,  NTLeaf d nxt)
   loop2 seen1 acc1 (NTInterior (d,Nothing) chlds) =
     let (seen',acc',ls') = 
          P.foldr (\ x (seen2,acc2,ls) ->
                   let (seen3,acc3,x') = loop2 seen2 acc2 x in
                   (seen3, acc3, x':ls))
                  (seen1,acc1,[])
                  chlds
     in (seen',acc', NTInterior d ls')

----------------------------------------------------------------------------------------------------
-- Newick file format parser definitions:
----------------------------------------------------------------------------------------------------

-- | Hack: we store the names in the leaves.
type TempTree = NewickTree (DefDecor,Maybe String)

-- | This is used to post-facto splice metadata into the data structure.
tag :: DefDecor -> TempTree -> TempTree
tag l s =
  case s of 
    NTLeaf (_,m) n            -> NTLeaf (l,m) n
    NTInterior (_,Nothing) ls -> NTInterior (l,Nothing) ls

-- | This parser ASSUMES that whitespace has been prefiltered from the input.
newick_parser :: NameHack -> Parser TempTree
newick_parser name_hack = 
   do x <- subtree name_hack
      -- Get the top-level metadata:
      l <- branchMetadat name_hack
      _ <- char ';'
      return$ tag l x

subtree :: NameHack -> Parser TempTree
subtree name_hack = internal name_hack <|> leaf name_hack

defaultMeta :: (Maybe Int, Double)
defaultMeta = (Nothing,0.0)

leaf :: NameHack -> Parser TempTree
leaf name_hack =
  do nm <- name
     let nm' = name_hack nm
     return$ NTLeaf (defaultMeta,Just nm') 0 

internal :: NameHack -> Parser TempTree
internal name_hack =
   do _   <- char '('       
      bs  <- branchset name_hack
      _   <- char ')'       
      _nm <- name -- IGNORED, internal names.
      return$ NTInterior (defaultMeta,Nothing) bs

branchset :: NameHack -> Parser [TempTree]
branchset name_hack =
    do b <- branch name_hack <?> "at least one branch"
       rest <- option [] $ try$ do char ','; branchset name_hack
       return (b:rest)

branch :: NameHack -> Parser TempTree
branch name_hack =
         do s <- subtree name_hack
            l <- branchMetadat name_hack 
            return$ tag l s

-- If the length is omitted, it is implicitly zero.
branchMetadat :: NameHack -> Parser DefDecor 
branchMetadat name_hack = option defaultMeta $ do
    char ':'
    n <- (try sciNotation <|> number)
    -- IF the branch length succeeds then we go for the bracketed bootstrap value also:
    bootstrap <- option Nothing $ do
      char '['
      s <- many1 digit
      char ']'
      return (Just (read s))
    return (bootstrap,n)

-- | Parse a normal, decimal number.
number :: Parser Double
number = 
  do sign   <- option "" $ string "-"
     first  <- many1 digit
     second <- option "0" $ try$ do char '.'; many1 digit
     return (read (sign ++ first++"."++second) :: Double)

-- | Parse a number in scientific notation.
sciNotation :: Parser Double
sciNotation =
  do coeff <- do first <- many1 digit
                 second <- option "0" $ try$ do char '.'; many1 digit
                 return $ first++"."++second
     char 'e'
     sign  <- option "" $ string "-"
     expon <- many1 digit
     return (read (coeff++"e"++sign++expon))

-- | Names are a mess... they contain all kinds of garbage sometimes it seems.
--   Thus we are very permissive.  We allow anything that is not something we specifically need to reserve.
name :: Parser String
name = option "" $ many1 (noneOf "()[]:;, \t\n\r\f\v")
-- name = option "" $ many1 (letter <|> digit <|> oneOf "_.-")


--------------------------------------------------------------------------------
-- Unit Tests
--------------------------------------------------------------------------------

tre1 :: NewickTree DefDecor
tre1 = head $ snd $ parseNewick M.empty id "" $ B.pack "(A:0.1,B:0.2,(C:0.3,D:0.4):0.5);"

unitTests :: Test
unitTests = test
   [ "test name"   ~: "foo" ~=?  run name "foo"
   , "test number" ~:  3.3  ~=?  run number "3.3"
   , "test number" ~:  3.0  ~=?  run number "3"
   , "test number" ~:  -3.0 ~=?  run number "-3"

   , "leaf"     ~: ntl "A" ~=?  run (leaf id)    "A"
   , "subtree"  ~: ntl "A" ~=?  run (subtree id) "A"

     -- These are not allowed:
   , "null branchset" ~: errortest$ run (branchset id) ""
     
   , "internal" ~: NTInterior nada [ntl "A"] ~=?  run (internal id) "(A);"
   , "example: no nodes are named"  ~: NTInterior nada
                                         [ntl "", ntl "", NTInterior nada [ntl "", ntl ""]]
        			   ~=? run (newick_parser id) "(,,(,));"
   , "example: leaf nodes are named" ~: 6 ~=?  treeSize (run (newick_parser id) "(A,B,(C,D));")
   , "example: all nodes are named"  ~: 6 ~=?  treeSize (run (newick_parser id) "(A,B,(C,D)E)F;")

   , "example: all but root node have a distance to parent"  ~: 6 ~=? treeSize (run (newick_parser id) "(:0.1,:0.2,(:0.3,:0.4):0.5);")
   , "example: all have a distance to parent"              ~: 6 ~=? treeSize (run (newick_parser id) "(:0.1,:0.2,(:0.3,:0.4):0.5):0.6;")
   , "example: distances and leaf names (popular)"         ~: 6 ~=? treeSize tre1
   , "example: distances and all names"                    ~: 6 ~=? treeSize (run (newick_parser id) "(A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F;")
   , "example: a tree rooted on a leaf node (rare)"        ~: 6 ~=? treeSize (run (newick_parser id) "((B:0.2,(C:0.3,D:0.4)E:0.5)F:0.1)A;")
   ]
 where
  ntl s = NTLeaf ((Nothing,0.0),Just s) 0
  nada = ((Nothing,0),Nothing)

run :: Show a => Parser a -> String -> a
run p input = runB "<unknown>" p (B.pack input)

errortest :: t -> IO ()
errortest x = 
   --() ~=?
    handle (\ (e::SomeException) -> return ()) $ 
      do evaluate x
         assertFailure "test was expected to throw an error"