{-# LANGUAGE RecordWildCards, TupleSections #-}
{-# OPTIONS_GHC -fwarn-unused-imports -fwarn-incomplete-patterns #-}

module Main where
import           Data.List (sort)
import qualified Data.ByteString.Lazy.Char8 as B
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.IntSet as IS
import           Control.Monad
import           Control.Concurrent    (Chan, readChan, ThreadId, forkIO)
import           System.Environment    (getArgs, withArgs)
import           System.Console.GetOpt (OptDescr(Option), ArgDescr(..), ArgOrder(..), usageInfo, getOpt)
import           System.Exit           (exitSuccess)
import           System.IO             (stdout) 
import           Test.HUnit            (runTestTT, Test, test, (~:))

import Control.Applicative ((<$>))
import Data.GraphViz (runGraphvizCanvas,GraphvizCommand(Dot),GraphvizCanvas(Xlib))
import Text.PrettyPrint.HughesPJClass hiding (char, Style)

import Bio.Phylogeny.PhyBin.CoreTypes          
import Bio.Phylogeny.PhyBin           (driver, binthem, normalize, annotateWLabLists,
                                       unitTests, acquireTreeFiles, deAnnotate)
import Bio.Phylogeny.PhyBin.Parser    (parseNewick, parseNewicks, parseNewickFiles, unitTests)
import Bio.Phylogeny.PhyBin.Visualize (viewNewickTree, dotNewickTree_debug)
import Bio.Phylogeny.PhyBin.RFDistance (distanceMatrix, printDistMat, allBips)
import Bio.Phylogeny.PhyBin.PreProcessor

import qualified Data.Clustering.Hierarchical as C


import Version

----------------------------------------------------------------------------------------------------
-- MAIN script: Read command line options and call the program.
----------------------------------------------------------------------------------------------------

-- Note: ORDER is important here, we process options in this order:
data Flag 
    = Verbose  
    | Version 
    | Output String
    | NumTaxa Int
    | BranchThresh Double      
    | NullOpt
    | Graph | Draw
    | Force 
    | View
    | TabDelimited Int Int

    | SelfTest
    | RFMatrix | LineSetDiffMode | PrintNorms | PrintReg
    | Cluster C.Linkage
    | BinningMode
    | EditDistThresh Int
    | DendogramOnly

    | NameCutoff String
    | NamePrefix Int
    | NameTable String  -- Must come after Cutoff/Prefix

  deriving (Show, Eq, Ord) -- ORD is really used.


parseTabDelim :: String -> Flag
parseTabDelim _str = 
  TabDelimited 8 9
    
options :: [OptDescr Flag]
options =
     [ Option ['v']     ["verbose"] (NoArg Verbose)    "print WARNINGS and other information (recommended at first)"
     , Option ['V']     ["version"] (NoArg Version)    "show version number"

     , Option ['o']     ["output"]  (ReqArg Output "DIR")  "set directory to contain all output files (default \"./phybin_out/\")"
     , Option []     ["selftest"]   (NoArg SelfTest)   "run internal unit tests"
       
{- -- TODO: FIXME: IMPLEMENT THIS:
     , Option []        []          (NoArg NullOpt)  ""
     , Option ['t']     ["tabbed"]  (ReqArg parseTabDelim "NUM1:NUM2")$  "assume the input is a tab-delimited file with gene names \n"++
		                                                        "in column NUM1 and Newick trees in NUM2"
-}       
     , Option []        []          (NoArg NullOpt)  ""
     , Option []        []  (NoArg$ error "internal problem")  "----------------------------- Clustering Options ------------------------------"

     , Option []    ["bin"]      (NoArg BinningMode)$  "Use simple binning, the cheapest form of 'clustering'"
     , Option []    ["single"]   (NoArg$ Cluster C.SingleLinkage)  $  "Use single-linkage clustering (nearest neighbor)"
     , Option []    ["complete"] (NoArg$ Cluster C.CompleteLinkage)$  "Use complete-linkage clustering (furthest neighbor)"
     , Option []    ["UPGMA"]    (NoArg$ Cluster C.UPGMA)          $  "Use Unweighted Pair Group Method (average linkage)"

     , Option []    ["editdist"]  (ReqArg (EditDistThresh . read) "DIST")$
                                  "Combine all clusters separated by DIST or less.  Report a flat list of clusters."
     , Option []    ["dendogram"] (NoArg DendogramOnly)$ "Report a hierarchical clustering (default)"
       
     , Option []        []          (NoArg NullOpt)  ""
     , Option []        []  (NoArg$ error "internal problem")  "----------------------------- Visualization --------------------------------"
     , Option ['g']     ["graphbins"] (NoArg Graph)  "use graphviz to produce .dot and .pdf output files named cluster1.*, cluster2.*, etc"
-- TODO: Produce the consensus tree as well as the individual trees.
     , Option ['d']     ["drawbins"]  (NoArg Draw)   "like -g, but open GUI windows to show a tree for each bin"

     , Option ['w']     ["view"]    (NoArg View)$  "for convenience, \"view mode\" simply displays input Newick files without binning" 

     , Option []        []          (NoArg NullOpt)  ""
     , Option []        []  (NoArg$ error "internal problem")  "---------------------------- Tree pre-processing -----------------------------"

     , Option ['n']     ["numtaxa"] (ReqArg (NumTaxa . read) "NUM") "expect NUM taxa for this dataset"

     , Option ['b']     ["branchcut"] (ReqArg (BranchThresh . read) "LEN") "collapse branches less than LEN"
              
     , Option []        []          (NoArg NullOpt)  ""
     , Option []        []  (NoArg$ error "internal problem")  "--------------------------- Extracting taxa names ----------------------------"
--     , Option ['n']     ["numtaxa"] (ReqArg (NumTaxa . read) "NUM") "expect NUM taxa for this dataset (otherwise it will guess)"
       --  TODO, FIXME: The "guessing" part doesn't actually work yet -- implement it!!
       --  What's a good algorithm?  Insist they all have the same number?  Take the mode?
       
{- -- TODO: FIXME: IMPLEMENT THIS:
     , Option ['f']     ["force"]   (NoArg Force)    "force phybin to consume and bin trees with different numbers of taxa"
-}
     , Option []        []          (NoArg NullOpt)  ""
     , Option ['p']     ["nameprefix"]  (ReqArg (NamePrefix . read) "NUM") $ 
		  "Leaf names in the input Newick trees can be gene names, not taxa.\n"++
    	    	  "Then it is typical to extract taxa names from genes.  This option extracts\n"++
                  "a prefix of NUM characters to serve as the taxa name."

     , Option []        []          (NoArg NullOpt)  ""
     , Option ['s']     ["namesep"]   (ReqArg NameCutoff "STR") $  --"\n"++
		  "An alternative to --nameprefix, STR provides a set of delimeter characters,\n"++
                  "for example '-' or '0123456789'.  The taxa name is then a variable-length\n"++
		  "prefix of each gene name up to but not including any character in STR."

     , Option []        []          (NoArg NullOpt)  ""
     , Option ['m']     ["namemap"]  (ReqArg NameTable "FILE") $ 
		  "Even once prefixes are extracted it may be necessary to use a lookup table\n"++
		  "to compute taxa names, e.g. if multiple genes/plasmids map onto one taxa.\n"++
		  "This option specifies a text file with find/replace entries of the form\n"++
		  "\"<string> <taxaname>\", which are applied AFTER -s and -p."
                  
     , Option []        []          (NoArg NullOpt)  ""
     , Option []        []  (NoArg$ error "internal problem")  "--------------------------- Utility Modes ----------------------------"
     , Option [] ["rfdist"]  (NoArg RFMatrix)        "print a Robinson Foulds distance matrix for the input trees"
     , Option [] ["setdiff"] (NoArg LineSetDiffMode) "for convenience, print the set difference between cluster*.txt files"
     , Option [] ["print"]      (NoArg PrintReg)     "simply print out a concise form of each input tree"       
     , Option [] ["printnorms"] (NoArg PrintNorms)   "simply print out a concise and NORMALIZED form of each input tree"
     ]

usage :: String
usage = "\nUsage: phybin [OPTION...] files or directories...\n\n"++

--        "MODE must be one of 'bin' or 'cluster'.\n\n" ++

        "PhyBin takes Newick tree files as input.  Paths of Newick files can\n"++
        "be passed directly on the command line.  Or, if directories are provided,\n"++
        "all files in those directories will be read.  Taxa are named based on the files\n"++
        "containing them.  If a file contains multiple trees, all are read by phybin, and\n"++  
        "the taxa name then includes a suffix indicating the position in the file:\n"++
        " e.g. FILENAME_0, FILENAME_1, etc.\n"++
        "\n"++          

        -- "In binning mode, Phybin output contains, at minimum, files of the form binXX_YY.tr,\n"++
        -- "each containing a list of input file paths that fall into that bin.  XX signifies\n"++
        -- "the rank of the bin (biggest first), and YY is how many trees it contains.\n"++
        -- "\n"++        

        "When clustering trees, Phybin computes a complete all-to-all Robinson-Foulds distance matrix.\n"++
        "If a threshold distance (tree edit distance) is given, then a flat set of clusters\n"++
        "will be produced in files clusterXX_YY.tr.  Otherwise it produces a full dendogram (UNFINISHED).\n"++
        "\n"++  

        "Binning mode provides an especially quick-and-dirty form of clustering.\n"++
        "When running with the --bin option, only exactly equal trees are put in the same cluster.\n"++
        "Tree pre-processing still applies, however: for example collapsing short branches.\n"++
        "\n"++        

	"USAGE NOTES: \n"++
        " * Currently phybin ignores input trees with the wrong number of taxa.\n"++
	" * If given a directory as input phybin will assume all contained files are Newick trees.\n\n"++

	"\nOptions include:\n"

defaultErr :: [String] -> t
defaultErr errs = error $ "ERROR!\n" ++ (concat errs ++ usageInfo usage options)

--------------------------------------------------------------------------------
-- Aggregated Unit Tests
--------------------------------------------------------------------------------

allUnitTests :: Test
-- allUnitTests = unitTests ++
allUnitTests = test 
  [ Bio.Phylogeny.PhyBin.unitTests
  , Bio.Phylogeny.PhyBin.Parser.unitTests
  , "norm/Bip1" ~: (testNorm prob1)
  ]
--  Bio.Phylogeny.PhyBin.Parser.unitTests

--    "(5_, (19, ((3_, 14), ((2_, 1_), (7_, (6_, 18))))), 13)"
-- [2013.07.23]      
-- This was INCORRECTLY normalizing to:
--     ((1_, 2_), (7_, (18, 6_)), ((14, 3_), (19, (13, 5_))))
prob1 = "(5_, (19, ((3_, 14), ((2_, 1_), (7_, (6_, 18))))), 13);"

-- | Make sure that the normalized version of a tree yields the same bipartitions as
-- the unnormalized one.
testNorm :: String -> IO ()
testNorm str = do
  let (labs,parsed) = parseNewick M.empty id "test" (B.pack str)
      normed = normalize $ annotateWLabLists parsed
      bips1  = allBips parsed
      bips2  = allBips normed
      added   = S.difference bips2 bips1
      removed = S.difference bips1 bips2
      dispBips bip = show$
        map (map (labs M.!)) $ 
        map IS.toList$ S.toList bip
  unless (bips1 == bips2) $ do
    putStrLn$ "Normalized this: "++show (displayDefaultTree $ FullTree "" labs parsed)
    putStrLn$ "To this        : "++show (displayDefaultTree $ deAnnotate $ FullTree "" labs normed)
    error$ "Normalization added and removed these bipartitions, respectively:\n  "
           ++dispBips added ++"\n  "
           ++dispBips removed


--------------------------------------------------------------------------------

main :: IO ()
main = 
  do argv <- getArgs 

     (opts,files) <- 
       case getOpt Permute options argv of
	 (o,n,[]  ) -> return (o,n)
         (_,_,errs) -> defaultErr errs

     all_inputs <- acquireTreeFiles files
     let process_opt cfg opt = case opt of 
	   NullOpt -> return cfg
	   Verbose -> return cfg { verbose= True } 
	   Version -> do putStrLn$ "phybin version "++phybin_version; exitSuccess

	   SelfTest -> do _ <- runTestTT allUnitTests; exitSuccess

     	   RFMatrix -> return cfg { print_rfmatrix= True }

           LineSetDiffMode -> do
             bss <- mapM B.readFile files
             case map (S.fromList . B.lines) bss of
               [set1,set2] -> do let [f1,f2] = files
                                 let diff = S.difference set1 set2
                                 putStrLn$" Found "++show(S.size diff)++" lines occuring in "++f1++" but not "++f2
                                 mapM_ B.putStrLn $ S.toList diff
               oth -> error $"Line set difference mode expects two files as input, got "++show(length oth)
             exitSuccess

           PrintNorms -> return cfg
           PrintReg   -> return cfg           
     
           Cluster lnk -> return cfg { clust_mode = ClusterThem lnk }
           BinningMode -> return cfg { clust_mode = BinThem }
           EditDistThresh n -> return cfg { dist_thresh = Just n }
           DendogramOnly    -> return cfg { dist_thresh = Nothing }
     
	   Output s -> return cfg { output_dir= s }

	   NumTaxa n -> return cfg { num_taxa= n }
     	   BranchThresh n -> return cfg { branch_collapse_thresh= Just n }
	   Graph     -> return cfg { do_graph= True } 
	   Draw	     -> return cfg { do_draw = True } 
	   View      -> return cfg -- Handled below

	   TabDelimited _ _ -> error "tabbed option not handled yet"
	   Force            -> error "force option not handled yet"


	   NameCutoff str -> let chopper = takeWhile (not . flip S.member (S.fromList str)) 
			     in return cfg { name_hack = chopper . name_hack cfg }
	   NamePrefix n   -> return cfg { name_hack = (take n) . name_hack cfg }

           -- This should always be after cutoff/prefix:
	   NameTable file -> do reader <- name_table_reader file
				return cfg { name_hack = reader . name_hack cfg }

     config <- foldM process_opt default_phybin_config{ inputs= all_inputs } 
	             (sort opts) -- NOTE: options processed in sorted order.

     when (null files) $ do
	defaultErr ["No file arguments!\n"]

     ------------------------------------------------------------
     -- This mode kicks in AFTER config options are processed.
     when (elem PrintReg opts) $ do 
       (_,fts) <- parseNewickFiles (name_hack config) all_inputs
       forM_ fts $ \ ft@(FullTree name _ _) -> do
         putStrLn $ "Tree "++show name
         putStrLn$ show$ displayDefaultTree ft
       exitSuccess
     ------------------------------------------------------------
     when (elem PrintNorms opts) $ do 
       (_,fts) <- parseNewickFiles (name_hack config) all_inputs
       forM_ fts $ \ ft@(FullTree name _ _) -> do
         putStrLn $ "Tree , NORMALIZED:"++show name
         putStrLn$ show$ displayDefaultTree$ deAnnotate $
           liftFT (normalize . annotateWLabLists) ft
       exitSuccess
     ------------------------------------------------------------
     when (View `elem` opts) $ do 
       view_graphs config
       exitSuccess
     ------------------------------------------------------------
     -- Otherwise do the main, normal thing:
     driver config

view_graphs :: PhyBinConfig -> IO ()
view_graphs PBC{..} = 
           do chans <- forM inputs $ \ file -> do 
                putStrLn$ "Drawing "++ file ++"...\n"
		str <- B.readFile file
		putStrLn$ "Parsed: " ++ (B.unpack str)
                let (tbl,tr) = parseNewick M.empty name_hack file str
 	        (chan, _tr) <- viewNewickTree file (FullTree "" tbl (annotateWLabLists tr))
	        return chan
	      forM_ chans readChan 
	      return ()


--------------------------------------------------------------------------------
-- Every dataset it seems needs a new hack on the names!

name_table_reader :: String -> IO (String -> String)
name_table_reader file = 
  do contents <- readFile file
     let mp = M.fromList $ 
	      map (\ls -> case ls of 
		           [a,b] -> (a,b)
		           _ -> error$ "Each line of "++file++" must contain two whitespace free strings: "++ unwords ls) $ 
	      filter (not . null) $
	      map tokenize $ 
	      lines contents
     return$ 
       \ name_to_hack -> 

       case M.lookup name_to_hack mp of -- Could use a trie
	     Just x -> x
	     Nothing -> name_to_hack
  where
    tokenize :: String -> [String]
    tokenize line =
      case words line of
        []        -> []
        [one]     -> error$"Name table contained bad line:  "++ show one
        [one,two] -> [one,two]
        (one:rest) -> [one, unwords rest]

temp :: IO ()
temp = driver default_phybin_config{ num_taxa=7, inputs=["../datasets/test.tr"] }


-- 112 and 13
rftest = do 
  (mp,[t1,t2]) <- parseNewickFiles (take 2) ["tests/13.tr", "tests/112.tr"]
  putStrLn$ "Tree 13           : " ++ show (displayDefaultTree t1)
  putStrLn$ "Tree 112          : "++ show (displayDefaultTree t2)

  putStrLn$ "Tree 13 normed    : "++ show (disp t1)
  putStrLn$ "Tree 112 normed   : "++ show (disp t2)

  putStrLn$ "13  collapsed 0.02: " ++show (disp$ liftFT (collapseBranchLenThresh 0.02) t1)
  putStrLn$ "112 collapsed 0.02: " ++show (disp$ liftFT (collapseBranchLenThresh 0.02) t2)

  putStrLn$ "13  collapsed 0.03: " ++show (disp$ liftFT (collapseBranchLenThresh 0.03) t1)
  putStrLn$ "112 collapsed 0.03: " ++show (disp$ liftFT (collapseBranchLenThresh 0.03) t2)  

  let mat = distanceMatrix [nwtree t1, nwtree t2]
  printDistMat stdout mat
  return ()
 where
  disp (FullTree nm labs tr) =
    let collapsed :: AnnotatedTree 
        collapsed = normalize$ annotateWLabLists tr
    in displayDefaultTree$ deAnnotate $  FullTree nm labs collapsed

----------------------------------------------------------------------------------------------------
-- TODO: expose a command line argument for testing.
-- The below test exposed my normalization bug relating to "deriving Ord".
-- I need to transform it into one or more proper unit tests.

main_test :: IO ()
main_test = 
 withArgs ["-w","~/newton_and_newton_local/datasets/yersinia/yersinia_trees/111.dnd","-m","../datasets/yersinia/name_table_hack_yersinia.txt"]
	  main 

-- [2013.07.22] Disabling for the new Label representation:
{-
pa :: NewickTree DefDecor
pa = set_dec (Nothing,1) $ 
    NTInterior () [NTInterior () [NTLeaf () "RE",NTInterior () [NTLeaf () "SD",NTLeaf () "SM"]],NTInterior () [NTLeaf () "BB",NTLeaf () "BJ"],NTInterior () [NTLeaf () "MB",NTLeaf () "ML"]]

pb :: NewickTree DefDecor
pb = set_dec (Nothing,1) $ 
    NTInterior () [NTInterior () [NTLeaf () "BB",NTLeaf () "BJ"],NTInterior () [NTLeaf () "MB",NTLeaf () "ML"],NTInterior () [NTLeaf () "RE",NTInterior () [NTLeaf () "SD",NTLeaf () "SM"]]]

ls1 :: [(String, NewickTree DefDecor)]
ls1 = [("a",pa),("b",pb)]

-- This is one:
num_binned :: Int
num_binned = M.size $ binthem ls1

a_ :: (String, NewickTree DefDecor)
a_ = ("980.dnd",
      fmap (\x -> (Nothing,x)) $ 
      NTInterior 0.0 [NTInterior 5.697e-2 [NTLeaf 3.95e-2 "SM",NTLeaf 5.977e-2 "SD"],NTLeaf 0.13143 "RE",NTInterior 0.13899 [NTInterior 9.019e-2 [NTLeaf 0.11856 "BB",NTLeaf 0.13592 "BJ"],NTInterior 0.13194 [NTLeaf 0.19456 "MB",NTLeaf 0.16603 "ML"]]])

b_ :: (String, NewickTree DefDecor)
b_ = ("999.dnd",
      fmap (\x -> (Nothing,x)) $ 
      NTInterior 0.0 [NTInterior 6.527e-2 [NTInterior 0.13734 [NTLeaf 2.975e-2 "SM",NTLeaf 3.002e-2 "SD"],NTLeaf 0.18443 "RE"],NTInterior 6.621e-2 [NTLeaf 0.16184 "MB",NTLeaf 0.15233 "ML"],NTInterior 0.23143 [NTLeaf 9.192e-2 "BB",NTLeaf 0.10125 "BJ"]])

-- But THIS is two:  ack!
num2 :: Int
num2 = M.size $ binthem [a_,b_]

-- Here's the test that breaks things:
a_norm :: NewickTree StandardDecor
a_norm = normalize (annotateWLabLists$ snd a_)

b_norm_ :: NewickTree (Double, Int, [Label])
b_norm_ = NTInterior (0.0,7,["BB","BJ","MB","ML","RE","SD","SM"])
         [NTInterior (0.23143,2,["BB","BJ"]) [NTLeaf (9.192e-2,1,["BB"]) "BB",NTLeaf (0.10125,1,["BJ"]) "BJ"],NTInterior (6.621e-2,2,["MB","ML"]) [NTLeaf (0.16184,1,["MB"]) "MB",NTLeaf (0.15233,1,["ML"]) "ML"],NTInterior (6.527e-2,3,["RE","SD","SM"]) [NTLeaf (0.18443,1,["RE"]) "RE",NTInterior (0.13734,2,["SD","SM"]) [NTLeaf (3.002e-2,1,["SD"]) "SD",NTLeaf (2.975e-2,1,["SM"]) "SM"]]]

b_norm :: NewickTree StandardDecor
b_norm = fmap (\ (bl,w,ls) -> StandardDecor bl Nothing w ls) b_norm_

d1 :: IO (Chan (), NewickTree StandardDecor)
d1 = viewNewickTree "" a_norm

d2 :: IO (Chan (), NewickTree StandardDecor)
d2 = viewNewickTree "" b_norm

d1_ :: IO ThreadId
d1_ = forkIO $ do runGraphvizCanvas Dot (dotNewickTree_debug "" a_norm) Xlib; return ()
                  
d2_ :: IO ThreadId
d2_ = forkIO $ do runGraphvizCanvas Dot (dotNewickTree_debug "" b_norm) Xlib; return ()


-- | A of a tree with _____ weights attached to it:
withBootstrap :: String
withBootstrap = "((((A8F330_:0.01131438136322714984,(G0GWK2_:0.00568050636963043226,(Q92FV4_:0.00284163304504484121,((B0BVQ5_:0.00319487112504297311,A8GU65_:0.00000122123005994819)74:0.00279881991324161267,(C3PM27_:0.00560787769333294297,C4K2Z0_:0.00559642713265556899)15:0.00000122123005994819)4:0.00000122123005994819)56:0.00276851661606284868)60:0.00283144414216590342)76:0.00886304965525876697,(A8GQC0_:0.05449879836105625541,(A8F0B2_:0.04736199885985507840,Q4UJN9_:0.02648399728559588939)64:0.00905997055810744446)28:0.00323255855543533657)29:0.02237505187863457132,(Q1RGK5_:0.00000122123005994819,A8GYD7_:0.00000122123005994819)100:0.28299884298270094884)100:0.05776841634437222123,(Q9ZC84_:0.00000122123005994819,D5AYH5_:0.00000122123005994819)99:0.00951976341375833368,Q68VM9_:0.04408933524904214141);"

withBootstrap2 :: String
withBootstrap2 = "((((A8F330_:0.01131438136322714984,(G0GWK2_:0.00568050636963043226,(Q92FV4_:0.00284163304504484121,((B0BVQ5_:0.00319487112504297311,A8GU65_:0.00000122123005994819):0.00279881991324161267[74],(C3PM27_:0.00560787769333294297,C4K2Z0_:0.00559642713265556899):0.00000122123005994819[15]):0.00000122123005994819[4]):0.00276851661606284868[56]):0.00283144414216590342[60]):0.00886304965525876697[76],(A8GQC0_:0.05449879836105625541,(A8F0B2_:0.04736199885985507840,Q4UJN9_:0.02648399728559588939):0.00905997055810744446[64]):0.00323255855543533657[28]):0.02237505187863457132[29],(Q1RGK5_:0.00000122123005994819,A8GYD7_:0.00000122123005994819):0.28299884298270094884[100]):0.05776841634437222123[100],(Q9ZC84_:0.00000122123005994819,D5AYH5_:0.00000122123005994819):0.00951976341375833368[99],Q68VM9_:0.04408933524904214141);"
-}