{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE DeriveDataTypeable #-}

-- | This program trains a parameter database for RNAwolf. The user has to
-- take care to give only appropriate training data to the optimizer. The
-- most important rule is to not give any pseudoknotted data. The small
-- helper program "MkTrainingData" should be able to take care of this.
-- "MkTrainingData" is part of BiobaseTrainingData.
--
-- We currently train using an optimization scheme described in:
--
-- Zakov, Shay and Goldberg, Yoav and Elhadad, Michael and Ziv-Ukelson, Michal
-- "Rich Parameterization Improves RNA Structure Prediction"
-- RECOMB 2011
--
-- NOTE It is likely that this will again be extended with other methods in
-- the (near) future. Especially the convex-optimization-based system seems
-- promising (even though the Zakov et al. scheme is itself derived from
-- cvx methods). Right now, this version simply is faster.
--
-- TODO update the DB within IO to save creation / destruction of Params in
-- each iteration
--
-- TODO re-allow co-folding

module Main where

import Control.Applicative
import Control.Arrow
import Control.Monad
import Control.Parallel (pseq)
import Control.Parallel.Strategies
import Data.Function (on)
import Data.List
import Data.List.Split (splitEvery)
import Data.Ord
import qualified Data.Map as M
import qualified Data.Vector.Unboxed as VU
import System.Console.CmdArgs
import System.Random
import Text.Printf

import Biobase.Primary
import Biobase.Secondary.Diagrams
import Biobase.TrainingData
import Biobase.TrainingData.Import

import Statistics.ConfusionMatrix
import Statistics.PerformanceMetrics

import BioInf.Keys
import BioInf.Params as P
import BioInf.Params.Export as P
import BioInf.Params.Import as P
import BioInf.PassiveAggressive
import BioInf.RNAwolf

-- | Entry function

main :: IO ()
main = do
  o@Options{..} <- cmdArgs options
  when (null outDB) $ error "please set --outdb"
  when (null trainingData) $ error "please give at least one training data file with --trainingdata"
  -- read training data
  xs <- id
      . fmap (filter (\TrainingData{..} -> True
                     -- length primary > 20 &&
                     && all (/='&') primary -- no co-folding right now
                     -- length secondary > 5 -- at least 5 basepairs
                     ))
      . fmap (filter (lengthFlt maxLength))
      . fmap concat
      $ mapM fromFile trainingData
  -- read database or use zero-based parameters
  dbIn <- maybe (return . P.fromList . map (+0.01) . P.toList $ P.zeroParams)
                (fmap read . readFile)
                inDB
  -- dbOut <- foldM (foldTD $ length xs) dbIn $ zip xs [1..]
  (dbOut,_) <- foldM (doIteration o xs) (dbIn,[]) [1..numIterations]
  writeFile outDB $ show dbOut

-- | length filter for training data

lengthFlt l TrainingData{..} = maybe True (length primary <) l

-- | One optimizer iteration: shuffle the training data, fold it in chunks of
-- 'parallelism' elements, and keep the running history of accuracy values.

doIteration :: Options -> [TrainingData] -> (P.Params,[Double]) -> Int -> IO (P.Params,[Double])
doIteration o@Options{..} xs' (!p,rhos) !k = do
  xs <- fmap (splitEvery parallelism) $ shuffle xs'
  when (Iteration `elem` verbose) $ do
    putStrLn "\n======================================"
    printf "# INFO iteration: %4d / %4d starting\n" k numIterations
    printf "# INFO folding %d elements, maximal length: %d\n" (length xs') (maximum $ map (length . primary) xs')
    putStrLn "======================================\n"
  -- 1-based (from,to) index ranges of the chunks
  let indices = mapAccumL (\acc x -> (acc+x,(acc+1,acc+x))) 0 $ map length xs
  (newp,rs) <- foldM (foldTD o $ length xs) (p,[]) . zip xs . snd $ indices
  -- total parameter change of this iteration (currently only computed)
  let drctch = sum $ zipWith (\x y -> abs $ x-y) (P.toList p) (P.toList newp)
  let rhosum = sum $ map accMeas rs
  let rho = rhosum / (sum . map genericLength $ xs)
  when (Iteration `elem` verbose) $ do
    putStrLn "\n======================================"
    printf "# INFO iteration: %4d / %4d ended, rho: %4.2f (%4.2f, %4.2f)\n" k numIterations rho (minimum $ map accMeas rs) (maximum $ map accMeas rs)
    putStr "# INFO rho history:"
    zipWithM_ (printf " %4d %4.2f") [1::Int ..] $ rhos++[rho]
    putStrLn ""
    putStrLn "======================================\n"
  writeFile (printf "%04d.db" k) . show $ newp
  return (newp,rhos++[rho])
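-- 'foldOne' below feeds each prediction into the passive-aggressive update
-- from "BioInf.PassiveAggressive". As rough orientation, here is a minimal,
-- self-contained sketch of the PA-I update rule that the Zakov et al. scheme
-- builds on. The names 'paSketch', 'phiKnown', 'phiPred' are illustrative
-- only and not part of RNAwolf; the real 'defaultPA' works on 'P.Params' and
-- sparse feature indices rather than plain lists.

paSketch
  :: Double    -- ^ aggressiveness: upper bound on the step size tau
  -> Double    -- ^ loss suffered by the current prediction
  -> [Double]  -- ^ current feature weights
  -> [Double]  -- ^ feature counts of the known structure
  -> [Double]  -- ^ feature counts of the predicted structure
  -> [Double]  -- ^ updated weights
paSketch aggr loss w phiKnown phiPred = zipWith (+) w (map (tau *) dPhi)
  where
    dPhi  = zipWith (-) phiKnown phiPred     -- move towards the known structure
    norm2 = sum (map (^(2::Int)) dPhi)       -- squared norm of the direction
    tau   = min aggr (loss / max 1e-9 norm2) -- capped step size (PA-I)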
-- | Fold one 'TrainingData' element and return the suggested changes and
-- additional information.

foldOne :: Options -> P.Params -> TrainingData -> (TrainingData,PA)
foldOne o@Options{..} p td
  | null bs   = (td , PA [] 0 0 ["no prediction for: " ++ primary td])
  | otherwise = (fst worst, ret)
  where
    pri    = mkPrimary $ primary td
    tables = rnaWolf p pri
    -- backtrack one candidate, or up to 'maxLoss' candidates for max-loss
    bs = let f x = td{predicted = x}
         in  map (first f) . take (maybe 1 id maxLoss) $ rnaWolfBacktrack p pri 0.001 tables
    -- the candidate with the worst F-measure against the known structure
    worst = minimumBy (comparing (fmeasure . mkConfusionMatrix . fst)) bs
    runPA (x,score) = defaultPA aggressiveness p $ x
      { comments = [ show score
                   , simpleViewer (primary x) $ secondary x
                   , simpleViewer (primary x) $ predicted x
                   , show $ predicted x
                   ]
      }
    ret = runPA worst

-- | Folding of 'TrainingData' elements.

foldTD :: Options -> Int -> (P.Params,[PA]) -> ([TrainingData],(Int,Int)) -> IO (P.Params,[PA])
foldTD o@Options{..} total (!p,oldresults) (ts,(f,t)) = do
  -- At this point, we trade the most effective optimization for increased
  -- parallelism, whenever 'parallelism' is set to more than one.
  let parfolds = map (foldOne o p) ts
  let !results = let xs = map snd parfolds in xs `using` (parList rdeepseq)
  let cs  = concatMap changes results
  let cur = VU.fromList . P.toList $ p
  let new = P.fromList . VU.toList $ VU.accum (\v pm -> v+pm) cur cs
  let rhosum  = sum $ map accMeas results
  let rho     = rhosum / genericLength ts
  let rhosumR = sum . map accMeas $ oldresults ++ results
  let rhoR    = rhosumR / genericLength (oldresults ++ results)
  when (Single `elem` verbose) $ do
    printf "# INFO parallel: %4d - %4d, avg.rho: %4.2f, running rho: %4.2f\n" f t rho rhoR
  -- detailed information on each folded structure
  mapM_ (printDetailed o . fst) parfolds
  return $ pseq (rdeepseq results) (new, oldresults ++ results)

-- | simple viewer...

simpleViewer s xs = foldl f (replicate (length s) '.') xs where
  f str ((i,j),_) = upd ')' j $ upd '(' i str
  upd c k str
    | l=='(' && c=='(' = pre ++ "<" ++ post
    | l==')' && c==')' = pre ++ ">" ++ post
    | l/='.'           = pre ++ "X" ++ post
    | otherwise        = pre ++ [c] ++ post
    where
      pre  = take k str
      l    = head $ drop k str
      post = drop (k+1) str
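-- A small worked example of the viewer (doctest-style; the '()' annotation
-- stands in for whatever pair annotation the structures carry): base pairs
-- (0,8) and (1,7) on a sequence of length 9 render as nested brackets, while
-- conflicting pairs would show up as '<', '>', or 'X'.
--
-- >>> simpleViewer "GGGAAACCC" [((0,8),()),((1,7),())]
-- "((.....))"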
-- | print out detailed information on a folded candidate

printDetailed :: Options -> TrainingData -> IO ()
printDetailed Options{..} x = do
  when (Detailed `elem` verbose) $ do
    -- index ruler, then sequence, known structure, and prediction
    putStrLn $ take (length $ primary x) . concatMap show . concat . repeat $ [0..9]
    putStrLn $ primary x
    putStrLn $ simpleViewer (primary x) $ secondary x
    putStrLn $ simpleViewer (primary x) $ predicted x
  when (AllPairs `elem` verbose) $ do
    mapM_ print $ predicted x

-- ** program options

data Options = Options
  { inDB           :: Maybe FilePath
  , outDB          :: FilePath
  , trainingData   :: [FilePath]
  , maxLength      :: Maybe Int
  , numIterations  :: Int
  , verbose        :: [Verbose]
  , maxLoss        :: Maybe Int
  , aggressiveness :: Double
  , errorOnError   :: Bool
  , parallelism    :: Int
  } deriving (Show,Data,Typeable)

data Verbose = Iteration | Single | Detailed | AllPairs
  deriving (Show,Data,Typeable,Eq)

options = Options
  { inDB = Nothing &= help "database from which to continue optimizing; if none is given, start from scratch"
  , outDB = "" &= help "new database to write out"
  , trainingData = [] &= help "training data elements to read"
  , maxLength = Nothing &= help "[dev] train only on elements of the given length or shorter"
  , numIterations = 50 &= help "how many optimizer iterations to run"
  , verbose = [] &= help "select verbosity options: single, iteration, detailed, allpairs (each switches on different output)"
  , maxLoss = Nothing &= help "use max-loss optimization instead of prediction-based; requires the maximal number of backtracked candidates to search through (default: not used)"
  , aggressiveness = 1 &= help "maximal tau for each round"
  , errorOnError = False &= help "abort if an error is detected (default: false)"
  , parallelism = 1 &= help "perform more than one prediction concurrently; will probably reduce the effectiveness of the algorithm, but allows the use of more than one core; call with +RTS -N -RTS"
  }

-- ** helper functions

-- | Simple shuffling of a list: repeatedly pick a random element and put it
-- in front of the shuffled remainder. Quadratic, but fine for typical
-- training set sizes.

shuffle :: [a] -> IO [a]
shuffle [] = return []
shuffle xs = do
  r <- getStdRandom (randomR (0,length xs - 1))
  let (hs,ts) = splitAt r xs
  let y = head ts
  ys <- shuffle $ hs ++ tail ts
  return $ y : ys
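-- A typical invocation could look as follows (a sketch: the binary name and
-- file names are assumptions, and the flag names are those cmdargs derives
-- from the record fields above):
--
-- > RNAwolfTrain --outdb new.db --trainingdata data.td --numiterations 50 \
-- >   --parallelism 4 +RTS -N4 -RTS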