{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE DeriveDataTypeable #-}

-- | This program trains a parameter database for RNAwolf. The user has to
-- take care to give only appropriate training data to the optimizer. The
-- most important rule is to not give any pseudoknotted data. The small
-- helper program "MkTrainingData" should be able to take care of this.
-- "MkTrainingData" is part of BiobaseTrainingData.
--
-- We currently train using an optimization scheme described in:
--
-- Zakov, Shay and Goldberg, Yoav and Elhadad, Michael and Ziv-Ukelson, Michal
-- "Rich Parameterization Improves RNA Structure Prediction"
-- RECOMB 2011
--
-- NOTE It is likely that this will again be extended with other methods in
-- the (near) future. Especially the convex-optimization-based system seems
-- promising (even though the Zakov et al. scheme is itself derived from
-- cvx methods). Right now, this version simply is faster.
--
-- TODO update the DB within IO to save creation / destruction of Params in
-- each iteration
--
-- TODO re-allow co-folding

module Main where

import Control.Applicative
import Control.Arrow
import Control.Monad
import Control.Parallel (pseq)
import Control.Parallel.Strategies
import Data.Function (on)
import Data.List
import Data.List.Split (splitEvery)
import Data.Ord
import qualified Data.Map as M
import qualified Data.Vector.Unboxed as VU
import System.Console.CmdArgs
import System.Random
import Text.Printf

import Biobase.Primary
import Biobase.Secondary.Diagrams
import Biobase.TrainingData
import Biobase.TrainingData.Import

import Statistics.ConfusionMatrix
import Statistics.PerformanceMetrics

import BioInf.Keys
import BioInf.Params as P
import BioInf.Params.Export as P
import BioInf.Params.Import as P
import BioInf.PassiveAggressive
import BioInf.RNAwolf

-- | Entry function

main :: IO ()
main = do
  o@Options{..} <- cmdArgs options
  when (null outDB) $ error "please set --outdb"
  when (null trainingData) $ error "please give at least one training data file with --trainingdata"
  -- read training data
  xs <- id
      . fmap (filter (\TrainingData{..} -> True
                     -- length primary > 20 &&
                     && all (/='&') primary -- no co-folding right now
                     -- length secondary > 5 -- at least 5 basepairs
                     ))
      . fmap (filter (lengthFlt maxLength))
      . fmap concat
      $ mapM fromFile trainingData
  -- read database or use zero-based parameters
  dbIn <- maybe (return . P.fromList . map (+0.01) . P.toList $ P.zeroParams)
                (fmap read . readFile)
                inDB
  -- dbOut <- foldM (foldTD $ length xs) dbIn $ zip xs [1..]
  (dbOut,_) <- foldM (doIteration o xs) (dbIn,[]) [1..numIterations]
  writeFile outDB $ show dbOut

-- | length filter for training data

lengthFlt l TrainingData{..} = maybe True (length primary <) l

-- | One optimizer iteration: shuffle the training data, fold it in chunks of
-- 'parallelism' elements, and keep the running history of accuracy values.

doIteration :: Options -> [TrainingData] -> (P.Params,[Double]) -> Int -> IO (P.Params,[Double])
doIteration o@Options{..} xs' (!p,rhos) !k = do
  xs <- fmap (splitEvery parallelism) $ shuffle xs'
  when (Iteration `elem` verbose) $ do
    putStrLn "\n======================================"
    printf "# INFO iteration: %4d / %4d starting\n" k numIterations
    printf "# INFO folding %d elements, maximal length: %d\n" (length xs') (maximum $ map (length . primary) xs')
    putStrLn "======================================\n"
  -- 1-based (from,to) index ranges of the chunks
  let indices = mapAccumL (\acc x -> (acc+x,(acc+1,acc+x))) 0 $ map length xs
  (newp,rs) <- foldM (foldTD o $ length xs) (p,[]) . zip xs . snd $ indices
  -- total parameter change of this iteration (currently only computed)
  let drctch = sum $ zipWith (\x y -> abs $ x-y) (P.toList p) (P.toList newp)
  let rhosum = sum $ map accMeas rs
  let rho = rhosum / (sum . map genericLength $ xs)
  when (Iteration `elem` verbose) $ do
    putStrLn "\n======================================"
    printf "# INFO iteration: %4d / %4d ended, rho: %4.2f (%4.2f, %4.2f)\n" k numIterations rho (minimum $ map accMeas rs) (maximum $ map accMeas rs)
    putStr "# INFO rho history:"
    zipWithM_ (printf " %4d %4.2f") [1::Int ..] $ rhos++[rho]
    putStrLn ""
    putStrLn "======================================\n"
  writeFile (printf "%04d.db" k) . show $ newp
  return (newp,rhos++[rho])
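-- 'foldOne' below feeds each prediction into the passive-aggressive update
-- from "BioInf.PassiveAggressive". As rough orientation, here is a minimal,
-- self-contained sketch of the PA-I update rule that the Zakov et al. scheme
-- builds on. The names 'paSketch', 'phiKnown', 'phiPred' are illustrative
-- only and not part of RNAwolf; the real 'defaultPA' works on 'P.Params' and
-- sparse feature indices rather than plain lists.

paSketch
  :: Double    -- ^ aggressiveness: upper bound on the step size tau
  -> Double    -- ^ loss suffered by the current prediction
  -> [Double]  -- ^ current feature weights
  -> [Double]  -- ^ feature counts of the known structure
  -> [Double]  -- ^ feature counts of the predicted structure
  -> [Double]  -- ^ updated weights
paSketch aggr loss w phiKnown phiPred = zipWith (+) w (map (tau *) dPhi)
  where
    dPhi  = zipWith (-) phiKnown phiPred     -- move towards the known structure
    norm2 = sum (map (^(2::Int)) dPhi)       -- squared norm of the direction
    tau   = min aggr (loss / max 1e-9 norm2) -- capped step size (PA-I)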
-- | Fold one 'TrainingData' element and return the suggested changes and
-- additional information.

foldOne :: Options -> P.Params -> TrainingData -> (TrainingData,PA)
foldOne o@Options{..} p td
  | null bs   = (td , PA [] 0 0 ["no prediction for: " ++ primary td])
  | otherwise = (fst worst, ret)
  where
    pri    = mkPrimary $ primary td
    tables = rnaWolf p pri
    -- backtrack one candidate, or up to 'maxLoss' candidates for max-loss
    bs = let f x = td{predicted = x}
         in  map (first f) . take (maybe 1 id maxLoss) $ rnaWolfBacktrack p pri 0.001 tables
    -- the candidate with the worst F-measure against the known structure
    worst = minimumBy (comparing (fmeasure . mkConfusionMatrix . fst)) bs
    runPA (x,score) = defaultPA aggressiveness p $ x
      { comments = [ show score
                   , simpleViewer (primary x) $ secondary x
                   , simpleViewer (primary x) $ predicted x
                   , show $ predicted x
                   ]
      }
    ret = runPA worst

-- | Folding of 'TrainingData' elements.

foldTD :: Options -> Int -> (P.Params,[PA]) -> ([TrainingData],(Int,Int)) -> IO (P.Params,[PA])
foldTD o@Options{..} total (!p,oldresults) (ts,(f,t)) = do
  -- At this point, we trade the most effective optimization for increased
  -- parallelism, whenever 'parallelism' is set to more than one.
  let parfolds = map (foldOne o p) ts
  let !results = let xs = map snd parfolds in xs `using` (parList rdeepseq)
  let cs  = concatMap changes results
  let cur = VU.fromList . P.toList $ p
  let new = P.fromList . VU.toList $ VU.accum (\v pm -> v+pm) cur cs
  let rhosum  = sum $ map accMeas results
  let rho     = rhosum / genericLength ts
  let rhosumR = sum . map accMeas $ oldresults ++ results
  let rhoR    = rhosumR / genericLength (oldresults ++ results)
  when (Single `elem` verbose) $ do
    printf "# INFO parallel: %4d - %4d, avg.rho: %4.2f, running rho: %4.2f\n" f t rho rhoR
  -- detailed information on each folded structure
  mapM_ (printDetailed o . fst) parfolds
  return $ pseq (rdeepseq results) (new, oldresults ++ results)

-- | simple viewer...

simpleViewer s xs = foldl f (replicate (length s) '.') xs where
  f str ((i,j),_) = upd ')' j $ upd '(' i str
  upd c k str
    | l=='(' && c=='(' = pre ++ "<" ++ post
    | l==')' && c==')' = pre ++ ">" ++ post
    | l/='.'           = pre ++ "X" ++ post
    | otherwise        = pre ++ [c] ++ post
    where
      pre  = take k str
      l    = head $ drop k str
      post = drop (k+1) str
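-- A small worked example of the viewer (doctest-style; the '()' annotation
-- stands in for whatever pair annotation the structures carry): base pairs
-- (0,8) and (1,7) on a sequence of length 9 render as nested brackets, while
-- conflicting pairs would show up as '<', '>', or 'X'.
--
-- >>> simpleViewer "GGGAAACCC" [((0,8),()),((1,7),())]
-- "((.....))"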
-- | print out detailed information on a folded candidate

printDetailed :: Options -> TrainingData -> IO ()
printDetailed Options{..} x = do
  when (Detailed `elem` verbose) $ do
    -- index ruler, then sequence, known structure, and prediction
    putStrLn $ take (length $ primary x) . concatMap show . concat . repeat $ [0..9]
    putStrLn $ primary x
    putStrLn $ simpleViewer (primary x) $ secondary x
    putStrLn $ simpleViewer (primary x) $ predicted x
  when (AllPairs `elem` verbose) $ do
    mapM_ print $ predicted x

-- ** program options

data Options = Options
  { inDB           :: Maybe FilePath
  , outDB          :: FilePath
  , trainingData   :: [FilePath]
  , maxLength      :: Maybe Int
  , numIterations  :: Int
  , verbose        :: [Verbose]
  , maxLoss        :: Maybe Int
  , aggressiveness :: Double
  , errorOnError   :: Bool
  , parallelism    :: Int
  } deriving (Show,Data,Typeable)

data Verbose = Iteration | Single | Detailed | AllPairs
  deriving (Show,Data,Typeable,Eq)

options = Options
  { inDB = Nothing &= help "database from which to continue optimizing; if none is given, start from scratch"
  , outDB = "" &= help "new database to write out"
  , trainingData = [] &= help "training data elements to read"
  , maxLength = Nothing &= help "[dev] train only on elements of the given length or shorter"
  , numIterations = 50 &= help "how many optimizer iterations to run"
  , verbose = [] &= help "select verbosity options: single, iteration, detailed, allpairs (each switches on different output)"
  , maxLoss = Nothing &= help "use max-loss optimization instead of prediction-based; requires the maximal number of backtracked candidates to search through (default: not used)"
  , aggressiveness = 1 &= help "maximal tau for each round"
  , errorOnError = False &= help "abort if an error is detected (default: false)"
  , parallelism = 1 &= help "perform more than one prediction concurrently; will probably reduce the effectiveness of the algorithm, but allows the use of more than one core; call with +RTS -N -RTS"
  }

-- ** helper functions

-- | Simple shuffling of a list: repeatedly pick a random element and put it
-- in front of the shuffled remainder. Quadratic, but fine for typical
-- training set sizes.

shuffle :: [a] -> IO [a]
shuffle [] = return []
shuffle xs = do
  r <- getStdRandom (randomR (0,length xs - 1))
  let (hs,ts) = splitAt r xs
  let y = head ts
  ys <- shuffle $ hs ++ tail ts
  return $ y : ys
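-- A typical invocation could look as follows (a sketch: the binary name and
-- file names are assumptions, and the flag names are those cmdargs derives
-- from the record fields above):
--
-- > RNAwolfTrain --outdb new.db --trainingdata data.td --numiterations 50 \
-- >   --parallelism 4 +RTS -N4 -RTS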