{-# OPTIONS_GHC -fno-warn-orphans #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE TupleSections #-} module HackageBenchmark ( hackageBenchmarkMain -- Exposed for testing: , CabalResult(..) , isSignificantTimeDifference , combineTrialResults , isSignificantResult , shouldContinueAfterFirstTrial ) where import Control.Monad (forM_, replicateM, unless, when) import qualified Data.ByteString as B import Data.List (nub, unzip4) import Data.Maybe (isJust) import Data.Monoid ((<>)) import Data.String (fromString) import Data.Time (NominalDiffTime, diffUTCTime, getCurrentTime) import qualified Data.Vector.Unboxed as V import Options.Applicative import Statistics.Sample (mean, stdDev) import Statistics.Test.MannWhitneyU ( PositionTest(..), TestResult(..) , mannWhitneyUCriticalValue , mannWhitneyUtest) import Statistics.Types (PValue, mkPValue) import System.Exit (ExitCode(..), exitFailure) import System.IO ( BufferMode(LineBuffering), hPutStrLn, hSetBuffering, stderr , stdout) import System.Process ( StdStream(CreatePipe), CreateProcess(..), callProcess , createProcess, readProcess, shell, waitForProcess ) import Text.Printf (printf) import Distribution.Package (PackageName, mkPackageName, unPackageName) data Args = Args { argCabal1 :: FilePath , argCabal2 :: FilePath , argCabal1Flags :: [String] , argCabal2Flags :: [String] , argPackages :: [PackageName] , argMinRunTimeDifferenceToRerun :: Double , argPValue :: PValue Double , argTrials :: Int , argPrintTrials :: Bool , argPrintSkippedPackages :: Bool , argTimeoutSeconds :: Int } data CabalTrial = CabalTrial NominalDiffTime CabalResult data CabalResult = Solution | NoInstallPlan | BackjumpLimit | PkgNotFound | Timeout | Unknown deriving (Eq, Show) hackageBenchmarkMain :: IO () hackageBenchmarkMain = do hSetBuffering stdout LineBuffering args@Args {..} <- execParser parserInfo checkArgs args printConfig args pkgs <- getPackages args putStrLn "" let -- The maximum length of the heading and package names. nameColumnWidth :: Int nameColumnWidth = maximum $ map length $ "package" : map unPackageName pkgs runCabal1 = runCabal argTimeoutSeconds argCabal1 argCabal1Flags runCabal2 = runCabal argTimeoutSeconds argCabal2 argCabal2Flags -- When the output contains both trails and summaries, label each row as -- "trial" or "summary". when argPrintTrials $ putStr $ printf "%-16s " "trial/summary" putStrLn $ printf "%-*s %-13s %-13s %11s %11s %11s %11s %11s" nameColumnWidth "package" "result1" "result2" "mean1" "mean2" "stddev1" "stddev2" "speedup" forM_ pkgs $ \pkg -> do let printTrial msgType result1 result2 time1 time2 = putStrLn $ printf "%-16s %-*s %-13s %-13s %10.3fs %10.3fs" msgType nameColumnWidth (unPackageName pkg) (show result1) (show result2) (diffTimeToDouble time1) (diffTimeToDouble time2) CabalTrial t1 r1 <- runCabal1 pkg CabalTrial t2 r2 <- runCabal2 pkg if not $ shouldContinueAfterFirstTrial argMinRunTimeDifferenceToRerun t1 t2 r1 r2 then when argPrintSkippedPackages $ if argPrintTrials then printTrial "trial (skipping)" r1 r2 t1 t2 else putStrLn $ printf "%-*s (first run times were too similar)" nameColumnWidth (unPackageName pkg) else do when argPrintTrials $ printTrial "trial" r1 r2 t1 t2 (ts1, ts2, rs1, rs2) <- (unzip4 . ((t1, t2, r1, r2) :) <$>) . replicateM (argTrials - 1) $ do CabalTrial t1' r1' <- runCabal1 pkg CabalTrial t2' r2' <- runCabal2 pkg when argPrintTrials $ printTrial "trial" r1' r2' t1' t2' return (t1', t2', r1', r2') let result1 = combineTrialResults rs1 result2 = combineTrialResults rs2 times1 = V.fromList (map diffTimeToDouble ts1) times2 = V.fromList (map diffTimeToDouble ts2) mean1 = mean times1 mean2 = mean times2 stddev1 = stdDev times1 stddev2 = stdDev times2 speedup = mean1 / mean2 when argPrintTrials $ putStr $ printf "%-16s " "summary" if isSignificantResult result1 result2 || isSignificantTimeDifference argPValue ts1 ts2 then putStrLn $ printf "%-*s %-13s %-13s %10.3fs %10.3fs %10.3fs %10.3fs %10.3f" nameColumnWidth (unPackageName pkg) (show result1) (show result2) mean1 mean2 stddev1 stddev2 speedup else when (argPrintTrials || argPrintSkippedPackages) $ putStrLn $ printf "%-*s (not significant)" nameColumnWidth (unPackageName pkg) where checkArgs :: Args -> IO () checkArgs Args {..} = do let die msg = hPutStrLn stderr msg >> exitFailure unless (argTrials > 0) $ die "--trials must be greater than 0." unless (argMinRunTimeDifferenceToRerun >= 0) $ die "--min-run-time-percentage-difference-to-rerun must be non-negative." unless (isSampleLargeEnough argPValue argTrials) $ die "p-value is too small for the number of trials." printConfig :: Args -> IO () printConfig Args {..} = do putStrLn "Comparing:" putStrLn $ "1: " ++ argCabal1 ++ " " ++ unwords argCabal1Flags callProcess argCabal1 ["--version"] putStrLn $ "2: " ++ argCabal2 ++ " " ++ unwords argCabal2Flags callProcess argCabal2 ["--version"] -- TODO: Print index state. putStrLn "Base package database:" callProcess "ghc-pkg" ["list"] getPackages :: Args -> IO [PackageName] getPackages Args {..} = do pkgs <- if null argPackages then do putStrLn $ "Obtaining the package list (using " ++ argCabal1 ++ ") ..." list <- readProcess argCabal1 ["list", "--simple-output"] "" return $ nub [mkPackageName $ head (words line) | line <- lines list] else do putStrLn "Using given package list ..." return argPackages putStrLn $ "Done, got " ++ show (length pkgs) ++ " packages." return pkgs runCabal :: Int -> FilePath -> [String] -> PackageName -> IO CabalTrial runCabal timeoutSeconds cabal flags pkg = do ((exitCode, err), time) <- timeEvent $ do let timeout = "timeout --foreground -sINT " ++ show timeoutSeconds cabalCmd = unwords $ [cabal, "install", unPackageName pkg, "--dry-run", "-v0"] ++ flags cmd = (shell (timeout ++ " " ++ cabalCmd)) { std_err = CreatePipe } -- TODO: Read stdout and compare the install plans. (_, _, Just errh, ph) <- createProcess cmd err <- B.hGetContents errh (, err) <$> waitForProcess ph let exhaustiveMsg = "After searching the rest of the dependency tree exhaustively" result | exitCode == ExitSuccess = Solution | exitCode == ExitFailure 124 = Timeout | fromString exhaustiveMsg `B.isInfixOf` err = NoInstallPlan | fromString "Backjump limit reached" `B.isInfixOf` err = BackjumpLimit | fromString "There is no package named" `B.isInfixOf` err = PkgNotFound | otherwise = Unknown return (CabalTrial time result) isSampleLargeEnough :: PValue Double -> Int -> Bool isSampleLargeEnough pvalue trials = -- mannWhitneyUCriticalValue, which can fail with too few samples, is only -- used when both sample sizes are less than or equal to 20. trials > 20 || isJust (mannWhitneyUCriticalValue (trials, trials) pvalue) isSignificantTimeDifference :: PValue Double -> [NominalDiffTime] -> [NominalDiffTime] -> Bool isSignificantTimeDifference pvalue xs ys = let toVector = V.fromList . map diffTimeToDouble in case mannWhitneyUtest SamplesDiffer pvalue (toVector xs) (toVector ys) of Nothing -> error "not enough data for mannWhitneyUtest" Just Significant -> True Just NotSignificant -> False -- Should we stop after the first trial of this package to save time? This -- function skips the package if the results are uninteresting and the times are -- within --min-run-time-percentage-difference-to-rerun. shouldContinueAfterFirstTrial :: Double -> NominalDiffTime -> NominalDiffTime -> CabalResult -> CabalResult -> Bool shouldContinueAfterFirstTrial 0 _ _ _ _ = True shouldContinueAfterFirstTrial _ _ _ Timeout Timeout = False shouldContinueAfterFirstTrial maxRunTimeDifferenceToIgnore t1 t2 r1 r2 = isSignificantResult r1 r2 || abs (t1 - t2) / min t1 t2 >= realToFrac (maxRunTimeDifferenceToIgnore / 100) isSignificantResult :: CabalResult -> CabalResult -> Bool isSignificantResult r1 r2 = r1 /= r2 || not (isExpectedResult r1) -- Is this result expected in a benchmark run on all of Hackage? isExpectedResult :: CabalResult -> Bool isExpectedResult Solution = True isExpectedResult NoInstallPlan = True isExpectedResult BackjumpLimit = True isExpectedResult Timeout = True isExpectedResult PkgNotFound = False isExpectedResult Unknown = False -- Combine CabalResults from multiple trials. Ignoring timeouts, all results -- should be the same. If they aren't the same, we returns Unknown. combineTrialResults :: [CabalResult] -> CabalResult combineTrialResults rs | allEqual rs = head rs | allEqual [r | r <- rs, r /= Timeout] = Timeout | otherwise = Unknown where allEqual :: Eq a => [a] -> Bool allEqual xs = length (nub xs) == 1 timeEvent :: IO a -> IO (a, NominalDiffTime) timeEvent task = do start <- getCurrentTime r <- task end <- getCurrentTime return (r, diffUTCTime end start) diffTimeToDouble :: NominalDiffTime -> Double diffTimeToDouble = fromRational . toRational parserInfo :: ParserInfo Args parserInfo = info (argParser <**> helper) ( fullDesc <> progDesc ("Find differences between two cabal commands when solving" ++ " for all packages on Hackage.") <> header "hackage-benchmark" ) argParser :: Parser Args argParser = Args <$> strOption ( long "cabal1" <> metavar "PATH" <> help "First cabal executable") <*> strOption ( long "cabal2" <> metavar "PATH" <> help "Second cabal executable") <*> option (words <$> str) ( long "cabal1-flags" <> value [] <> metavar "FLAGS" <> help "Extra flags for the first cabal executable") <*> option (words <$> str) ( long "cabal2-flags" <> value [] <> metavar "FLAGS" <> help "Extra flags for the second cabal executable") <*> option (map mkPackageName . words <$> str) ( long "packages" <> value [] <> metavar "PACKAGES" <> help ("Space separated list of packages to test, or all of Hackage" ++ " if unspecified")) <*> option auto ( long "min-run-time-percentage-difference-to-rerun" <> showDefault <> value 0.0 <> metavar "PERCENTAGE" <> help ("Stop testing a package when the difference in run times in" ++ " the first trial are within this percentage, in order to" ++ " save time")) <*> option (mkPValue <$> auto) ( long "pvalue" <> showDefault <> value (mkPValue 0.05) <> metavar "DOUBLE" <> help ("p-value used to determine whether to print the results for" ++ " each package")) <*> option auto ( long "trials" <> showDefault <> value 10 <> metavar "N" <> help "Number of trials for each package") <*> switch ( long "print-trials" <> help "Whether to include the results from individual trials in the output") <*> switch ( long "print-skipped-packages" <> help "Whether to include skipped packages in the output") <*> option auto ( long "timeout" <> showDefault <> value 90 <> metavar "SECONDS" <> help "Maximum time to run a cabal command, in seconds")