{-- Benchmarks. We re-use the 'Language' generator of thr test suite and compare how long 'SimpleParser' and 'Parsec' take to parse all words of a language, which is written to a file before parsing, to ensure that both parsers start from an equally-evaluated input. --} {-# LANGUAGE OverloadedStrings, RankNTypes, FlexibleContexts, TypeApplications #-} import GenLanguage import GenCsv import qualified Test.QuickCheck as Q import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.IO as TIO import Data.Set (Set) import qualified Data.Set as Set import Data.Void (Void) import qualified Data.Csv as Cassava import qualified Data.ByteString.Lazy as B import qualified Data.Foldable as Fold import Text.Megaparsec import Text.Megaparsec.Char.Lexer (decimal,signed) import Text.Megaparsec.Simple import Test.Tasty.Bench (Benchmark,bench,bgroup,bcompare,envWithCleanup,nfIO,defaultMain) import System.Directory (removeFile) import System.IO.Temp (emptySystemTempFile) import Control.DeepSeq (NFData(..)) import Control.Monad (mzero) -- * hard-coded test benchmark parameters -- | size parameters passed to the language generator 'randomLanguages' useLangSizes :: [Int] useLangSizes = three `mtimes` (10:[100,120..200]) where mtimes :: Monad m => m () -> m a -> m a mtimes scalar m = m >>= (\a -> fmap (const a) scalar) three = [(),(),()] -- generate three languages of each size -- | number of rows in randomly generated CSV file csvNumRows :: Int csvNumRows = 100000 :: Int -- | number of randomly generated CSV tabke structures csvNumStructures :: Int csvNumStructures = 5 -- * parser benchmark type data ParserBenchmark a = ParserBenchmark { parseFromFile :: FilePath, withParsec :: IO [a], withSimpleParse :: IO [a], withCassava :: IO [a] } instance NFData (ParserBenchmark a) where rnf = rnf . parseFromFile -- ^ 'envWithCleanup' requires its type parameter to be member of 'NFData' bot IO actions do not have a normal form. It is enough if the 'testFile' is written. genBenchmark :: NFData b => (a -> String) -> (a -> IO (ParserBenchmark b)) -> a -> Benchmark genBenchmark name toBenchmark a = envWithCleanup setUp cleanUp genBench where setUp = toBenchmark a cleanUp = removeFile . parseFromFile genBench pb = bgroup (name a) [ (bench "Parsec" (nfIO (withParsec pb))), (bcompare "Parsec" (bench "SimpleParser" (nfIO (withSimpleParse pb)))), (bcompare "Parsec" (bench "Cassava" (nfIO (withCassava pb))))] -- * benchmark language parsers writeAllWords :: FilePath -> Language (Set Text) -> IO () writeAllWords file = TIO.writeFile file . foldl1 (\txt w -> txt <> "\n" <> w) . allWords -- | read the file written by 'writeAllWords' -- and parse using the 'wordsParser', -- printing the parsed words to stdout. -- -- @ -- parseAllWordsWith (runParser @Void) -- parseAllWordsWith simpleParse -- @ parseAllWordsWith :: (MonadParsec Void Text p) => (forall a. p a -> String -> Text -> result a) -> Language x -> FilePath -> IO (result [Text]) parseAllWordsWith runP lang testFile = fmap (runP (wordsParser lang) testFile) (TIO.readFile testFile) -- | we keep this paremetric so that -- we don't have to mention the megaparsec error type -- which changed through library versions. ignoreParsecError :: Either err [a] -> [a] ignoreParsecError = either (const []) id ignoreNothing :: Maybe [a] -> [a] ignoreNothing = maybe [] id -- | generate a parser for words separated by newline characters wordsParser :: MonadParsec Void Text p => Language x -> p [Text] wordsParser lang = let langWord = genParser lang in (langWord `sepBy` single '\n') <* eof simpleParse :: SimpleParser Text a -> String -> Text -> Maybe a simpleParse p _ s = fmap fst (runSimpleParser p s) langDepth :: Language x -> Int langDepth (Word _) = 0 langDepth (Choice _ x y) = 1 + max (langDepth x) (langDepth y) langDepth (Concat _ x y) = 1 + langDepth x + langDepth y benchName :: Language (Set Text) -> String benchName lang = let ws = allWords lang in "language of depth " <> show (langDepth lang) <> " and size " <> show (Set.size ws) <> " e.g. " <> T.unpack (Set.findMin ws) -- | Generate as many 'Language's as integers given, -- where the integer is the size parameter passed to QuickCheck's -- 'Q.Gen'erator. randomLanguages :: [Int] -> IO [Language (Set Text)] randomLanguages sizes = Q.generate (traverse (flip Q.resize Q.arbitrary) sizes) -- | 'writeAllWords' and return two actions that parse the same test file, -- to be compared in the benchmark. genBenchmarkable :: Language (Set Text) -> IO (ParserBenchmark Text) genBenchmarkable lang = do testFile <- emptySystemTempFile "parserBenchmark.txt" writeAllWords testFile lang return $ ParserBenchmark testFile (fmap ignoreParsecError (parseAllWordsWith (runParser @Void) lang testFile)) (fmap ignoreNothing (parseAllWordsWith simpleParse lang testFile)) (fmap (const []) (return ())) genBenchmarks :: IO Benchmark genBenchmarks = do langs <- randomLanguages useLangSizes return (bgroup "language parser comparison (cassava not applicable here)" (fmap (genBenchmark benchName genBenchmarkable) langs)) -- * benchmark CSV parsers data Comma = LeadingComma | TrailingComma -- | skip over uninteresting fields skipFields :: MonadParsec Void Text p => Int -> Comma -> p () skipFields n comma = if n <= 0 then return () else () <$ (count n skipField) where co = case comma of TrailingComma -> (\p -> p <* single ',') LeadingComma -> (\p -> single ',' *> p) skipField = co (takeWhileP (Just "some csv field") (\c -> c /= ',' && c /= '\n')) parsecBool :: MonadParsec Void Text p => p Bool parsecBool = (True <$ chunk "True") <|> (False <$ chunk "False") -- | The 'CSVStructure' dictates that in column 'fooCol' an integer -- and in column 'barCol' a boolean is to be found. parsecCSVrow :: MonadParsec Void Text p => CSVStructure -> p (Int,Bool) parsecCSVrow c = do _ <- skipFields (fooCol c) TrailingComma i <- signed (return ()) decimal <* single ',' _ <- skipFields (barCol c - fooCol c - 1) TrailingComma b <- parsecBool _ <- skipFields (tableWidth c - 1 - barCol c) LeadingComma return (i,b) -- | We extract and decode two columns from a CSV file of up two 50 columns parsecCSV :: MonadParsec Void Text p => CSVStructure -> p [(Int,Bool)] parsecCSV c = takeWhileP (Just "header line") ('\n' /=) *> single '\n' *> (parsecCSVrow c `sepEndBy` single '\n') <* eof -- analogous to 'parseAllWordsWith' parseCsvWith :: (MonadParsec Void Text p) => (forall a. p a -> String -> Text -> result a) -> CSVStructure -> FilePath -> IO (result [(Int,Bool)]) parseCsvWith runP struct testFile = fmap (runP (parsecCSV struct) testFile) (TIO.readFile testFile) csvBenchName :: CSVStructure -> String csvBenchName c = "CSV file with " <> show (tableWidth c) <> " columns " <> "and Int in column " <> show (fooCol c) <> " and Bool in column " <> show (barCol c) newtype LitBool = LitBool Bool instance NFData LitBool where rnf (LitBool b) = rnf b instance Cassava.FromField LitBool where parseField s | s == "True" = pure (LitBool True) | s == "False" = pure (LitBool False) | otherwise = mzero data IntBool = IntBool !Int !LitBool instance NFData IntBool where rnf (IntBool i b) = rnf (i,b) instance Cassava.FromNamedRecord IntBool where parseNamedRecord r = IntBool <$> r Cassava..: "foo" <*> r Cassava..: "bar" fromIntBool :: IntBool -> (Int,Bool) fromIntBool (IntBool i (LitBool b)) = (i,b) -- decode using cassava. cassavaIntBool :: FilePath -> IO [(Int,Bool)] cassavaIntBool testFile = fmap dec (B.readFile testFile) where dec = either (const []) (Fold.toList . fmap fromIntBool .snd) . Cassava.decodeByName -- analogous to 'genBenchmarkable' csvBenchmarkable :: CSVStructure -> IO (ParserBenchmark (Int,Bool)) csvBenchmarkable c = do content <- Q.generate (genCSVFile c csvNumRows) testFile <- emptySystemTempFile "parserBenchmark.csv" TIO.writeFile testFile content return $ ParserBenchmark testFile (fmap ignoreParsecError (parseCsvWith (runParser @Void) c testFile)) (fmap ignoreNothing (parseCsvWith simpleParse c testFile)) ( cassavaIntBool testFile) genCsvBenchmarks :: IO Benchmark genCsvBenchmarks = do structures <- Q.generate (sequence (replicate csvNumStructures genCSVStructure)) return (bgroup "csv parser comparison" (fmap (genBenchmark csvBenchName csvBenchmarkable) structures)) -- * main main :: IO () main = do bench1 <- genBenchmarks bench2 <- genCsvBenchmarks defaultMain [bench1,bench2]