module Main where import Control.Monad (when) import Criterion.Config import Criterion.Main import Data.Monoid (Last(..)) import Data.Map (Map) import qualified Data.Map as Map import System (getArgs, getProgName) import System.FilePath (()) import System.IO (stderr, hPutStrLn) import Toktok (Lexer, mkLexer, mkLexerWithSandhis) myConfig = defaultConfig { cfgSummaryFile = Last $ Just "benchmark.csv"} main = defaultMainWith myConfig (return ()) $ map createBGroup [ ("Transducer", mkLexerWithSandhis []) , ("Tries", mkLexer) , ("Baseline", baseline) , ("Dummy", dummy) ] createBGroup :: (String, [String] -> Lexer) -> Benchmark createBGroup (name, mkLexer) = bgroup name [ bench "English" $ nfIO $ mkBenchmark "english" mkLexer , bench "French" $ nfIO $ mkBenchmark "french" mkLexer ] mkBenchmark :: String -> ([String] -> Lexer) -> IO () mkBenchmark dir mklexerf = do lexicon <- readLexicon dir let lexer = mkLexer (" ":filter (not . null) lexicon) sts <- readSentences dir let results = map (not . null . lexer) sts --putStrLn $ unlines $ map show results when (not $ and results) $ error "Problem..." return () readLexicon :: String -> IO [String] readLexicon dir = readLineFiles $ "data" dir "lexicon.txt" readSentences :: String -> IO [String] readSentences dir = readLineFiles $ "data" dir "sentences.txt" readLineFiles :: FilePath -> IO [String] readLineFiles f = do t <- readFile f return $ lines t -- Lexers for comparaison -- | This is the standard haskell lexer, 'words'. -- It just split the string where there is white-spaces. dummy :: [String] -> Lexer dummy _ = return . words -- | this is a lexer based on haskell maps baseline :: [String] -> Lexer baseline ss = useMapLexer mapLexer where mapLexer = Map.fromList $ map (\x -> (x,True)) ss useMapLexer :: Map String Bool -> Lexer useMapLexer m = uml 1 where uml i s | i >= length s = [] uml i s = case Map.lookup (take i s) m of Just True -> [take i s:l | l <- uml 1 (drop i s)] ++ uml (i + 1) s _ -> uml (i + 1) s