{-# LANGUAGE PackageImports , TupleSections , MultiParamTypeClasses , FlexibleInstances #-}

{- |

Run comparative benchmarks against other Haskell implementations.

The only other common implementation in Hackage is @case-insensitive@. That
algorithm and a simple one using @map toLower@ (and similar implementations)
are included in these benchmarks.

The random strings generated are limited to the printable ASCII characters.
The ratios of alphabetic and upper-case characters in the strings are part of
the randomization of the strings. The ratios chosen are shown below. The
character makeup of the test strings appears to have only slight effect on any
performance differences between the algorithms tested.

The lengths of the test strings are a little more impactful on the benches, so
two options are explored: short (random length) and long. The short strings are
likely what will be matched in the real world. The case folding routines
probably have similar performance, so the differences in string handling in
general and in allocation in particular exaggerate the difference in benchmark
performance. Even the lazily evaluated string in the @case-insensitive@ library
is noticeably slower because a second string is allocated.

The biggest difference in speed is for string comparisons that fail. When
matching short strings in a larger stream, the non-match is probably the
dominant case. The 'CaseInsensitiveEq' algorithm is by far the fastest.

Benchmarks available form a four-dimensional matrix accessible in the
tree-like, path-based style of @Criterion@'s defaultMain command line
handling.

> bench-others -m glob ByteString.Lazy/short/*/*

The axes and values are:

 - string type: data type of strings to compare

    -- @String@
    -- @ByteString@
    -- @ByteString.Lazy@
    -- @Text@
    -- @Text.Lazy@

 - string length:
 
    -- @short@    - 1000 different random-length (2-12 chars) strings
    -- @long@     - 20 different 500-char strings

 - string case: comparing the original with ...

    -- @id@        - identical strings
    -- @toLower@   - a lower-case version of the original
    -- @toUpper@   - an upper-case version of the original
    -- @flipCase@  - all alphabetic characters in the opposite case
    -- @similar@   - identical at the beginning, then different
    -- @different@ - different string from the first character

 - match algorithm:

    -- @FoldCase@           - compare two case-folded strings using @base@ library
    -- @CaseInsensitiveEq@  - compare using this library
    -- @CIeq@               - compare using the @case-insensitive@ package
    -- @CIeq'@              - a smarter use of @case-insensitive@, assuming one string is a literal in client code
    -- @FoldCaseOrd@        - ordering or two case-folded strings using the @base@ library
    -- @CaseInsensitiveOrd@ - ordering using this library

Note on @CIeq'@: There is some overhead to using 'CI.mk". In most cases, one of
the two strings being compared will be a string literal in the client source
code. We can assume that 'CI.mk' will be called for the literal at most once,
if not never, during execution. The @CIeq'@ comparison algorithm (sometimes the
fastest) models this case by evaluating one half of the comparison before the
benchmark begins. This might also be informative for the @FoldCase@ algorithm,
but it's not explored here.

-}

module Main ( main ) where

import           Test.RandomStrings
import           Test.RandomStrings.FlipCase
import           Criterion.Main

import qualified Data.ByteString as BS
import qualified Data.ByteString.Char8 as BS8
import qualified Data.ByteString.Lazy as BSL
import qualified Data.ByteString.Lazy.Char8 as BSL8
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL

import           Data.Ratio ( (%) )
import           Data.Word
import           Data.Char
import           Data.String ( IsString(..) )

import qualified "case-insensitive" Data.CaseInsensitive as CI
import           Data.CaseInsensitive.Eq
import           Data.CaseInsensitive.Ord


------------------------------------------------------------------------------
-- 
-- | 'main' first generates some random test strings and then organizes bench
--   groups in a tree organized by
--
--   @/ types / length / case / method@
--
main = do
    strings <- mapM mk_strings
        [ StringLengthShort
        , StringLengthLong
        ]

    defaultMain 
        [ bgroup "String"           $ map (mk_length ""       ) strings
        , bgroup "ByteString"       $ map (mk_length BS.empty ) strings
        , bgroup "ByteString.Lazy"  $ map (mk_length BSL.empty) strings
        , bgroup "Text"             $ map (mk_length T.empty  ) strings
        , bgroup "Text.Lazy"        $ map (mk_length TL.empty ) strings
        ]


------------------------------------------------------------------------------
-- 
-- generating random test strings
--

-- | Test strings are all printable characters in the 7-bit ASCII range. The
--   mix is 9/10 alphabetic and the alphabetic characters are 1/6 upper case.
--   If the mix of character classes made much difference in benchmark
--   results, I'd add these as command line parameters. To try a different
--   mix, modify it here.
string_generator = randomString' randomASCII (9%10) (1%6)

data StringLengthOption = StringLengthLong | StringLengthShort deriving Eq

mk_strings :: StringLengthOption -> IO (StringLengthOption,[String])
mk_strings StringLengthShort = randomStringsLen string_generator (2,12) 1000 >>= \s -> return (StringLengthShort,s)
mk_strings StringLengthLong  = randomStrings (string_generator 500) 20 >>= \s -> return (StringLengthLong,s)


------------------------------------------------------------------------------
--
-- | CaseOption describes the change made to the original string to make a
--   testable pair of strings. Most create a pair that should be equivalent
--   in case-insensitive comparisons.
data CaseOption = CaseId | CaseToLower | CaseToUpper | CaseFlipCase | CaseSimilar | CaseDifferent

-- | Creating the pairs used by the comparison algorithms. The extra 'Bool' is
--   what the result of a case-insensitive comparison should be.
mk_pair :: CaseOption -> String -> ((String,String),Bool)
mk_pair CaseId a = ((a,a),True)
mk_pair CaseToLower a = ((a,map toLower a),True)
mk_pair CaseToUpper a = ((a,map toUpper a),True)
mk_pair CaseFlipCase a = ((a,map flipCase a),True)
mk_pair CaseSimilar a = ((a, mk_similar a), False)
mk_pair CaseDifferent a = ((a,mk_different a),False)


-- | Make a different String after a 'random' number of chars.
--   Guaranteed to be different from the original.
mk_similar [] = []
mk_similar as@(a:_) = (take n as') ++ (map succ as'')
    where
        n = (fromEnum a) `mod` (length as)
        (as',as'') = splitAt n as

-- | Make a completely different string than the original.
mk_different = map succ


pack_pair :: (IsString a) => a -> ((String,String),b) -> ((a,a),b)
pack_pair _ ((a,b),c) = ((fromString a,fromString b),c)

ci_mk_pair :: (CI.FoldCase a) => ((a,a),b) -> ((CI.CI a,a),b)
ci_mk_pair ((a,b),c) = ((CI.mk a,b),c)

-- | make a bench group for a StringLengthOption
mk_length :: (BenchType a) => a -> (StringLengthOption,[String]) -> Benchmark
mk_length s (len_opt,strings) = bgroup (bench_name len_opt) $
    map (mk_case s strings)
        [ CaseId        -- the original string
        , CaseToLower   -- a lower-case version of the original
        , CaseToUpper   -- an upper-case version of the original
        , CaseFlipCase  -- a version with inverted case for alphabetic chars
        , CaseSimilar   -- a version that is initially equivalent, but not fully so
        , CaseDifferent -- a completely different string from the first character
        ]

------------------------------------------------------------------------------
--
-- The leaf nodes evaluate different comparison algorithms on the pairs of
-- randomly generated string. These are the actual benchmark tests, and the
-- pairs of strings being generated are fully constructed before the tests
-- because they are the final parameter of 'nf'.


-- | make a bench group for a CaseOption
mk_case :: (BenchType a) => a -> [String] -> CaseOption -> Benchmark
mk_case s strings case_opt = bgroup (bench_name case_opt)
    [ bench "FoldCase"           $ nf (map $ \((a,b),tf) -> (((fold_case_bench a) == (fold_case_bench b)) == tf || assert_fail)) $ (map (pack_pair s) pairs)
    , bench "CaseInsensitiveEq"  $ nf (map $ \((a,b),tf) -> ((caseInsensitiveEq a b                     ) == tf || assert_fail)) $ (map (pack_pair s) pairs)
    , bench "CIeq"               $ nf (map $ \((a,b),tf) -> (((CI.mk a) == (CI.mk b)                    ) == tf || assert_fail)) $ (map (pack_pair s) pairs)
    , bench "CIeq'"              $ nf (map $ \((a,b),tf) -> ((a == (CI.mk b)                            ) == tf || assert_fail)) $ (map ci_mk_pair $ map (pack_pair s) pairs)
    , bench "FoldCaseOrd"        $ nf (map $ \((a,b),tf) -> ((compare (fold_case_bench a) (fold_case_bench b) == EQ) == tf || assert_fail)) $ (map (pack_pair s) pairs)
    , bench "CaseInsensitiveOrd" $ nf (map $ \((a,b),tf) -> ((caseInsensitiveCompare a b                      == EQ) == tf || assert_fail)) $ (map (pack_pair s) pairs)
    ]
    where
        pairs = map (mk_pair case_opt) strings
        assert_fail = error "internal: assertion failed"


------------------------------------------------------------------------------
--
-- classes


-- | A class for string types to be benchmarked. Both 'CaseInsensitiveEq' and
--   'CI.FoldCase' are ready to be tested as-is, but the case-folding algorithm
--   that only relies on the @base@ package requires an implementation of
--   case-folding for each string type.
class (CI.FoldCase a, CaseInsensitiveEq a, Eq a, CaseInsensitiveOrd a, Ord a, IsString a) => BenchType a where
    fold_case_bench :: a -> a

instance BenchType String where
    fold_case_bench = map toLower

instance BenchType BS.ByteString where
    fold_case_bench = BS8.map toLower

instance BenchType BSL.ByteString where
    fold_case_bench = BSL8.map toLower

instance BenchType T.Text where
    fold_case_bench = T.map toLower

instance BenchType TL.Text where
    fold_case_bench = TL.map toLower


-- | A helper class for naming branches of the bench tree.
class NamedBench a where bench_name :: a -> String

instance NamedBench String where bench_name = const "String"
instance NamedBench BS.ByteString where bench_name = const "ByteString"
instance NamedBench BSL.ByteString where bench_name = const "ByteString.Lazy"
instance NamedBench T.Text where bench_name = const "Text"
instance NamedBench TL.Text where bench_name = const "Text.Lazy"

instance NamedBench StringLengthOption where
    bench_name StringLengthLong = "Long"
    bench_name StringLengthShort = "Short"

instance NamedBench CaseOption where
    bench_name CaseId = "id"
    bench_name CaseToLower = "toLower"
    bench_name CaseToUpper = "toUpper"
    bench_name CaseFlipCase = "flipCase"
    bench_name CaseSimilar = "similar"
    bench_name CaseDifferent = "different"