{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP          #-}
#if __GLASGOW_HASKELL__ >=704
{-# LANGUAGE Safe         #-}
#elif __GLASGOW_HASKELL__ >=702
{-# LANGUAGE Trustworthy  #-}
#endif
-- | Sets of characters.
--
-- Using this is more efficint than 'RE.Type.Alt':ng individual characters.
module RERE.CharSet (
    -- * Set of characters
    CharSet,
    -- * Construction
    empty,
    universe,
    singleton,
    insert,
    union,
    intersection,
    complement,
    difference,
    -- * Query
    size,
    null,
    member,
    -- * Conversions
    fromList,
    toList,
    fromIntervalList,
    toIntervalList,
    ) where

import Prelude hiding (null)

import Data.Char   (chr, ord)
import Data.List   (foldl', sortBy)
import Data.String (IsString (..))

#if MIN_VERSION_containers(0,5,0)
import qualified Data.IntMap.Strict as IM
#else
import qualified Data.IntMap as IM
#endif

-- | A set of 'Char's.
--
-- We use range set, which works great with 'Char'.
newtype CharSet = CS { unCS :: IM.IntMap Int }
  deriving (Eq, Ord)

-- | 
--
-- >>> "foobar" :: CharSet
-- "abfor"
--
instance IsString CharSet where
    fromString = fromList

instance Show CharSet where
    showsPrec d cs
        | size cs < 20
        = showsPrec d (toList cs)
        | otherwise
        = showParen (d > 10)
        $ showString "CS.fromIntervalList "
        . showsPrec 11 (toIntervalList cs)

-- | Empty character set.
empty :: CharSet
empty = CS IM.empty

-- | universe
--
-- >>> size universe
-- 1114112
--
-- >>> universe
-- CS.fromIntervalList [('\NUL','\1114111')]
--
universe :: CharSet
universe = CS $ IM.singleton 0 0x10ffff

-- | Check whether 'CharSet' is 'empty'.
null :: CharSet -> Bool
null (CS cs) = IM.null cs

-- | Size of 'CharSet'
--
-- >>> size $ fromIntervalList [('a','f'), ('0','9')]
-- 16
--
-- >>> length $ toList $ fromIntervalList [('a','f'), ('0','9')]
-- 16
--
size :: CharSet -> Int
size (CS m) = foldl' (\ !acc (lo, hi) -> acc + (hi - lo) + 1) 0 (IM.toList m)

-- | Singleton character set.
singleton :: Char -> CharSet
singleton c = CS (IM.singleton (ord c) (ord c))

-- | Test whether character is in the set.
member :: Char -> CharSet -> Bool
#if MIN_VERSION_containers(0,5,0)
member c (CS m) = case IM.lookupLE i m of
    Nothing      -> False
    Just (_, hi) -> i <= hi
  where
#else
member c (CS m) = go (IM.toList m)
  where
    go [] = False
    go ((x,y):zs) = (x <= i && i <= y) || go zs
#endif
    i = ord c

-- | Insert 'Char' into 'CharSet'.
insert :: Char -> CharSet -> CharSet
insert c (CS m) = normalise (IM.insert (ord c) (ord c) m)

-- | Union of two 'CharSet's.
union :: CharSet -> CharSet -> CharSet
union (CS xs) (CS ys) = normalise (IM.unionWith max xs ys)

-- | Intersection of two 'CharSet's
intersection :: CharSet -> CharSet -> CharSet
intersection (CS xs) (CS ys) = CS $
    IM.fromList (intersectRangeList (IM.toList xs) (IM.toList ys))

-- | Compute the intersection.
intersectRangeList :: Ord a => [(a, a)] -> [(a, a)] -> [(a, a)]
intersectRangeList aset@((x,y):as) bset@((u,v):bs)
   | y < u     = intersectRangeList as bset
   | v < x     = intersectRangeList aset bs
   | y < v     = (max x u, y) : intersectRangeList as bset
   | otherwise = (max x u, v) : intersectRangeList aset bs
intersectRangeList _ [] = []
intersectRangeList [] _ = []

-- | Complement of a CharSet
complement :: CharSet -> CharSet
complement (CS xs) = CS $ IM.fromList $ complementRangeList (IM.toList xs)

-- | Compute the complement intersected with @[x,)@ assuming @x<u@.
complementRangeList' :: Int -> [(Int, Int)] -> [(Int, Int)]
complementRangeList' x ((u,v):s) = (x,pred u) : complementRangeList'' v s
complementRangeList' x []        = [(x,0x10ffff)]

-- | Compute the complement intersected with @(x,)@.
complementRangeList'' :: Int -> [(Int, Int)] -> [(Int, Int)]
complementRangeList'' x s
    | x == 0x10ffff = []
    | otherwise     = complementRangeList' (succ x) s

-- | Compute the complement.
--
-- Note: we treat Ints as codepoints, i.e minBound is 0, and maxBound is 0x10ffff
complementRangeList :: [(Int, Int)] -> [(Int, Int)]
complementRangeList s@((x,y):s')
    | x == 0    = complementRangeList'' y s'
    | otherwise = complementRangeList' 0 s
complementRangeList [] = [(0, 0x10ffff)]

-- | Difference of two 'CharSet's.
difference :: CharSet -> CharSet -> CharSet
difference xs ys = intersection xs (complement ys)

-- | Make 'CharSet' from a list of characters, i.e. 'String'.
fromList :: String -> CharSet
fromList = normalise . foldl' (\ acc c -> IM.insert (ord c) (ord c) acc) IM.empty

-- | Convert 'CharSet' to a list of characters i.e. 'String'.
toList :: CharSet -> String
toList = concatMap (uncurry enumFromTo) . toIntervalList

-- | Convert to interval list
--
-- >>> toIntervalList $ union "01234" "56789"
-- [('0','9')]
--
toIntervalList :: CharSet -> [(Char, Char)]
toIntervalList (CS m) = [ (chr lo, chr hi) | (lo, hi) <- IM.toList m ]

-- | Convert from interval pairs.
--
-- >>> fromIntervalList []
-- ""
--
-- >>> fromIntervalList [('a','f'), ('0','9')]
-- "0123456789abcdef"
--
-- >>> fromIntervalList [('Z','A')]
-- ""
--
fromIntervalList :: [(Char,Char)] -> CharSet
fromIntervalList xs = normalise' $ sortBy (\a b -> compare (fst a) (fst b))
    [ (ord lo, ord hi)
    | (lo, hi) <- xs
    , lo <= hi
    ]

-------------------------------------------------------------------------------
-- Normalisation
-------------------------------------------------------------------------------

normalise :: IM.IntMap Int -> CharSet
normalise = normalise'. IM.toList

normalise' :: [(Int,Int)] -> CharSet
normalise' = CS . IM.fromList . go where
    go :: [(Int,Int)] -> [(Int,Int)]
    go []         = []
    go ((x,y):zs) = go' x y zs

    go' :: Int -> Int -> [(Int, Int)] -> [(Int, Int)]
    go' lo hi [] = [(lo, hi)]
    go' lo hi ws0@((u,v):ws)
        | u <= succ hi = go' lo (max v hi) ws
        | otherwise    = (lo,hi) : go ws0