{-# LANGUAGE CPP #-} {-# LANGUAGE BangPatterns #-} #if !defined(TESTING) && defined(__GLASGOW_HASKELL__) {-# LANGUAGE Trustworthy #-} #endif {-# OPTIONS_HADDOCK not-home #-} ----------------------------------------------------------------------------- -- | -- Copyright : (c) Daan Leijen 2002 -- License : BSD-style -- -- = WARNING -- -- This module is considered __internal__. -- -- The Package Versioning Policy __does not apply__. -- -- This contents of this module may change __in any way whatsoever__ -- and __without any warning__ between minor versions of this package. -- -- Authors importing this module are expected to track development -- closely. -- -- = Description -- -- An efficient implementation of sets. -- -- These modules are intended to be imported qualified, to avoid name -- clashes with Prelude functions, e.g. -- -- > import Data.Set (Set) -- > import qualified Data.Set as Set -- -- The implementation of 'Set' is based on /size balanced/ binary trees (or -- trees of /bounded balance/) as described by: -- -- * Stephen Adams, \"/Efficient sets: a balancing act/\", -- Journal of Functional Programming 3(4):553-562, October 1993, -- . -- * J. Nievergelt and E.M. Reingold, -- \"/Binary search trees of bounded balance/\", -- SIAM journal of computing 2(1), March 1973. -- -- Bounds for 'union', 'intersection', and 'difference' are as given -- by -- -- * Guy Blelloch, Daniel Ferizovic, and Yihan Sun, -- \"/Just Join for Parallel Ordered Sets/\", -- . -- -- Note that the implementation is /left-biased/ -- the elements of a -- first argument are always preferred to the second, for example in -- 'union' or 'insert'. Of course, left-biasing can only be observed -- when equality is an equivalence relation instead of structural -- equality. -- -- /Warning/: The size of the set must not exceed @maxBound::Int@. Violation of -- this condition is not detected and if the size limit is exceeded, the -- behavior of the set is completely undefined. -- -- @since 0.5.9 ----------------------------------------------------------------------------- -- [Note: Using INLINABLE] -- ~~~~~~~~~~~~~~~~~~~~~~~ -- It is crucial to the performance that the functions specialize on the Ord -- type when possible. GHC 7.0 and higher does this by itself when it sees th -- unfolding of a function -- that is why all public functions are marked -- INLINABLE (that exposes the unfolding). -- [Note: Using INLINE] -- ~~~~~~~~~~~~~~~~~~~~ -- For other compilers and GHC pre 7.0, we mark some of the functions INLINE. -- We mark the functions that just navigate down the tree (lookup, insert, -- delete and similar). That navigation code gets inlined and thus specialized -- when possible. There is a price to pay -- code growth. The code INLINED is -- therefore only the tree navigation, all the real work (rebalancing) is not -- INLINED by using a NOINLINE. -- -- All methods marked INLINE have to be nonrecursive -- a 'go' function doing -- the real work is provided. -- [Note: Type of local 'go' function] -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- If the local 'go' function uses an Ord class, it sometimes heap-allocates -- the Ord dictionary when the 'go' function does not have explicit type. -- In that case we give 'go' explicit type. But this slightly decrease -- performance, as the resulting 'go' function can float out to top level. -- [Note: Local 'go' functions and capturing] -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- As opposed to IntSet, when 'go' function captures an argument, increased -- heap-allocation can occur: sometimes in a polymorphic function, the 'go' -- floats out of its enclosing function and then it heap-allocates the -- dictionary and the argument. Maybe it floats out too late and strictness -- analyzer cannot see that these could be passed on stack. -- [Note: Order of constructors] -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- The order of constructors of Set matters when considering performance. -- Currently in GHC 7.0, when type has 2 constructors, a forward conditional -- jump is made when successfully matching second constructor. Successful match -- of first constructor results in the forward jump not taken. -- On GHC 7.0, reordering constructors from Tip | Bin to Bin | Tip -- improves the benchmark by up to 10% on x86. module Data.Set.Private ( -- * Set type Set(..) -- instance Eq,Ord,Show,Read,Data,Typeable , Size , insertBy' , empty ) where import Prelude hiding (filter,foldl,foldr,null,map,take,drop,splitAt) import Control.Monad (join) #if __GLASGOW_HASKELL__ import GHC.Exts ( lazy ) #endif {-------------------------------------------------------------------- Sets are size balanced trees --------------------------------------------------------------------} -- | A set of values @a@. -- See Note: Order of constructors data Set a = Bin {-# UNPACK #-} !Size !a !(Set a) !(Set a) | Tip type Size = Int {-------------------------------------------------------------------- Query --------------------------------------------------------------------} -- | /O(1)/. The number of elements in the set. size :: Set a -> Int size Tip = 0 size (Bin sz _ _ _) = sz {-------------------------------------------------------------------- Construction --------------------------------------------------------------------} -- | /O(1)/. The empty set. empty :: Set a empty = Tip -- | /O(1)/. Create a singleton set. singleton :: a -> Set a singleton x = Bin 1 x Tip Tip {-------------------------------------------------------------------- Insertion, Deletion --------------------------------------------------------------------} -- | /O(log n)/. Insert an element in a set. -- If the set already contains an element equal to the given value, -- it is replaced with the new value. -- See Note: Type of local 'go' function -- See Note: Avoiding worker/wrapper (in Data.Map.Internal) insertBy' :: (a -> a -> Ordering) -> a -> Set a -> Maybe (Set a) insertBy' compare = join go where go orig !_ Tip = Just $! singleton (lazy orig) go orig !x (Bin _ y l r) = case compare x y of LT -> (\ !l' -> balanceL y l' r) <$!> go orig x l GT -> (\ !r' -> balanceR y l r') <$!> go orig x r EQ -> Nothing #if __GLASGOW_HASKELL__ {-# INLINABLE insertBy' #-} #else {-# INLINE insertBy' #-} #endif infixl 4 <$!> (<$!>) :: (a -> b) -> Maybe a -> Maybe b (<$!>) f = \ case Nothing -> Nothing Just a -> Just $! f a #ifndef __GLASGOW_HASKELL__ lazy :: a -> a lazy a = a #endif {-------------------------------------------------------------------- [balance x l r] balances two trees with value x. The sizes of the trees should balance after decreasing the size of one of them. (a rotation). [delta] is the maximal relative difference between the sizes of two trees, it corresponds with the [w] in Adams' paper. [ratio] is the ratio between an outer and inner sibling of the heavier subtree in an unbalanced setting. It determines whether a double or single rotation should be performed to restore balance. It is correspondes with the inverse of $\alpha$ in Adam's article. Note that according to the Adam's paper: - [delta] should be larger than 4.646 with a [ratio] of 2. - [delta] should be larger than 3.745 with a [ratio] of 1.534. But the Adam's paper is errorneous: - it can be proved that for delta=2 and delta>=5 there does not exist any ratio that would work - delta=4.5 and ratio=2 does not work That leaves two reasonable variants, delta=3 and delta=4, both with ratio=2. - A lower [delta] leads to a more 'perfectly' balanced tree. - A higher [delta] performs less rebalancing. In the benchmarks, delta=3 is faster on insert operations, and delta=4 has slightly better deletes. As the insert speedup is larger, we currently use delta=3. --------------------------------------------------------------------} delta,ratio :: Int delta = 3 ratio = 2 -- The balance function is equivalent to the following: -- -- balance :: a -> Set a -> Set a -> Set a -- balance x l r -- | sizeL + sizeR <= 1 = Bin sizeX x l r -- | sizeR > delta*sizeL = rotateL x l r -- | sizeL > delta*sizeR = rotateR x l r -- | otherwise = Bin sizeX x l r -- where -- sizeL = size l -- sizeR = size r -- sizeX = sizeL + sizeR + 1 -- -- rotateL :: a -> Set a -> Set a -> Set a -- rotateL x l r@(Bin _ _ ly ry) | size ly < ratio*size ry = singleL x l r -- | otherwise = doubleL x l r -- rotateR :: a -> Set a -> Set a -> Set a -- rotateR x l@(Bin _ _ ly ry) r | size ry < ratio*size ly = singleR x l r -- | otherwise = doubleR x l r -- -- singleL, singleR :: a -> Set a -> Set a -> Set a -- singleL x1 t1 (Bin _ x2 t2 t3) = bin x2 (bin x1 t1 t2) t3 -- singleR x1 (Bin _ x2 t1 t2) t3 = bin x2 t1 (bin x1 t2 t3) -- -- doubleL, doubleR :: a -> Set a -> Set a -> Set a -- doubleL x1 t1 (Bin _ x2 (Bin _ x3 t2 t3) t4) = bin x3 (bin x1 t1 t2) (bin x2 t3 t4) -- doubleR x1 (Bin _ x2 t1 (Bin _ x3 t2 t3)) t4 = bin x3 (bin x2 t1 t2) (bin x1 t3 t4) -- -- It is only written in such a way that every node is pattern-matched only once. -- -- Only balanceL and balanceR are needed at the moment, so balance is not here anymore. -- In case it is needed, it can be found in Data.Map. -- Functions balanceL and balanceR are specialised versions of balance. -- balanceL only checks whether the left subtree is too big, -- balanceR only checks whether the right subtree is too big. -- balanceL is called when left subtree might have been inserted to or when -- right subtree might have been deleted from. balanceL :: a -> Set a -> Set a -> Set a balanceL x l r = case r of Tip -> case l of Tip -> Bin 1 x Tip Tip (Bin _ _ Tip Tip) -> Bin 2 x l Tip (Bin _ lx Tip (Bin _ lrx _ _)) -> Bin 3 lrx (Bin 1 lx Tip Tip) (Bin 1 x Tip Tip) (Bin _ lx ll@(Bin _ _ _ _) Tip) -> Bin 3 lx ll (Bin 1 x Tip Tip) (Bin ls lx ll@(Bin lls _ _ _) lr@(Bin lrs lrx lrl lrr)) | lrs < ratio*lls -> Bin (1+ls) lx ll (Bin (1+lrs) x lr Tip) | otherwise -> Bin (1+ls) lrx (Bin (1+lls+size lrl) lx ll lrl) (Bin (1+size lrr) x lrr Tip) (Bin rs _ _ _) -> case l of Tip -> Bin (1+rs) x Tip r (Bin ls lx ll lr) | ls > delta*rs -> case (ll, lr) of (Bin lls _ _ _, Bin lrs lrx lrl lrr) | lrs < ratio*lls -> Bin (1+ls+rs) lx ll (Bin (1+rs+lrs) x lr r) | otherwise -> Bin (1+ls+rs) lrx (Bin (1+lls+size lrl) lx ll lrl) (Bin (1+rs+size lrr) x lrr r) (_, _) -> error "Failure in Data.Map.balanceL" | otherwise -> Bin (1+ls+rs) x l r {-# NOINLINE balanceL #-} -- balanceR is called when right subtree might have been inserted to or when -- left subtree might have been deleted from. balanceR :: a -> Set a -> Set a -> Set a balanceR x l r = case l of Tip -> case r of Tip -> Bin 1 x Tip Tip (Bin _ _ Tip Tip) -> Bin 2 x Tip r (Bin _ rx Tip rr@(Bin _ _ _ _)) -> Bin 3 rx (Bin 1 x Tip Tip) rr (Bin _ rx (Bin _ rlx _ _) Tip) -> Bin 3 rlx (Bin 1 x Tip Tip) (Bin 1 rx Tip Tip) (Bin rs rx rl@(Bin rls rlx rll rlr) rr@(Bin rrs _ _ _)) | rls < ratio*rrs -> Bin (1+rs) rx (Bin (1+rls) x Tip rl) rr | otherwise -> Bin (1+rs) rlx (Bin (1+size rll) x Tip rll) (Bin (1+rrs+size rlr) rx rlr rr) (Bin ls _ _ _) -> case r of Tip -> Bin (1+ls) x l Tip (Bin rs rx rl rr) | rs > delta*ls -> case (rl, rr) of (Bin rls rlx rll rlr, Bin rrs _ _ _) | rls < ratio*rrs -> Bin (1+ls+rs) rx (Bin (1+ls+rls) x l rl) rr | otherwise -> Bin (1+ls+rs) rlx (Bin (1+ls+size rll) x l rll) (Bin (1+rrs+size rlr) rx rlr rr) (_, _) -> error "Failure in Data.Map.balanceR" | otherwise -> Bin (1+ls+rs) x l r {-# NOINLINE balanceR #-}