Data/CharSet.hs

{-# LANGUAGE BangPatterns, CPP #-}
-----------------------------------------------------------------------------
-- |
-- Module      :  Data.CharSet
-- Copyright   :  (c) Edward Kmett 2010
-- License     :  BSD3
-- Maintainer  :  ekmett@gmail.com
-- Stability   :  experimental
-- Portability :  portable
--
-- Encode unicode character sets as arbitrary precision floating point values
-- using the least character in the set as the exponent. Can efficiently represent
-- reasonably tightly grouped character sets, but may use up to 139KiB to represent
-- a particularly sparse set.
-- 
-- Designed to be imported qualified:
-- 
-- > import Data.CharSet (CharSet)
-- > import qualified Data.CharSet as CharSet
-------------------------------------------------------------------------------

module Data.CharSet
    ( 
    -- * CharSet
      CharSet
    , build
    -- * Manipulation
    , empty
    , singleton
    , full
    , union
    , intersection
    , complement
    , insert
    , delete
    , (\\)
    , fromList
    , fromDistinctAscList
    , toArray
    -- * Accessors
    , null
    , size
    , member
    , elem
    , notElem
    , isComplemented
    , toInteger
    -- * Builtins
    -- ** POSIX
    , posixAscii
    -- ** Unicode
    , UnicodeCategory(..)
    , unicodeCategories
    -- ** Data.Char classifiers
    , control, space, lower, upper, alpha, alphaNum
    , print, digit, octDigit, letter, mark, number
    , punctuation, symbol, separator, ascii, latin1, asciiUpper, asciiLower
    ) where

import Data.Array hiding (range)
import qualified Data.Bits as Bits
import Data.Bits hiding (complement)
import Data.Char
import Data.Data
import Data.Function (on)
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Monoid (Monoid(..))
import Prelude hiding (null, exponent, toInteger, elem, notElem, print, pi)
import Text.Read

data CharSet = CS
        { _countAtLeast  :: {-# UNPACK #-} !Int       -- ^ A conservative upper bound on the element count.
                                                      --   If negative, we are complemented with respect to the universe
        , _countAtMost   :: {-# UNPACK #-} !Int       -- ^ A conservative lower bound on the element count.
                                                      --   If negative, we are complemented with respect to the universe
        , _count         :: Int                       -- ^ Lazy element count used when the above two disagree. O(1) environment size
        , exponent       :: {-# UNPACK #-} !Int       -- ^ Low water mark. index of the least element potentially in the set.
        , _hwm           :: {-# UNPACK #-} !Int       -- ^ High water mark. index of the greatest element potentially in the set.
        , mantissa       :: {-# UNPACK #-} !Integer   -- ^ the set of bits starting from the exponent.
                                                      --   if negative, then we are complemented with respect to universe
        }


ul, uh :: Char
ul = minBound
uh = maxBound
{-# INLINE ul #-}
{-# INLINE uh #-}

ol, oh :: Int
ol = fromEnum ul
oh = fromEnum uh
{-# INLINE ol #-}
{-# INLINE oh #-}

-- | Internal smart constructor. Forces count whenever it is pigeonholed.
bs :: Int -> Int -> Int -> Int -> Int -> Integer -> CharSet
bs !a !b c !l !h !m | a == b = CS a a a l h m 
                    | otherwise = CS a b c l h m 
{-# INLINE bs #-}

-- | /O(d)/ where /d/ is absolute deviation in fromEnum over the set
toList :: CharSet -> String
toList (CS _ _ _ l h m) 
    | m < 0 = map toEnum [ol..max (pred l) ol] ++ toList' l (map toEnum [min (succ h) oh..oh])
    | otherwise = toList' 0 []
    where
        toList' :: Int -> String -> String
        toList' !n t | n > h = t
                     | testBit m (n - l) = toEnum n : toList' (n+1) t
                     | otherwise         = toList' (n+1) t
{-# INLINE toList #-}

-- | /O(1)/ The empty set. Permits /O(1)/ null and size.
empty :: CharSet
empty = CS 0 0 0 0 0 0 
{-# INLINE empty #-}

-- | /O(1)/ Construct a @CharSet@ with a single element. Permits /O(1)/ null and size
singleton :: Char -> CharSet 
singleton x = CS 1 1 1 e e 1 where e = fromEnum x
{-# INLINE singleton #-}

-- | /O(1|d)/ Is the 'CharSet' empty? May be faster than checking if @'size' == 0@ after union.
--   Operations that require a recount are noted.
null :: CharSet -> Bool
null (CS a b c _ _ _) 
    | a > 0 = False
    | b == 0 = True
    | otherwise = c == 0 
{-# INLINE null #-}

-- | /O(1|d)/ The number of elements in the bit set.
size :: CharSet -> Int
size (CS a b c _ _ m)
    | (a == b) && (m >= 0) = a
    | a == b = oh - ol - a 
    | m >= 0 = c
    | otherwise = oh - ol - c 
{-# INLINE size #-}

-- | /O(d)/ A 'CharSet' containing every member of the enumeration of @a@.
full :: CharSet
full = complement empty 
{-# INLINE full #-}

-- | /O(d)/ Complements a 'CharSet' with respect to the bounds of @a@. Preserves order of 'null' and 'size'
complement :: CharSet -> CharSet 
complement (CS a b c l h m) = CS (Bits.complement b) (Bits.complement a) (Bits.complement c) l h (Bits.complement m)
{-# INLINE complement #-}

-- | /O(d * n)/ Make a 'CharSet' from a list of items.
fromList :: String -> CharSet
fromList = foldr insert empty 
{-# INLINE fromList #-}

-- | /O(d * n)/ Make a 'CharSet' from a distinct ascending list of items
fromDistinctAscList :: String -> CharSet 
fromDistinctAscList [] = empty
fromDistinctAscList (c:cs) = fromDistinctAscList' cs 1 0 1 
    where
        l = fromEnum c
        fromDistinctAscList' :: String -> Int -> Int -> Integer -> CharSet
        fromDistinctAscList' [] !n !h !m  = CS n n n l h m 
        fromDistinctAscList' (c':cs') !n _ !m = fromDistinctAscList' cs' (n+1) h' (setBit m (h' - l))
            where
                h' = fromEnum c'
{-# INLINE fromDistinctAscList #-}

-- | /O(d)/ Insert a single element of type @a@ into the 'CharSet'. Preserves order of 'null' and 'size'
insert :: Char -> CharSet -> CharSet
insert x r@(CS a b c l h m) 
    | (m < 0) && (e < l) = r 
    | (m < 0) && (e > h) = r
    | e < l = bs (a+1) (b+1) (c+1) e h (shiftL m (l - e) .|. 1)
    | e > h = bs (a+1) (b+1) (c+1) l p (setBit m p)
    | testBit m p = r 
    | otherwise = bs (a+1) (b+1) (c+1) l h (setBit m p)
    where 
        e = fromEnum x
        p = e - l 
{-# INLINE insert #-}

-- | /O(d)/ Delete a single item from the 'CharSet'. Preserves order of 'null' and 'size'
delete :: Char -> CharSet -> CharSet
delete x r@(CS a b c l h m) 
    | (m < 0) && (e < l) = bs (a+1) (b+1) (c+1) e h (shiftL m (l - e) .&. Bits.complement 1)
    | (m < 0) && (e > h) = bs (a+1) (b+1) (c+1) l p (clearBit m p)
    | e < l       = r
    | e > h       = r
    | testBit m p = bs (a-1) (b-1) (c-1) l h (clearBit m p)
    | otherwise   = r
    where 
        e = fromEnum x
        p = e - l
{-# INLINE delete #-}

-- | /O(1)/ Test for membership in a 'CharSet'
member :: Char -> CharSet -> Bool
member x (CS _ _ _ l h m) 
    | e < l     = m < 0 
    | e > h     = m > 0
    | otherwise = testBit m (e - l)
    where 
        e = fromEnum x
{-# INLINE member #-}

{-
notMember :: Char -> CharSet -> Bool
notMember x - not . member x
{-# INLINE notMember #-}
-}

-- | /O(1)/ Alias for member
elem :: Char -> CharSet -> Bool
elem = member
{-# INLINE elem #-}

-- | /O(1)/ Alias for notMember
notElem :: Char -> CharSet -> Bool
notElem x = not . elem x
{-# INLINE notElem #-}

-- | /O(d)/ convert to an Integer representation. Discards negative elements
toInteger :: CharSet -> Integer
toInteger x = mantissa x `shift` exponent x
{-# INLINE toInteger #-}

-- | /O(d)/. May force 'size' to take /O(d)/ if ranges overlap, preserves order of 'null'
union :: CharSet -> CharSet -> CharSet 
union x@(CS _ _ _ l _ _) y@(CS _ _ _ l' _ _)
    | l' < l        = union' y x -- ensure left side has lower exponent
    | otherwise     = union' x y 
{-# INLINE union #-}

union' :: CharSet -> CharSet -> CharSet 
union' x@(CS a b c l h m) y@(CS a' b' c' l' h' m')
    | b == 0        = y                                                         -- fast empty union
    | b' == 0       = x                                                         -- fast empty union
    | a == -1       = full                                                      -- fast full union
    | a' == -1      = full                                                      -- fast full union
    | (m < 0) && (m' < 0) = complement (intersection' (complement x) (complement y))  -- appeal to intersection
    | m' < 0        = complement (diff (complement y) x)                        -- union with complement
    | m < 0         = complement (diff (complement x) y)                        -- union with complement
    | h < l'        = bs (a + a') (b + b') (c + c') l h' m''                    -- disjoint positive ranges
    | otherwise     = bs (a `max` a') (b + b') (recount m'') l (h `max` h') m'' -- overlapped positives
    where 
        m'' = m .|. shiftL m' (l' - l)

-- | /O(1)/ check to see if we are represented as a complemented 'CharSet'. 
isComplemented :: CharSet -> Bool
isComplemented = (<0) . mantissa 
{-# INLINE isComplemented #-}

-- | /O(d)/. May force 'size' and 'null' both to take /O(d)/.
intersection :: CharSet -> CharSet -> CharSet 
intersection x@(CS _ _ _ l _ _) y@(CS _ _ _ l' _ _)
    | l' < l = intersection' y x
    | otherwise = intersection' x y
{-# INLINE intersection #-}

-- | /O(d)/. May force 'size' and 'null' both to take /O(d)/.
intersection' :: CharSet -> CharSet -> CharSet 
intersection' x@(CS a b _ l h m) y@(CS a' b' _ l' h' m')
    | b == 0  = empty
    | b' == 0 = empty
    | a == -1 = y
    | a' == -1 = x
    | (m < 0) && (m' < 0) = complement (union' (complement x) (complement y))
    | m' < 0 = diff x (complement y) 
    | m < 0  = diff y (complement x) 
    | h < l' = empty 
    | otherwise = bs 0 (b `min` b') (recount m'') l'' (h `min` h') m''
    where
        l'' = max l l'
        m'' = shift m (l'' - l) .&. shift m' (l'' - l')

-- | Unsafe internal method for computing differences 
-- preconditions:
--  m >= 0, m' >= 0, a /= -1, a' /= -1, b /= 0, b' /= 0
diff :: CharSet -> CharSet -> CharSet 
diff x@(CS a _ _ l h m) (CS _ b' _ l' h' m') 
    | h < l' = x
    | h' < l = x
    | otherwise = bs (max (a - b') 0) a (recount m'') l h m''
    where 
        m'' = m .&. shift (Bits.complement m') (l' - l)

-- | /O(d)/. Preserves order of 'null'. May force /O(d)/ 'size'.
difference :: CharSet -> CharSet -> CharSet 
difference x@(CS a b _ _ _ m)  y@(CS a' b' _ _ _ m') 
   | a == -1       = complement y
   | a' == -1      = empty
   | b == 0        = empty
   | b' == 0       = x
   | (m < 0) && (m' < 0) = diff (complement y) (complement x)
   | m < 0         = complement (complement x `union` y)
   | m' < 0        = x `union` complement y 
   | otherwise     = diff x y
    
-- | /O(d)/. Preserves order of 'null'. May force /O(d)/ 'size'.
(\\) :: CharSet -> CharSet -> CharSet 
(\\) = difference

instance Eq CharSet where
    x@(CS _ _ _ l _ m) == y@(CS _ _ _ l' _ m')
        | signum m == signum m' = shift m (l - l'') == shift m' (l - l'') 
        | m' < 0 = y == x
        | otherwise = mask .&. shift m (l - ol) == shift m' (l - ol)
        where 
            l'' = min l l'
            mask = setBit 0 (oh - ol + 1) - 1

instance Ord CharSet where
    compare = compare `on` toInteger

instance Bounded CharSet where
    minBound = empty
    maxBound = CS n n n ol oh m
        where
            n = oh - ol + 1
            m = setBit 0 n - 1

-- | Return a charset based on a character range
range :: Char -> Char -> CharSet
range l h 
    | l <= h    = CS n n n l' h' m
    | otherwise = empty
    where 
        l' = fromEnum l
        h' = fromEnum h
        n = h' - l' + 1
        m = setBit 0 n - 1

-- | /O(d)/
recount :: Integer -> Int
recount !n 
    | n < 0     = Bits.complement (recount (Bits.complement n))
    | otherwise = recount' 0 0 
    where
        h = hwm n
        recount' !i !c
            | i > h = c
            | otherwise = recount' (i+1) (if testBit n i then c+1 else c)

-- | /O(d)/. Computes the equivalent of (truncate . logBase 2 . abs) extended with 0 at 0
-- This could be computed faster by directly appealing to GMP, but that is tricky in GHC.
hwm :: Integer -> Int
hwm !n 
    | n < 0 = hwm (-n)
    | n > 1 = scan p (2*p) 
    | otherwise = 0
    where
        p = probe 1
        -- incrementally compute 2^(2^(i+1)) until it exceeds n
        probe :: Int -> Int
        probe !i
            | bit (2*i) > n = i
            | otherwise     = probe (2*i)

        -- then binary search the powers for the highest set bit
        scan :: Int -> Int -> Int
        scan !l !h
            | l == h = l
            | bit (m+1) > n = scan l m
            | otherwise = scan (m+1) h
            where m = l + (h - l) `div` 2

toArray :: CharSet -> Array Char Bool
toArray set = array (minBound, maxBound) $ fmap (\x -> (x, x `elem` set)) [minBound .. maxBound]
 
instance Show CharSet where
   showsPrec d x@(CS _ _ _ _ _ m)
        | m < 0     = showParen (d > 10) $ showString "complement " . showsPrec 11 (complement x)
        | otherwise = showParen (d > 10) $ showString "fromDistinctAscList " . showsPrec 11 (toList x)


instance Read CharSet where
#ifdef __GLASGOW_HASKELL__ 
    readPrec = parens $ complemented +++ normal 
      where
        complemented = prec 10 $ do 
                Ident "complement" <- lexP
                complement `fmap` step readPrec
        normal = prec 10 $ do
                Ident "fromDistinctAscList" <- lexP
                fromDistinctAscList `fmap` step readPrec
#else
    readsPrec d r = 
        readParen (d > 10) (\r -> [ (complement m, t) 
                                  | ("complement", s) <- lex r
                                  , (m, t) <- readsPrec 11 s]) r
     ++ readParen (d > 10) (\r -> [ (fromDistinctAscList m, t) 
                                  | ("fromDistinctAscList", s) <- lex r
                                  , (m, t) <- readsPrec 11 s]) r
#endif

instance Monoid CharSet where
    mempty = empty
    mappend = union

build :: (Char -> Bool) -> CharSet
build p = fromDistinctAscList $ filter p [minBound .. maxBound]

-- :digit:, etc.
posixAscii :: Map String CharSet
posixAscii = Map.fromList
    [ ("alnum", alnum')
    , ("alpha", alpha')
    , ("blank", fromList " \t")
    , ("cntrl", insert '\x7f' $ range '\x00' '\x1f')
    , ("digit", digit')
    , ("graph", range '\x21' '\x7e')
    , ("print", range '\x20' '\x7e')
    , ("word",  insert '_' alnum')
    , ("punct", fromList "-!\"#$%&'()*+,./:;<=>?@[\\]^_`{|}~")
    , ("space", fromList " \t\r\n\v\f")
    , ("upper", upper')
    , ("lower", lower')
    , ("xdigit", digit `union` range 'a' 'f' `union` range 'A' 'F')
    ]
    where
        lower' = range 'a' 'z'
        upper' = range 'A' 'Z'
        alpha' = lower' `union` upper'
        digit' = range '0' '9'
        alnum' = alpha' `union` digit'

data UnicodeCategory = UnicodeCategory String String CharSet String

-- \p{Letter} or \p{Mc}
unicodeCategories :: [UnicodeCategory]
unicodeCategories =
    [ UnicodeCategory "Letter" "L" l "any kind of letter from any language."
    ,     UnicodeCategory "Lowercase_Letter" "Ll" ll "a lowercase letter that has an uppercase variant"
    ,     UnicodeCategory "Uppercase_Letter" "Lu" lu "an uppercase letter that has a lowercase variant"
    ,     UnicodeCategory "Titlecase_Letter" "Lt" lt "a letter that appears at the start of a word when only the first letter of the word is capitalized"
    ,     UnicodeCategory "Letter&" "L&" la "a letter that exists in lowercase and uppercase variants (combination of Ll, Lu and Lt)"
    ,     UnicodeCategory "Modifier_Letter" "Lm" lm "a special character that is used like a letter"
    ,     UnicodeCategory "Other_Letter" "Lo" lo "a letter or ideograph that does not have lowercase and uppercase variants"
    , UnicodeCategory "Mark" "M" m "a character intended to be combined with another character (e.g. accents, umlauts, enclosing boxes, etc.)"
    ,     UnicodeCategory "Non_Spacing_Mark" "Mn" mn "a character intended to be combined with another character without taking up extra space (e.g. accents, umlauts, etc.)"
    ,     UnicodeCategory "Spacing_Combining_Mark" "Mc" mc "a character intended to be combined with another character that takes up extra space (vowel signs in many Eastern languages)"
    ,     UnicodeCategory "Enclosing_Mark" "Me" me "a character that encloses the character is is combined with (circle, square, keycap, etc.)"
    , UnicodeCategory "Separator" "Z" z "any kind of whitespace or invisible separator"
    ,     UnicodeCategory "Space_Separator" "Zs" zs "a whitespace character that is invisible, but does take up space"
    ,     UnicodeCategory "Line_Separator" "Zl" zl "line separator character U+2028"
    ,     UnicodeCategory "Paragraph_Separator" "Zp" zp "paragraph separator character U+2029"
    , UnicodeCategory "Symbol" "S" s "math symbols, currency signs, dingbats, box-drawing characters, etc."
    ,     UnicodeCategory "Math_Symbol" "Sm" sm "any mathematical symbol"
    ,     UnicodeCategory "Currency_Symbol" "Sc" sc "any currency sign"
    ,     UnicodeCategory "Modifier_Symbol" "Sk" sk "a combining character (mark) as a full character on its own"
    ,     UnicodeCategory "Other_Symbol" "So" so "various symbols that are not math symbols, currency signs, or combining characters"
    , UnicodeCategory "Number" "N" n "any kind of numeric character in any script"
    ,     UnicodeCategory "Decimal_Digit_Number" "Nd" nd "a digit zero through nine in any script except ideographic scripts"
    ,     UnicodeCategory "Letter_Number" "Nl" nl "a number that looks like a letter, such as a Roman numeral"
    ,     UnicodeCategory "Other_Number" "No" no "a superscript or subscript digit, or a number that is not a digit 0..9 (excluding numbers from ideographic scripts)"
    , UnicodeCategory "Punctuation" "P" p "any kind of punctuation character"
    ,     UnicodeCategory "Dash_Punctuation" "Pd" pd "any kind of hyphen or dash"
    ,     UnicodeCategory "Open_Punctuation" "Ps" ps "any kind of opening bracket"
    ,     UnicodeCategory "Close_Punctuation" "Pe" pe "any kind of closing bracket"
    ,     UnicodeCategory "Initial_Punctuation" "Pi" pi "any kind of opening quote"
    ,     UnicodeCategory "Final_Punctuation" "Pf" pf "any kind of closing quote"
    ,     UnicodeCategory "Connector_Punctuation" "Pc" pc "a punctuation character such as an underscore that connects words"
    ,     UnicodeCategory "Other_Punctuation" "Po" po "any kind of punctuation character that is not a dash, bracket, quote or connector"
    , UnicodeCategory "Other" "C" c "invisible control characters and unused code points"
    ,     UnicodeCategory "Control" "Cc" cc "an ASCII 0x00..0x1F or Latin-1 0x80..0x9F control character"
    ,     UnicodeCategory "Format" "Cf" cf "invisible formatting indicator"
    ,     UnicodeCategory "Private_Use" "Co" co "any code point reserved for private use"
    ,     UnicodeCategory "Surrogate" "Cs" cs "one half of a surrogate pair in UTF-16 encoding"
    ,     UnicodeCategory "Unassigned" "Cn" cn "any code point to which no character has been assigned.properties" ]
    where
        cat category = build ((category ==) . generalCategory)
        ll = cat LowercaseLetter
        lu = cat UppercaseLetter
        lt = cat TitlecaseLetter
        la = ll `union` lu `union` lt
        lm = cat ModifierLetter
        lo = cat OtherLetter
        l = la `union` lm `union` lo
        mn = cat NonSpacingMark
        mc = cat SpacingCombiningMark
        me = cat EnclosingMark
        m = mn `union` mc `union` me
        zs = cat Space
        zl = cat LineSeparator
        zp = cat ParagraphSeparator
        z = zs `union` zl `union` zp
        sm = cat MathSymbol
        sc = cat CurrencySymbol
        sk = cat ModifierSymbol
        so = cat OtherSymbol
        s = sm `union` sc `union` sk `union` so
        nd = cat DecimalNumber
        nl = cat LetterNumber
        no = cat OtherNumber
        n = nd `union` nl `union` no
        pd = cat DashPunctuation
        ps = cat OpenPunctuation
        pe = cat ClosePunctuation
        pi = cat InitialQuote
        pf = cat FinalQuote
        pc = cat ConnectorPunctuation
        po = cat OtherPunctuation
        p = pd `union` ps `union` pe `union` pi `union` pf `union` pc `union` po
        cc = cat Control
        cf = cat Format
        co = cat PrivateUse
        cs = cat Surrogate
        cn = cat NotAssigned
        c = cc `union` cf `union` co `union` cs `union` cn
        
-- Haskell character classes from Data.Char
control, space, lower, upper, alpha, alphaNum, print, digit, octDigit, letter, mark, number, punctuation, symbol, separator, ascii, latin1, asciiUpper, asciiLower :: CharSet
control = build isControl
space = build isSpace
lower = build isLower
upper = build isUpper
alpha = build isAlpha
alphaNum = build isAlphaNum
print = build isPrint
digit = build isDigit
octDigit = build isOctDigit
letter = build isLetter
mark = build isMark
number = build isNumber
punctuation = build isPunctuation
symbol = build isSymbol
separator = build isSeparator
ascii = build isAscii
latin1 = build isLatin1
asciiUpper = build isAsciiUpper
asciiLower = build isAsciiLower

instance Typeable CharSet where
    typeOf _ = mkTyConApp charSetTyCon []

charSetTyCon :: TyCon
charSetTyCon = mkTyCon "Data.CharSet.CharSet"
{-# NOINLINE charSetTyCon #-}

instance Data CharSet where
    gfoldl k z set | isComplemented set = z complement `k` complement set
                   | otherwise          = z fromList `k` toList set
    toConstr set 
        | isComplemented set = complementConstr
        | otherwise = fromListConstr

    dataTypeOf _ = charSetDataType

    gunfold k z c = case constrIndex c of
        1 -> k (z fromList)
        2 -> k (z complement)
        _ -> error "gunfold"
        
fromListConstr :: Constr
fromListConstr   = mkConstr charSetDataType "fromList" [] Prefix
{-# NOINLINE fromListConstr #-}

complementConstr :: Constr
complementConstr = mkConstr charSetDataType "complement" [] Prefix
{-# NOINLINE complementConstr #-}

charSetDataType :: DataType
charSetDataType  = mkDataType "Data.CharSet.CharSet" [fromListConstr, complementConstr]
{-# NOINLINE charSetDataType #-}