-- Character classification {-# LANGUAGE CPP #-} module Ctype ( is_ident -- Char# -> Bool , is_symbol -- Char# -> Bool , is_any -- Char# -> Bool , is_space -- Char# -> Bool , is_lower -- Char# -> Bool , is_upper -- Char# -> Bool , is_digit -- Char# -> Bool , is_alphanum -- Char# -> Bool , is_decdigit, is_hexdigit, is_octdigit, is_bindigit , hexDigit, octDecDigit ) where #include "HsVersions.h" import GhcPrelude import Data.Int ( Int32 ) import Data.Bits ( Bits((.&.)) ) import Data.Char ( ord, chr ) import Panic -- Bit masks cIdent, cSymbol, cAny, cSpace, cLower, cUpper, cDigit :: Int cIdent = 1 cSymbol = 2 cAny = 4 cSpace = 8 cLower = 16 cUpper = 32 cDigit = 64 -- | The predicates below look costly, but aren't, GHC+GCC do a great job -- at the big case below. {-# INLINE is_ctype #-} is_ctype :: Int -> Char -> Bool is_ctype mask c = (fromIntegral (charType c) .&. fromIntegral mask) /= (0::Int32) is_ident, is_symbol, is_any, is_space, is_lower, is_upper, is_digit, is_alphanum :: Char -> Bool is_ident = is_ctype cIdent is_symbol = is_ctype cSymbol is_any = is_ctype cAny is_space = is_ctype cSpace is_lower = is_ctype cLower is_upper = is_ctype cUpper is_digit = is_ctype cDigit is_alphanum = is_ctype (cLower+cUpper+cDigit) -- Utils hexDigit :: Char -> Int hexDigit c | is_decdigit c = ord c - ord '0' | otherwise = ord (to_lower c) - ord 'a' + 10 octDecDigit :: Char -> Int octDecDigit c = ord c - ord '0' is_decdigit :: Char -> Bool is_decdigit c = c >= '0' && c <= '9' is_hexdigit :: Char -> Bool is_hexdigit c = is_decdigit c || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') is_octdigit :: Char -> Bool is_octdigit c = c >= '0' && c <= '7' is_bindigit :: Char -> Bool is_bindigit c = c == '0' || c == '1' to_lower :: Char -> Char to_lower c | c >= 'A' && c <= 'Z' = chr (ord c - (ord 'A' - ord 'a')) | otherwise = c -- | We really mean .|. instead of + below, but GHC currently doesn't do -- any constant folding with bitops. *sigh* charType :: Char -> Int charType c = case c of '\0' -> 0 -- \000 '\1' -> 0 -- \001 '\2' -> 0 -- \002 '\3' -> 0 -- \003 '\4' -> 0 -- \004 '\5' -> 0 -- \005 '\6' -> 0 -- \006 '\7' -> 0 -- \007 '\8' -> 0 -- \010 '\9' -> cSpace -- \t (not allowed in strings, so !cAny) '\10' -> cSpace -- \n (ditto) '\11' -> cSpace -- \v (ditto) '\12' -> cSpace -- \f (ditto) '\13' -> cSpace -- ^M (ditto) '\14' -> 0 -- \016 '\15' -> 0 -- \017 '\16' -> 0 -- \020 '\17' -> 0 -- \021 '\18' -> 0 -- \022 '\19' -> 0 -- \023 '\20' -> 0 -- \024 '\21' -> 0 -- \025 '\22' -> 0 -- \026 '\23' -> 0 -- \027 '\24' -> 0 -- \030 '\25' -> 0 -- \031 '\26' -> 0 -- \032 '\27' -> 0 -- \033 '\28' -> 0 -- \034 '\29' -> 0 -- \035 '\30' -> 0 -- \036 '\31' -> 0 -- \037 '\32' -> cAny + cSpace -- '\33' -> cAny + cSymbol -- ! '\34' -> cAny -- " '\35' -> cAny + cSymbol -- # '\36' -> cAny + cSymbol -- $ '\37' -> cAny + cSymbol -- % '\38' -> cAny + cSymbol -- & '\39' -> cAny + cIdent -- ' '\40' -> cAny -- ( '\41' -> cAny -- ) '\42' -> cAny + cSymbol -- * '\43' -> cAny + cSymbol -- + '\44' -> cAny -- , '\45' -> cAny + cSymbol -- - '\46' -> cAny + cSymbol -- . '\47' -> cAny + cSymbol -- / '\48' -> cAny + cIdent + cDigit -- 0 '\49' -> cAny + cIdent + cDigit -- 1 '\50' -> cAny + cIdent + cDigit -- 2 '\51' -> cAny + cIdent + cDigit -- 3 '\52' -> cAny + cIdent + cDigit -- 4 '\53' -> cAny + cIdent + cDigit -- 5 '\54' -> cAny + cIdent + cDigit -- 6 '\55' -> cAny + cIdent + cDigit -- 7 '\56' -> cAny + cIdent + cDigit -- 8 '\57' -> cAny + cIdent + cDigit -- 9 '\58' -> cAny + cSymbol -- : '\59' -> cAny -- ; '\60' -> cAny + cSymbol -- < '\61' -> cAny + cSymbol -- = '\62' -> cAny + cSymbol -- > '\63' -> cAny + cSymbol -- ? '\64' -> cAny + cSymbol -- @ '\65' -> cAny + cIdent + cUpper -- A '\66' -> cAny + cIdent + cUpper -- B '\67' -> cAny + cIdent + cUpper -- C '\68' -> cAny + cIdent + cUpper -- D '\69' -> cAny + cIdent + cUpper -- E '\70' -> cAny + cIdent + cUpper -- F '\71' -> cAny + cIdent + cUpper -- G '\72' -> cAny + cIdent + cUpper -- H '\73' -> cAny + cIdent + cUpper -- I '\74' -> cAny + cIdent + cUpper -- J '\75' -> cAny + cIdent + cUpper -- K '\76' -> cAny + cIdent + cUpper -- L '\77' -> cAny + cIdent + cUpper -- M '\78' -> cAny + cIdent + cUpper -- N '\79' -> cAny + cIdent + cUpper -- O '\80' -> cAny + cIdent + cUpper -- P '\81' -> cAny + cIdent + cUpper -- Q '\82' -> cAny + cIdent + cUpper -- R '\83' -> cAny + cIdent + cUpper -- S '\84' -> cAny + cIdent + cUpper -- T '\85' -> cAny + cIdent + cUpper -- U '\86' -> cAny + cIdent + cUpper -- V '\87' -> cAny + cIdent + cUpper -- W '\88' -> cAny + cIdent + cUpper -- X '\89' -> cAny + cIdent + cUpper -- Y '\90' -> cAny + cIdent + cUpper -- Z '\91' -> cAny -- [ '\92' -> cAny + cSymbol -- backslash '\93' -> cAny -- ] '\94' -> cAny + cSymbol -- ^ '\95' -> cAny + cIdent + cLower -- _ '\96' -> cAny -- ` '\97' -> cAny + cIdent + cLower -- a '\98' -> cAny + cIdent + cLower -- b '\99' -> cAny + cIdent + cLower -- c '\100' -> cAny + cIdent + cLower -- d '\101' -> cAny + cIdent + cLower -- e '\102' -> cAny + cIdent + cLower -- f '\103' -> cAny + cIdent + cLower -- g '\104' -> cAny + cIdent + cLower -- h '\105' -> cAny + cIdent + cLower -- i '\106' -> cAny + cIdent + cLower -- j '\107' -> cAny + cIdent + cLower -- k '\108' -> cAny + cIdent + cLower -- l '\109' -> cAny + cIdent + cLower -- m '\110' -> cAny + cIdent + cLower -- n '\111' -> cAny + cIdent + cLower -- o '\112' -> cAny + cIdent + cLower -- p '\113' -> cAny + cIdent + cLower -- q '\114' -> cAny + cIdent + cLower -- r '\115' -> cAny + cIdent + cLower -- s '\116' -> cAny + cIdent + cLower -- t '\117' -> cAny + cIdent + cLower -- u '\118' -> cAny + cIdent + cLower -- v '\119' -> cAny + cIdent + cLower -- w '\120' -> cAny + cIdent + cLower -- x '\121' -> cAny + cIdent + cLower -- y '\122' -> cAny + cIdent + cLower -- z '\123' -> cAny -- { '\124' -> cAny + cSymbol -- | '\125' -> cAny -- } '\126' -> cAny + cSymbol -- ~ '\127' -> 0 -- \177 _ -> panic ("charType: " ++ show c)