-- ----------------------------------------------------------------------------- -- -- CharSet.hs, part of Alex -- -- (c) Chris Dornan 1995-2000, Simon Marlow 2003 -- -- An abstract CharSet type for Alex. To begin with we'll use Alex's -- original definition of sets as functions, then later will -- transition to something that will work better with Unicode. -- -- ----------------------------------------------------------------------------} {-# LANGUAGE OverloadedLists #-} module CharSet ( setSingleton, Encoding(..), Byte, ByteSet, byteSetSingleton, byteRanges, byteSetRange, CharSet, -- abstract emptyCharSet, charSetSingleton, charSet, charSetMinus, charSetComplement, charSetRange, charSetUnion, charSetQuote, setUnions, byteSetToArray, byteSetElems, byteSetElem ) where import Data.Array ( Array, array ) import Data.Char ( chr, ord ) import Data.Maybe ( catMaybes ) import Data.Word ( Word8 ) import Data.List.NonEmpty ( pattern (:|), (<|) ) import qualified Data.List.NonEmpty as List1 import UTF8 ( List1, encode ) import Data.Ranged ( Boundary( BoundaryAbove, BoundaryAboveAll, BoundaryBelow, BoundaryBelowAll ) , DiscreteOrdered, Range( Range ), RSet , makeRangedSet , rSetDifference, rSetEmpty, rSetHas, rSetNegation, rSetRanges, rSetUnion, rSingleton ) -- import Data.Semigroup (sconcat) -- import qualified Data.Foldable as Fold type Byte = Word8 -- Implementation as functions type CharSet = RSet Char type ByteSet = RSet Byte -- type Utf8Set = RSet [Byte] type Utf8Range = Span (List1 Byte) data Encoding = Latin1 | UTF8 deriving (Eq, Show) emptyCharSet :: CharSet emptyCharSet = rSetEmpty byteSetElem :: ByteSet -> Byte -> Bool byteSetElem = rSetHas charSetSingleton :: Char -> CharSet charSetSingleton = rSingleton setSingleton :: DiscreteOrdered a => a -> RSet a setSingleton = rSingleton charSet :: [Char] -> CharSet charSet = setUnions . fmap charSetSingleton charSetMinus :: CharSet -> CharSet -> CharSet charSetMinus = rSetDifference charSetUnion :: CharSet -> CharSet -> CharSet charSetUnion = rSetUnion setUnions :: DiscreteOrdered a => [RSet a] -> RSet a setUnions = foldr rSetUnion rSetEmpty charSetComplement :: CharSet -> CharSet charSetComplement = rSetNegation charSetRange :: Char -> Char -> CharSet charSetRange c1 c2 = makeRangedSet [Range (BoundaryBelow c1) (BoundaryAbove c2)] {-# INLINE bytes #-} bytes :: [Byte] bytes = [minBound..maxBound] byteSetToArray :: ByteSet -> Array Byte Bool byteSetToArray set = array (minBound, maxBound) [(c, rSetHas set c) | c <- bytes] byteSetElems :: ByteSet -> [Byte] byteSetElems set = filter (rSetHas set) bytes charToRanges :: Encoding -> CharSet -> [Utf8Range] charToRanges Latin1 = map (fmap ((:| []) . fromIntegral . ord)) -- Span [Byte] . catMaybes . fmap (charRangeToCharSpan False) . rSetRanges charToRanges UTF8 = concat -- Span [Byte] . fmap toUtfRange -- [Span [Byte]] . fmap (fmap UTF8.encode) -- Span [Byte] . catMaybes . fmap (charRangeToCharSpan True) . rSetRanges -- | Turns a range of characters expressed as a pair of UTF-8 byte sequences into a set of ranges, in which each range of the resulting set is between pairs of sequences of the same length toUtfRange :: Span (List1 Byte) -> [Span (List1 Byte)] toUtfRange (Span x y) = List1.toList $ fix x y fix :: List1 Byte -> List1 Byte -> List1 (Span (List1 Byte)) fix x y | length x == length y = [Span x y] | length x == 1 = Span x [0x7F] <| fix [0xC2,0x80] y | length x == 2 = Span x [0xDF,0xBF] <| fix [0xE0,0x80,0x80] y | length x == 3 = Span x [0xEF,0xBF,0xBF] <| fix [0xF0,0x80,0x80,0x80] y | otherwise = error "fix: incorrect input given" byteRangeToBytePair :: Span a -> (a, a) byteRangeToBytePair (Span x y) = (x, y) data Span a = Span a a -- lower bound inclusive, higher bound exclusive -- (SDM: upper bound inclusive, surely?) instance Functor Span where fmap f (Span x y) = Span (f x) (f y) charRangeToCharSpan :: Bool -> Range Char -> Maybe (Span Char) charRangeToCharSpan _ (Range BoundaryAboveAll _) = Nothing charRangeToCharSpan _ (Range (BoundaryAbove c) _) | c == maxBound = Nothing charRangeToCharSpan _ (Range _ BoundaryBelowAll) = Nothing charRangeToCharSpan _ (Range _ (BoundaryBelow c)) | c == minBound = Nothing charRangeToCharSpan uni (Range x y) = Just (Span (l x) (h y)) where l b = case b of BoundaryBelowAll -> '\0' BoundaryBelow a -> a BoundaryAbove a -> succ a BoundaryAboveAll -> error "panic: charRangeToCharSpan" h b = case b of BoundaryBelowAll -> error "panic: charRangeToCharSpan" BoundaryBelow a -> pred a BoundaryAbove a -> a BoundaryAboveAll | uni -> chr 0x10ffff | otherwise -> chr 0xff byteRanges :: Encoding -> CharSet -> [(List1 Byte, List1 Byte)] byteRanges enc = fmap byteRangeToBytePair . charToRanges enc byteSetRange :: Byte -> Byte -> ByteSet byteSetRange c1 c2 = makeRangedSet [Range (BoundaryBelow c1) (BoundaryAbove c2)] byteSetSingleton :: Byte -> ByteSet byteSetSingleton = rSingleton -- TODO: More efficient generated code! charSetQuote :: CharSet -> String charSetQuote s = "(\\c -> " ++ foldr (\x y -> x ++ " || " ++ y) "False" (map quoteRange (rSetRanges s)) ++ ")" where quoteRange (Range l h) = quoteL l ++ " && " ++ quoteH h quoteL (BoundaryAbove a) = "c > " ++ show a quoteL (BoundaryBelow a) = "c >= " ++ show a quoteL (BoundaryAboveAll) = "False" quoteL (BoundaryBelowAll) = "True" quoteH (BoundaryAbove a) = "c <= " ++ show a quoteH (BoundaryBelow a) = "c < " ++ show a quoteH (BoundaryAboveAll) = "True" quoteH (BoundaryBelowAll) = "False"