{-# LANGUAGE Haskell2010 #-}
module Data.Monoid.Instances.ByteString.UTF8 (
ByteStringUTF8(..), decode
)
where
import Control.Exception (assert)
import Data.Bits ((.&.), (.|.), shiftL, shiftR)
import Data.Char (chr, ord, isDigit, isPrint)
import qualified Data.Foldable as Foldable
import qualified Data.List as List
import Data.Maybe (fromMaybe, isJust, isNothing)
import Data.String (IsString(fromString))
import Data.Word (Word8)
import Data.ByteString (ByteString)
import qualified Data.ByteString as ByteString
import qualified Data.ByteString.Char8 as ByteString.Char8
import Data.ByteString.Internal (w2c)
import Data.ByteString.Unsafe (unsafeDrop, unsafeHead, unsafeTail, unsafeTake, unsafeIndex)
import Data.Semigroup
import Data.Monoid (Monoid(mempty, mappend))
import Data.Monoid.Cancellative (LeftReductiveMonoid(..), LeftCancellativeMonoid, LeftGCDMonoid(..))
import Data.Monoid.Null (MonoidNull(..), PositiveMonoid)
import Data.Monoid.Factorial (FactorialMonoid(..))
import Data.Monoid.Textual (TextualMonoid(..))
import qualified Data.Monoid.Factorial as Factorial (FactorialMonoid(..))
import qualified Data.Monoid.Textual as Textual (TextualMonoid(..))
import Prelude hiding (any, drop, dropWhile, foldl, foldl1, foldr, foldr1, scanl, scanr, scanl1, scanr1,
map, concatMap, break, span)
newtype ByteStringUTF8 = ByteStringUTF8 ByteString deriving (Eq, Ord)
decode :: ByteString -> (ByteStringUTF8, ByteString)
decode bs
| ByteString.null bs || l < 0x80 = (ByteStringUTF8 bs, mempty)
| l >= 0xC0 = (ByteStringUTF8 (ByteString.init bs), ByteString.singleton l)
| ByteString.null prefix = (mempty, bs)
| otherwise =
case toChar (ByteString.last prefix) suffix
of Nothing -> (ByteStringUTF8 (ByteString.init prefix), drop (ByteString.length prefix - 1) bs)
Just{} -> (ByteStringUTF8 bs, mempty)
where (prefix, suffix) = ByteString.breakEnd byteStartsCharacter bs
l = ByteString.last bs
instance Semigroup ByteStringUTF8 where
ByteStringUTF8 a <> ByteStringUTF8 b = ByteStringUTF8 (a <> b)
{-# INLINE (<>) #-}
instance Monoid ByteStringUTF8 where
mempty = ByteStringUTF8 ByteString.empty
{-# INLINE mempty #-}
ByteStringUTF8 a `mappend` ByteStringUTF8 b = ByteStringUTF8 (a `mappend` b)
{-# INLINE mappend #-}
instance MonoidNull ByteStringUTF8 where
null (ByteStringUTF8 b) = ByteString.null b
{-# INLINE null #-}
instance LeftReductiveMonoid ByteStringUTF8 where
stripPrefix (ByteStringUTF8 a) (ByteStringUTF8 b) = fmap ByteStringUTF8 (stripPrefix a b)
{-# INLINE stripPrefix #-}
ByteStringUTF8 a `isPrefixOf` ByteStringUTF8 b = a `isPrefixOf` b
{-# INLINE isPrefixOf #-}
instance LeftCancellativeMonoid ByteStringUTF8
instance LeftGCDMonoid ByteStringUTF8 where
commonPrefix (ByteStringUTF8 a) (ByteStringUTF8 b) = ByteStringUTF8 (commonPrefix a b)
{-# INLINE commonPrefix #-}
stripCommonPrefix (ByteStringUTF8 a) (ByteStringUTF8 b) = wrapTriple (stripCommonPrefix a b)
{-# INLINE stripCommonPrefix #-}
instance Show ByteStringUTF8 where
showsPrec _ bs s0 = '"' : Textual.foldr showsBytes showsChar ('"' : s0) bs
where showsBytes (ByteStringUTF8 b) s = '\\' : shows (ByteString.unpack b) s
showsChar c s
| isPrint c = c : s
| h:_ <- s, isDigit h = "\\" ++ show (ord c) ++ "\\&" ++ s
| otherwise = "\\" ++ show (ord c) ++ s
instance IsString ByteStringUTF8 where
fromString = ByteStringUTF8 . Foldable.foldMap fromChar
{-# INLINE fromString #-}
instance PositiveMonoid ByteStringUTF8
instance FactorialMonoid ByteStringUTF8 where
splitPrimePrefix utf8@(ByteStringUTF8 bs)
| ByteString.null bs = Nothing
| unsafeHead bs < 0x80 = Just (wrapPair $ ByteString.splitAt 1 bs)
| otherwise = case ByteString.findIndex byteStartsCharacter (unsafeTail bs)
of Just i -> Just (wrapPair $ ByteString.splitAt (succ i) bs)
Nothing -> Just (utf8, ByteStringUTF8 $ ByteString.empty)
{-# INLINABLE splitPrimePrefix #-}
splitPrimeSuffix (ByteStringUTF8 bs)
| ByteString.null bs = Nothing
| ByteString.null prefix = Just (wrapPair splitBS)
| not (ByteString.null suffix) && ByteString.last prefix < 0x80 = Just (wrapPair splitBS)
| otherwise = Just (wrapPair $ ByteString.splitAt (pred $ ByteString.length prefix) bs)
where splitBS@(prefix, suffix) = ByteString.breakEnd byteStartsCharacter bs
{-# INLINABLE splitPrimeSuffix #-}
primePrefix utf8@(ByteStringUTF8 bs)
| ByteString.null bs = utf8
| unsafeHead bs < 0x80 = ByteStringUTF8 (ByteString.take 1 bs)
| otherwise = case ByteString.findIndex byteStartsCharacter (unsafeTail bs)
of Just i -> ByteStringUTF8 (ByteString.take (succ i) bs)
Nothing -> utf8
{-# INLINABLE primePrefix #-}
factors (ByteStringUTF8 bs) = List.map ByteStringUTF8 $ ByteString.groupBy continued bs
where continued a b = a >= 0x80 && b >= 0x80 && b < 0xC0
{-# INLINABLE factors #-}
length (ByteStringUTF8 bs) = fst (ByteString.foldl' count (0, False) bs)
where count (n, high) byte | byte < 0x80 = (succ n, False)
| byte < 0xC0 = (if high then n else succ n, True)
| otherwise = (succ n, True)
{-# INLINABLE length #-}
foldl f a0 (ByteStringUTF8 bs) = List.foldl f' a0 (groupASCII bs)
where f' a b | unsafeHead b < 0x80 = ByteString.foldl f'' a b
| otherwise = f a (ByteStringUTF8 b)
f'' a w = f a (ByteStringUTF8 $ ByteString.singleton w)
{-# INLINABLE foldl #-}
foldl' f a0 (ByteStringUTF8 bs) = List.foldl' f' a0 (groupASCII bs)
where f' a b | unsafeHead b < 0x80 = ByteString.foldl' f'' a b
| otherwise = f a (ByteStringUTF8 b)
f'' a w = f a (ByteStringUTF8 $ ByteString.singleton w)
{-# INLINABLE foldl' #-}
foldr f a0 (ByteStringUTF8 bs) = List.foldr f' a0 (groupASCII bs)
where f' b a | unsafeHead b < 0x80 = ByteString.foldr f'' a b
| otherwise = f (ByteStringUTF8 b) a
f'' w a = f (ByteStringUTF8 $ ByteString.singleton w) a
{-# INLINABLE foldr #-}
splitAt n (ByteStringUTF8 bs) = wrapPair (ByteString.splitAt (charStartIndex n bs) bs)
{-# INLINE splitAt #-}
take n (ByteStringUTF8 bs) = ByteStringUTF8 (ByteString.take (charStartIndex n bs) bs)
{-# INLINE take #-}
drop n (ByteStringUTF8 bs) = ByteStringUTF8 (ByteString.drop (charStartIndex n bs) bs)
{-# INLINE drop #-}
dropWhile p (ByteStringUTF8 bs0) = dropASCII bs0
where dropASCII bs =
let suffix = ByteString.dropWhile (\w-> w < 0x80 && p (ByteStringUTF8 $ ByteString.singleton w)) bs
in if ByteString.null suffix || unsafeHead suffix < 0x80
then ByteStringUTF8 suffix
else dropMultiByte suffix
dropMultiByte bs =
let utf8 = ByteStringUTF8 bs
in case ByteString.findIndex byteStartsCharacter (unsafeTail bs)
of Nothing -> if p utf8 then ByteStringUTF8 ByteString.empty else utf8
Just i -> let (hd, tl) = ByteString.splitAt (succ i) bs
in if p (ByteStringUTF8 hd)
then dropASCII tl
else utf8
{-# INLINE dropWhile #-}
takeWhile p utf8@(ByteStringUTF8 bs) =
ByteStringUTF8 $ ByteString.take (ByteString.length bs - ByteString.length s) bs
where (ByteStringUTF8 s) = Factorial.dropWhile p utf8
{-# INLINE takeWhile #-}
span p utf8@(ByteStringUTF8 bs) =
(ByteStringUTF8 $ ByteString.take (ByteString.length bs - ByteString.length s) bs, suffix)
where suffix@(ByteStringUTF8 s) = Factorial.dropWhile p utf8
{-# INLINE span #-}
break p = Factorial.span (not . p)
{-# INLINE break #-}
spanMaybe s0 f (ByteStringUTF8 bs0) = (ByteStringUTF8 $ ByteString.take (ByteString.length bs0 - ByteString.length dropped) bs0,
ByteStringUTF8 dropped,
s')
where (dropped, s') = dropASCII s0 bs0
dropASCII s bs =
let suffix = ByteString.drop index bs
(index, s1) = ByteString.foldr f8 id bs (0, s)
f8 w cont (i, s2)
| w < 0x80, Just s3 <- f s2 (ByteStringUTF8 $ ByteString.singleton w) =
let i' = succ i :: Int in seq i' $ cont (i', s3)
| otherwise = (i, s2)
in if ByteString.null suffix || unsafeHead suffix < 0x80
then (suffix, s1)
else dropMultiByte s1 suffix
dropMultiByte s bs =
case ByteString.findIndex byteStartsCharacter (unsafeTail bs)
of Nothing -> case f s (ByteStringUTF8 bs)
of Just s1 -> (ByteString.empty, s1)
Nothing -> (bs, s)
Just i -> let (hd, tl) = ByteString.splitAt (succ i) bs
in case f s (ByteStringUTF8 hd)
of Just s1 -> dropASCII s1 tl
Nothing -> (bs, s)
{-# INLINE spanMaybe #-}
spanMaybe' s0 f (ByteStringUTF8 bs0) = (ByteStringUTF8 $
ByteString.take (ByteString.length bs0 - ByteString.length dropped) bs0,
ByteStringUTF8 dropped,
s')
where (dropped, s') = dropASCII s0 bs0
dropASCII s bs =
let suffix = ByteString.drop index bs
(index, s1) = ByteString.foldr f8 id bs (0, s)
f8 w cont (i, s2)
| w < 0x80, Just s3 <- f s2 (ByteStringUTF8 $ ByteString.singleton w) =
let i' = succ i :: Int in seq i' $ seq s3 $ cont (i', s3)
| otherwise = (i, s)
in if ByteString.null suffix || unsafeHead suffix < 0x80
then (suffix, s1)
else dropMultiByte s1 suffix
dropMultiByte s bs =
case ByteString.findIndex byteStartsCharacter (unsafeTail bs)
of Nothing -> case f s (ByteStringUTF8 bs)
of Just s1 -> seq s1 (ByteString.empty, s1)
Nothing -> (bs, s)
Just i -> let (hd, tl) = ByteString.splitAt (succ i) bs
in case f s (ByteStringUTF8 hd)
of Just s1 -> seq s1 (dropASCII s1 tl)
Nothing -> (bs, s)
{-# INLINE spanMaybe' #-}
reverse (ByteStringUTF8 bs) =
ByteStringUTF8 (ByteString.concat $ List.reverse $ List.map reverseASCII $ groupASCII bs)
where reverseASCII b | unsafeHead b < 0x80 = ByteString.reverse b
| otherwise = b
{-# INLINABLE reverse #-}
instance TextualMonoid ByteStringUTF8 where
singleton = ByteStringUTF8 . fromChar
{-# INLINE singleton #-}
splitCharacterPrefix (ByteStringUTF8 bs) = ByteString.uncons bs >>= uncurry toChar
{-# INLINE splitCharacterPrefix #-}
foldl ft fc a0 (ByteStringUTF8 bs) = case ByteString.Char8.foldl f (a0, []) bs
of (a, []) -> a
(a, acc) -> multiByte a acc
where f (a, []) c | c < '\x80' = (fc a c, [])
| otherwise = (a, [fromIntegral $ ord c])
f (a, acc) c | c < '\x80' = (fc (multiByte a acc) c, [])
| c < '\xC0' = (a, fromIntegral (ord c) : acc)
| otherwise = (multiByte a acc, [fromIntegral $ ord c])
multiByte a acc = reverseBytesToChar (ft a . ByteStringUTF8) (fc a) acc
{-# INLINE foldl #-}
foldl' ft fc a0 (ByteStringUTF8 bs) = case ByteString.Char8.foldl' f (a0, []) bs
of (a, []) -> a
(a, acc) -> multiByte a acc
where f (a, []) c | c < '\x80' = (fc a c, [])
| otherwise = seq a (a, [fromIntegral $ ord c])
f (a, acc) c | seq a c < '\x80' = let a' = multiByte a acc in seq a' (fc a' c, [])
| c < '\xC0' = (a, fromIntegral (ord c) : acc)
| otherwise = let a' = multiByte a acc in seq a' (a', [fromIntegral $ ord c])
multiByte a acc = reverseBytesToChar (ft a . ByteStringUTF8) (fc a) acc
{-# INLINE foldl' #-}
foldr ft fc a0 (ByteStringUTF8 bs) = case ByteString.Char8.foldr f (a0, []) bs
of (a, []) -> a
(a, acc) -> multiByte a acc
where f c (a, []) | c < '\x80' = (fc c a, [])
| c < '\xC0' = (a, [fromIntegral $ ord c])
| otherwise = (ft (ByteStringUTF8 $ ByteString.Char8.singleton c) a, [])
f c (a, acc) | c < '\x80' = (fc c (ft (ByteStringUTF8 $ ByteString.pack acc) a), [])
| c < '\xC0' = (a, fromIntegral (ord c) : acc)
| otherwise = (multiByte a (fromIntegral (ord c) : acc), [])
multiByte a acc = bytesToChar ((`ft` a) . ByteStringUTF8) (`fc` a) acc
{-# INLINE foldr #-}
dropWhile pb pc (ByteStringUTF8 bs) = ByteStringUTF8 $ dropASCII bs
where dropASCII rest = case ByteString.Char8.findIndex (\c-> c > '\x7f' || not (pc c)) rest
of Nothing -> ByteString.empty
Just j -> let rest' = unsafeDrop j rest
in if unsafeHead rest' > 0x7f
then dropMultiByte rest'
else rest'
dropMultiByte rest = case splitCharacterPrefix (ByteStringUTF8 rest)
of Just (c, ByteStringUTF8 rest') | pc c -> dropASCII rest'
Nothing -> let j = succ (headIndex $ drop 1 rest)
in if pb (ByteStringUTF8 $ ByteString.take j rest)
then dropASCII (unsafeDrop j rest)
else rest
_ -> rest
{-# INLINE dropWhile #-}
takeWhile pb pc utf8@(ByteStringUTF8 bs) =
ByteStringUTF8 $ unsafeTake (ByteString.length bs - ByteString.length suffix) bs
where ByteStringUTF8 suffix = Textual.dropWhile pb pc utf8
{-# INLINE takeWhile #-}
span pb pc utf8@(ByteStringUTF8 bs) = (ByteStringUTF8 $ unsafeTake (ByteString.length bs - ByteString.length suffix') bs, suffix)
where suffix@(ByteStringUTF8 suffix') = Textual.dropWhile pb pc utf8
{-# INLINE span #-}
break pb pc = Textual.span (not . pb) (not . pc)
{-# INLINE break #-}
spanMaybe s0 ft fc (ByteStringUTF8 bs) =
let inner i s
| i < len =
let w = unsafeIndex bs i
in if w < 0x80
then case fc s (w2c w)
of Just s' -> inner (i + 1) s'
Nothing -> done i s
else case splitCharacterPrefix (ByteStringUTF8 $ unsafeDrop i bs)
of Just (c, ByteStringUTF8 rest) | Just s' <- fc s c -> inner (len - ByteString.length rest) s'
Nothing -> let j = succ (headIndex $ drop (i + 1) bs)
in case ft s (ByteStringUTF8 $ ByteString.take j $ unsafeDrop i bs)
of Just s' -> inner (i + j) s'
Nothing -> done i s
_ -> done i s
| otherwise = done i s
done i s = i `seq` s `seq` (ByteStringUTF8 $ unsafeTake i bs, ByteStringUTF8 $ unsafeDrop i bs, s)
len = ByteString.length bs
in inner 0 s0
{-# INLINE spanMaybe #-}
spanMaybe' s0 ft fc (ByteStringUTF8 bs) =
let inner i s
| i < len =
s `seq`
let w = unsafeIndex bs i
in if w < 0x80
then case fc s (w2c w)
of Just s' -> inner (i + 1) s'
Nothing -> done i s
else case splitCharacterPrefix (ByteStringUTF8 $ unsafeDrop i bs)
of Just (c, ByteStringUTF8 rest) | Just s' <- fc s c -> inner (len - ByteString.length rest) s'
Nothing -> let j = succ (headIndex $ drop (i + 1) bs)
in case ft s (ByteStringUTF8 $ ByteString.take j $ unsafeDrop i bs)
of Just s' -> inner (i + j) s'
Nothing -> done i s
_ -> done i s
| otherwise = done i s
done i s = i `seq` s `seq` (ByteStringUTF8 $ unsafeTake i bs, ByteStringUTF8 $ unsafeDrop i bs, s)
len = ByteString.length bs
in inner 0 s0
{-# INLINE spanMaybe' #-}
find p (ByteStringUTF8 bs0) = loop bs0
where loop bs = case ByteString.Char8.findIndex (\c-> c >= '\x80' || p c) bs
of Nothing -> Nothing
Just i -> let x = unsafeIndex bs i
bs' = unsafeDrop (i + 1) bs
in if x < 0x80
then Just (w2c x)
else case toChar x bs'
of Just (c, ByteStringUTF8 rest) | p c -> Just c
| otherwise -> loop rest
Nothing -> loop (ByteString.dropWhile (not . byteStartsCharacter) bs')
{-# INLINE find #-}
any p utf8 = isJust (find p utf8)
{-# INLINE any #-}
all p utf8 = isNothing (find (not . p) utf8)
{-# INLINE all #-}
elem c utf8@(ByteStringUTF8 bs)
| c < '\x80' = ByteString.Char8.elem c bs
| otherwise = any (== c) utf8
{-# INLINE elem #-}
reverseBytesToChar :: (ByteString -> a) -> (Char -> a) -> [Word8] -> a
reverseBytesToChar ft fc [w] = if w < 0x80 then fc (w2c w) else ft (ByteString.singleton w)
reverseBytesToChar ft fc [b0, b1] =
assert (0x80 <= b0 && b0 < 0xC0) $
if 0xC2 <= b1 && b1 < 0xE0
then fc (chr (shiftL (fromIntegral b1 .&. 0x1F) 6 .|. fromIntegral b0 .&. 0x3F))
else ft (ByteString.pack [b1, b0])
reverseBytesToChar ft fc [b0, b1, b2] =
assert (0x80 <= b0 && b0 < 0xC0 && 0x80 <= b1 && b1 < 0xC0) $
if (0xE0 < b2 || 0xE0 == b2 && 0xA0 <= b1) && b2 < 0xF0
then fc (chr (shiftL (fromIntegral b2 .&. 0xF) 12
.|. shiftL (fromIntegral b1 .&. 0x3F) 6
.|. fromIntegral b0 .&. 0x3F))
else ft (ByteString.pack [b2, b1, b0])
reverseBytesToChar ft fc [b0, b1, b2, b3] =
assert (0x80 <= b0 && b0 < 0xC0 && 0x80 <= b1 && b1 < 0xC0 && 0x80 <= b2 && b2 < 0xC0) $
if (0xF0 < b3 || 0xF0 == b3 && 0x90 <= b2) && b3 < 0xF5 && (b3 < 0xF4 || b2 < 0x90)
then fc (chr (shiftL (fromIntegral b3 .&. 0x7) 18
.|. shiftL (fromIntegral b2 .&. 0x3F) 12
.|. shiftL (fromIntegral b1 .&. 0x3F) 6
.|. fromIntegral b0 .&. 0x3F))
else ft (ByteString.pack [b3, b2, b1, b0])
reverseBytesToChar ft _fc bytes = ft (ByteString.reverse $ ByteString.pack bytes)
bytesToChar :: (ByteString -> a) -> (Char -> a) -> [Word8] -> a
bytesToChar ft fc [w] = if w < 0x80 then fc (w2c w) else ft (ByteString.singleton w)
bytesToChar ft fc bytes@[b1, b0] =
assert (0x80 <= b0 && b0 < 0xC0) $
if 0xC2 <= b1 && b1 < 0xE0
then fc (chr (shiftL (fromIntegral b1 .&. 0x1F) 6 .|. fromIntegral b0 .&. 0x3F))
else ft (ByteString.pack bytes)
bytesToChar ft fc bytes@[b2, b1, b0] =
assert (0x80 <= b0 && b0 < 0xC0 && 0x80 <= b1 && b1 < 0xC0) $
if (0xE0 < b2 || 0xE0 == b2 && 0xA0 <= b1) && b2 < 0xF0
then fc (chr (shiftL (fromIntegral b2 .&. 0xF) 12
.|. shiftL (fromIntegral b1 .&. 0x3F) 6
.|. fromIntegral b0 .&. 0x3F))
else ft (ByteString.pack bytes)
bytesToChar ft fc bytes@[b3, b2, b1, b0] =
assert (0x80 <= b0 && b0 < 0xC0 && 0x80 <= b1 && b1 < 0xC0 && 0x80 <= b2 && b2 < 0xC0) $
if (0xF0 < b3 || 0xF0 == b3 && 0x90 <= b2) && b3 < 0xF5 && (b3 < 0xF4 || b2 < 0x90)
then fc (chr (shiftL (fromIntegral b3 .&. 0x7) 18
.|. shiftL (fromIntegral b2 .&. 0x3F) 12
.|. shiftL (fromIntegral b1 .&. 0x3F) 6
.|. fromIntegral b0 .&. 0x3F))
else ft (ByteString.pack bytes)
bytesToChar ft _fc bytes = ft (ByteString.pack bytes)
wrapPair :: (ByteString, ByteString) -> (ByteStringUTF8, ByteStringUTF8)
wrapPair (bs1, bs2) = (ByteStringUTF8 bs1, ByteStringUTF8 bs2)
{-# INLINE wrapPair #-}
wrapTriple :: (ByteString, ByteString, ByteString) -> (ByteStringUTF8, ByteStringUTF8, ByteStringUTF8)
wrapTriple (bs1, bs2, bs3) = (ByteStringUTF8 bs1, ByteStringUTF8 bs2, ByteStringUTF8 bs3)
{-# INLINE wrapTriple #-}
fromChar :: Char -> ByteString
fromChar c | c < '\x80' = ByteString.Char8.singleton c
| c < '\x800' = ByteString.pack [0xC0 + fromIntegral (shiftR n 6),
0x80 + fromIntegral (n .&. 0x3F)]
| c < '\x10000' = ByteString.pack [0xE0 + fromIntegral (shiftR n 12),
0x80 + fromIntegral (shiftR n 6 .&. 0x3F),
0x80 + fromIntegral (n .&. 0x3F)]
| n < 0x200000 = ByteString.pack [0xF0 + fromIntegral (shiftR n 18),
0x80 + fromIntegral (shiftR n 12 .&. 0x3F),
0x80 + fromIntegral (shiftR n 6 .&. 0x3F),
0x80 + fromIntegral (n .&. 0x3F)]
| otherwise = error ("Data.Char.ord '" ++ (c : "' >=0x200000"))
where n = ord c
toChar :: Word8 -> ByteString -> Maybe (Char, ByteStringUTF8)
toChar hd tl | hd < 0x80 = Just (w2c hd, ByteStringUTF8 tl)
| hd < 0xC2 = Nothing
| hd < 0xE0 = do (b0, t0) <- ByteString.uncons tl
if headIndex tl == 1
then return (chr (shiftL (fromIntegral hd .&. 0x1F) 6
.|. fromIntegral b0 .&. 0x3F),
ByteStringUTF8 t0)
else Nothing
| hd < 0xF0 = do (b1, t1) <- ByteString.uncons tl
(b0, t0) <- ByteString.uncons t1
if (hd > 0xE0 || b1 >= 0xA0) && headIndex tl == 2
then return (chr (shiftL (fromIntegral hd .&. 0xF) 12
.|. shiftL (fromIntegral b1 .&. 0x3F) 6
.|. fromIntegral b0 .&. 0x3F),
ByteStringUTF8 t0)
else Nothing
| hd < 0xF5 = do (b2, t2) <- ByteString.uncons tl
(b1, t1) <- ByteString.uncons t2
(b0, t0) <- ByteString.uncons t1
if (hd > 0xF0 || b2 >= 0x90) && (hd < 0xF4 || b2 < 0x90) && headIndex tl == 3
then return (chr (shiftL (fromIntegral hd .&. 0x7) 18
.|. shiftL (fromIntegral b2 .&. 0x3F) 12
.|. shiftL (fromIntegral b1 .&. 0x3F) 6
.|. fromIntegral b0 .&. 0x3F),
ByteStringUTF8 t0)
else Nothing
| otherwise = Nothing
groupASCII :: ByteString -> [ByteString]
groupASCII = ByteString.groupBy continued
where continued a b = (a < 0x80) == (b < 0x80) && b < 0xC0
{-# INLINE groupASCII #-}
headIndex :: ByteString -> Int
headIndex bs = fromMaybe (ByteString.length bs) $ ByteString.findIndex byteStartsCharacter bs
{-# INLINE headIndex #-}
byteStartsCharacter :: Word8 -> Bool
byteStartsCharacter b = b < 0x80 || b >= 0xC0
{-# INLINE byteStartsCharacter #-}
charStartIndex :: Int -> ByteString -> Int
charStartIndex n _ | n <= 0 = 0
charStartIndex n0 bs = ByteString.foldr count (const $ ByteString.length bs) bs (n0, False, 0)
where count byte _ (0, high, i) | byte < 0x80 || byte >= 0xC0 || not high = i
count byte cont (n, high, i) | byte < 0x80 = cont (pred n, False, succ i)
| byte < 0xC0 = cont (if high then n else pred n, True, succ i)
| otherwise = cont (pred n, True, succ i)
{-# INLINE charStartIndex #-}