{-# LANGUAGE CPP #-}
{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples, NoImplicitPrelude #-}
{-# OPTIONS_GHC -O2 -fno-warn-name-shadowing #-}

-- |
-- Module      :  GHC.Internal.Encoding.UTF8
-- Copyright   :  (c) The University of Glasgow, 1994-2023
-- License     :  see libraries/base/LICENSE
--
-- Maintainer  :  ghc-devs@haskell.org
-- Stability   :  internal
-- Portability :  non-portable (GHC extensions)
--
-- /The API of this module is unstable and not meant to be consumed by the general public./
-- If you absolutely must depend on it, make sure to use a tight upper
-- bound, e.g., @base < 4.X@ rather than @base < 5@, because the interface can
-- change rapidly without much warning.
--
-- Simple UTF-8 codecs supporting non-streaming encoding/decoding.
-- For encoding where codepoints may be broken across buffers,
-- see "GHC.IO.Encoding.UTF8".
--
-- This is one of several UTF-8 implementations provided by GHC; see Note
-- [GHC's many UTF-8 implementations] in "GHC.Encoding.UTF8" for an
-- overview.
--
module GHC.Internal.Encoding.UTF8
    ( -- * Decoding single characters
      utf8DecodeCharAddr#
    , utf8DecodeCharPtr
    , utf8DecodeCharByteArray#
      -- * Decoding strings
    , utf8DecodeByteArray#
    , utf8DecodeForeignPtr
      -- * Counting characters
    , utf8CountCharsByteArray#
      -- * Comparison
    , utf8CompareByteArray#
      -- * Encoding strings
    , utf8EncodePtr
    , utf8EncodeByteArray#
    , utf8EncodedLength
    ) where

import GHC.Types
import GHC.Internal.Base
import GHC.Internal.IO
import GHC.Internal.ST
import GHC.Internal.Word
import GHC.Internal.ForeignPtr
import GHC.Internal.Num
import GHC.Internal.Bits
import GHC.Internal.Real
import GHC.Internal.Ptr

{-
Note [GHC's many UTF-8 implementations]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Currently GHC ships with at least five UTF-8 implementations:

a. the implementation used by GHC in `ghc-boot:GHC.Utils.Encoding`; this can be
   used at a number of types including `Addr#`, `ByteArray#`, `ForeignPtr`,
   `Ptr`, `ShortByteString`, and `ByteString`. Most of this can be removed in
   GHC 9.6+2, when the copies in `base` will become available to `ghc-boot`.

b. the copy of the `ghc-boot` definition now exported by `base:GHC.Encoding.UTF8`.
   This can be used at `Addr#`, `Ptr`, `ByteArray#`, and `ForeignPtr`.

c. the decoder used by `unpackCStringUtf8#` in `ghc-prim:GHC.CString`; this is
   specialised at `Addr#`.

d. the codec used by the IO subsystem in `base:GHC.IO.Encoding.UTF8`; this is
   specialised at `Addr#` but, unlike the above, supports recovery in the presence
   of partial codepoints (since in IO contexts codepoints may be broken across
   buffers)

e. the implementation provided by the `text` library

On its face, this seems a tad silly. On the other hand, these implementations do
materially differ from one another (e.g. in the types they support, the
detail in errors they can report, and the ability to recover from partial
codepoints). Consequently, it's quite unclear that further consolidation
would be worthwhile.

The most obvious opportunity is to move (b) into `ghc-prim` and use it to
implement (c) (namely `unpackCStringUtf8#` and friends). However, it's not
clear that this would be worthwhile as several of the types supported by (b)
are defined in `base`.
-}

-- We can't write the decoder as efficiently as we'd like without
-- resorting to unboxed extensions, unfortunately.  I tried to write
-- an IO version of this function, but GHC can't eliminate boxed
-- results from an IO-returning function.
--
-- We assume we can ignore overflow when parsing a multibyte character here.
-- To make this safe, we add extra sentinel bytes to unparsed UTF-8 sequences
-- before decoding them (see "GHC.Data.StringBuffer").

{-# INLINE utf8DecodeChar# #-}
-- | Decode a single codepoint from a byte buffer indexed by the given indexing
-- function.
utf8DecodeChar# :: (Int# -> Word#) -> (# Char#, Int# #)
utf8DecodeChar# :: (Int# -> Word#) -> (# Char#, Int# #)
utf8DecodeChar# Int# -> Word#
indexWord8# =
  let !ch0 :: Int#
ch0 = Word# -> Int#
word2Int# (Int# -> Word#
indexWord8# Int#
0#) in
  case () of
    ()
_ | Int# -> Bool
isTrue# (Int#
ch0 Int# -> Int# -> Int#
<=# Int#
0x7F#) -> (# Int# -> Char#
chr# Int#
ch0, Int#
1# #)

      | Int# -> Bool
isTrue# ((Int#
ch0 Int# -> Int# -> Int#
>=# Int#
0xC0#) Int# -> Int# -> Int#
`andI#` (Int#
ch0 Int# -> Int# -> Int#
<=# Int#
0xDF#)) ->
        let !ch1 :: Int#
ch1 = Word# -> Int#
word2Int# (Int# -> Word#
indexWord8# Int#
1#) in
        if Int# -> Bool
isTrue# ((Int#
ch1 Int# -> Int# -> Int#
<# Int#
0x80#) Int# -> Int# -> Int#
`orI#` (Int#
ch1 Int# -> Int# -> Int#
>=# Int#
0xC0#)) then Int# -> (# Char#, Int# #)
fail Int#
1# else
        (# Int# -> Char#
chr# (((Int#
ch0 Int# -> Int# -> Int#
-# Int#
0xC0#) Int# -> Int# -> Int#
`uncheckedIShiftL#` Int#
6#) Int# -> Int# -> Int#
+#
                  (Int#
ch1 Int# -> Int# -> Int#
-# Int#
0x80#)),
           Int#
2# #)

      | Int# -> Bool
isTrue# ((Int#
ch0 Int# -> Int# -> Int#
>=# Int#
0xE0#) Int# -> Int# -> Int#
`andI#` (Int#
ch0 Int# -> Int# -> Int#
<=# Int#
0xEF#)) ->
        let !ch1 :: Int#
ch1 = Word# -> Int#
word2Int# (Int# -> Word#
indexWord8# Int#
1#) in
        if Int# -> Bool
isTrue# ((Int#
ch1 Int# -> Int# -> Int#
<# Int#
0x80#) Int# -> Int# -> Int#
`orI#` (Int#
ch1 Int# -> Int# -> Int#
>=# Int#
0xC0#)) then Int# -> (# Char#, Int# #)
fail Int#
1# else
        let !ch2 :: Int#
ch2 = Word# -> Int#
word2Int# (Int# -> Word#
indexWord8# Int#
2#) in
        if Int# -> Bool
isTrue# ((Int#
ch2 Int# -> Int# -> Int#
<# Int#
0x80#) Int# -> Int# -> Int#
`orI#` (Int#
ch2 Int# -> Int# -> Int#
>=# Int#
0xC0#)) then Int# -> (# Char#, Int# #)
fail Int#
2# else
        (# Int# -> Char#
chr# (((Int#
ch0 Int# -> Int# -> Int#
-# Int#
0xE0#) Int# -> Int# -> Int#
`uncheckedIShiftL#` Int#
12#) Int# -> Int# -> Int#
+#
                 ((Int#
ch1 Int# -> Int# -> Int#
-# Int#
0x80#) Int# -> Int# -> Int#
`uncheckedIShiftL#` Int#
6#)  Int# -> Int# -> Int#
+#
                  (Int#
ch2 Int# -> Int# -> Int#
-# Int#
0x80#)),
           Int#
3# #)

     | Int# -> Bool
isTrue# ((Int#
ch0 Int# -> Int# -> Int#
>=# Int#
0xF0#) Int# -> Int# -> Int#
`andI#` (Int#
ch0 Int# -> Int# -> Int#
<=# Int#
0xF8#)) ->
        let !ch1 :: Int#
ch1 = Word# -> Int#
word2Int# (Int# -> Word#
indexWord8# Int#
1#) in
        if Int# -> Bool
isTrue# ((Int#
ch1 Int# -> Int# -> Int#
<# Int#
0x80#) Int# -> Int# -> Int#
`orI#` (Int#
ch1 Int# -> Int# -> Int#
>=# Int#
0xC0#)) then Int# -> (# Char#, Int# #)
fail Int#
1# else
        let !ch2 :: Int#
ch2 = Word# -> Int#
word2Int# (Int# -> Word#
indexWord8# Int#
2#) in
        if Int# -> Bool
isTrue# ((Int#
ch2 Int# -> Int# -> Int#
<# Int#
0x80#) Int# -> Int# -> Int#
`orI#` (Int#
ch2 Int# -> Int# -> Int#
>=# Int#
0xC0#)) then Int# -> (# Char#, Int# #)
fail Int#
2# else
        let !ch3 :: Int#
ch3 = Word# -> Int#
word2Int# (Int# -> Word#
indexWord8# Int#
3#) in
        if Int# -> Bool
isTrue# ((Int#
ch3 Int# -> Int# -> Int#
<# Int#
0x80#) Int# -> Int# -> Int#
`orI#` (Int#
ch3 Int# -> Int# -> Int#
>=# Int#
0xC0#)) then Int# -> (# Char#, Int# #)
fail Int#
3# else
        (# Int# -> Char#
chr# (((Int#
ch0 Int# -> Int# -> Int#
-# Int#
0xF0#) Int# -> Int# -> Int#
`uncheckedIShiftL#` Int#
18#) Int# -> Int# -> Int#
+#
                 ((Int#
ch1 Int# -> Int# -> Int#
-# Int#
0x80#) Int# -> Int# -> Int#
`uncheckedIShiftL#` Int#
12#) Int# -> Int# -> Int#
+#
                 ((Int#
ch2 Int# -> Int# -> Int#
-# Int#
0x80#) Int# -> Int# -> Int#
`uncheckedIShiftL#` Int#
6#)  Int# -> Int# -> Int#
+#
                  (Int#
ch3 Int# -> Int# -> Int#
-# Int#
0x80#)),
           Int#
4# #)

      | Bool
otherwise -> Int# -> (# Char#, Int# #)
fail Int#
1#
  where
        -- all invalid sequences end up here:
        fail :: Int# -> (# Char#, Int# #)
        fail :: Int# -> (# Char#, Int# #)
fail Int#
nBytes# = (# Char#
'\0'#, Int#
nBytes# #)
        -- '\xFFFD' would be the usual replacement character, but
        -- that's a valid symbol in Haskell, so will result in a
        -- confusing parse error later on.  Instead we use '\0' which
        -- will signal a lexer error immediately.

-- | Decode a single character at the given 'Addr#'.
utf8DecodeCharAddr# :: Addr# -> Int# -> (# Char#, Int# #)
utf8DecodeCharAddr# :: Addr# -> Int# -> (# Char#, Int# #)
utf8DecodeCharAddr# Addr#
a# Int#
off# =
#if !MIN_VERSION_ghc_prim(0,10,0)
    utf8DecodeChar# (\i# -> indexWord8OffAddr# a# (i# +# off#))
#else
    (Int# -> Word#) -> (# Char#, Int# #)
utf8DecodeChar# (\Int#
i# -> Word8# -> Word#
word8ToWord# (Addr# -> Int# -> Word8#
indexWord8OffAddr# Addr#
a# (Int#
i# Int# -> Int# -> Int#
+# Int#
off#)))
#endif

-- | Decode a single codepoint starting at the given 'Ptr'.
utf8DecodeCharPtr :: Ptr Word8 -> (Char, Int)
utf8DecodeCharPtr :: Ptr Word8 -> (Char, Int)
utf8DecodeCharPtr !(Ptr Addr#
a#) =
  case Addr# -> Int# -> (# Char#, Int# #)
utf8DecodeCharAddr# Addr#
a# Int#
0# of
    (# Char#
c#, Int#
nBytes# #) -> ( Char# -> Char
C# Char#
c#, Int# -> Int
I# Int#
nBytes# )

-- | Decode a single codepoint starting at the given byte offset into a
-- 'ByteArray#'.
utf8DecodeCharByteArray# :: ByteArray# -> Int# -> (# Char#, Int# #)
utf8DecodeCharByteArray# :: ByteArray# -> Int# -> (# Char#, Int# #)
utf8DecodeCharByteArray# ByteArray#
ba# Int#
off# =
#if !MIN_VERSION_ghc_prim(0,10,0)
    utf8DecodeChar# (\i# -> indexWord8Array# ba# (i# +# off#))
#else
    (Int# -> Word#) -> (# Char#, Int# #)
utf8DecodeChar# (\Int#
i# -> Word8# -> Word#
word8ToWord# (ByteArray# -> Int# -> Word8#
indexWord8Array# ByteArray#
ba# (Int#
i# Int# -> Int# -> Int#
+# Int#
off#)))
#endif

{-# INLINE utf8Decode# #-}
utf8Decode# :: (IO ()) -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char]
utf8Decode# :: IO () -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char]
utf8Decode# IO ()
retain Int# -> (# Char#, Int# #)
decodeChar# Int#
len#
  = Int# -> IO [Char]
unpack Int#
0#
  where
    unpack :: Int# -> IO [Char]
unpack Int#
i#
        | Int# -> Bool
isTrue# (Int#
i# Int# -> Int# -> Int#
>=# Int#
len#) = IO ()
retain IO () -> IO [Char] -> IO [Char]
forall a b. IO a -> IO b -> IO b
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> [Char] -> IO [Char]
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return []
        | Bool
otherwise =
            case Int# -> (# Char#, Int# #)
decodeChar# Int#
i# of
              (# Char#
c#, Int#
nBytes# #) -> do
                rest <- IO [Char] -> IO [Char]
forall a. IO a -> IO a
unsafeDupableInterleaveIO (IO [Char] -> IO [Char]) -> IO [Char] -> IO [Char]
forall a b. (a -> b) -> a -> b
$ Int# -> IO [Char]
unpack (Int#
i# Int# -> Int# -> Int#
+# Int#
nBytes#)
                return (C# c# : rest)

utf8DecodeForeignPtr :: ForeignPtr Word8 -> Int -> Int -> [Char]
utf8DecodeForeignPtr :: ForeignPtr Word8 -> Int -> Int -> [Char]
utf8DecodeForeignPtr ForeignPtr Word8
fp Int
offset (I# Int#
len#)
  = IO [Char] -> [Char]
forall a. IO a -> a
unsafeDupablePerformIO (IO [Char] -> [Char]) -> IO [Char] -> [Char]
forall a b. (a -> b) -> a -> b
$ do
      let !(Ptr Addr#
a#) = ForeignPtr Word8 -> Ptr Word8
forall a. ForeignPtr a -> Ptr a
unsafeForeignPtrToPtr ForeignPtr Word8
fp Ptr Word8 -> Int -> Ptr Any
forall a b. Ptr a -> Int -> Ptr b
`plusPtr` Int
offset
      IO () -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char]
utf8Decode# (ForeignPtr Word8 -> IO ()
forall a. ForeignPtr a -> IO ()
touchForeignPtr ForeignPtr Word8
fp) (Addr# -> Int# -> (# Char#, Int# #)
utf8DecodeCharAddr# Addr#
a#) Int#
len#
-- Note that since utf8Decode# returns a thunk the lifetime of the
-- ForeignPtr actually needs to be longer than the lexical lifetime
-- withForeignPtr would provide here. That's why we use touchForeignPtr to
-- keep the fp alive until the last character has actually been decoded.

utf8DecodeByteArray# :: ByteArray# -> [Char]
utf8DecodeByteArray# :: ByteArray# -> [Char]
utf8DecodeByteArray# ByteArray#
ba#
  = IO [Char] -> [Char]
forall a. IO a -> a
unsafeDupablePerformIO (IO [Char] -> [Char]) -> IO [Char] -> [Char]
forall a b. (a -> b) -> a -> b
$
      let len# :: Int#
len# = ByteArray# -> Int#
sizeofByteArray# ByteArray#
ba# in
      IO () -> (Int# -> (# Char#, Int# #)) -> Int# -> IO [Char]
utf8Decode# (() -> IO ()
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return ()) (ByteArray# -> Int# -> (# Char#, Int# #)
utf8DecodeCharByteArray# ByteArray#
ba#) Int#
len#

utf8CompareByteArray# :: ByteArray# -> ByteArray# -> Ordering
utf8CompareByteArray# :: ByteArray# -> ByteArray# -> Ordering
utf8CompareByteArray# ByteArray#
a1 ByteArray#
a2 = Int# -> Int# -> Ordering
go Int#
0# Int#
0#
   -- UTF-8 has the property that sorting by bytes values also sorts by
   -- code-points.
   -- BUT we use "Modified UTF-8" which encodes \0 as 0xC080 so this property
   -- doesn't hold and we must explicitly check this case here.
   -- Note that decoding every code point would also work but it would be much
   -- more costly.
   where
       !sz1 :: Int#
sz1 = ByteArray# -> Int#
sizeofByteArray# ByteArray#
a1
       !sz2 :: Int#
sz2 = ByteArray# -> Int#
sizeofByteArray# ByteArray#
a2
       go :: Int# -> Int# -> Ordering
go Int#
off1 Int#
off2
         | Int# -> Bool
isTrue# ((Int#
off1 Int# -> Int# -> Int#
>=# Int#
sz1) Int# -> Int# -> Int#
`andI#` (Int#
off2 Int# -> Int# -> Int#
>=# Int#
sz2)) = Ordering
EQ
         | Int# -> Bool
isTrue# (Int#
off1 Int# -> Int# -> Int#
>=# Int#
sz1)                          = Ordering
LT
         | Int# -> Bool
isTrue# (Int#
off2 Int# -> Int# -> Int#
>=# Int#
sz2)                          = Ordering
GT
         | Bool
otherwise =
#if !MIN_VERSION_ghc_prim(0,10,0)
               let !b1_1 = indexWord8Array# a1 off1
                   !b2_1 = indexWord8Array# a2 off2
#else
               let !b1_1 :: Word#
b1_1 = Word8# -> Word#
word8ToWord# (ByteArray# -> Int# -> Word8#
indexWord8Array# ByteArray#
a1 Int#
off1)
                   !b2_1 :: Word#
b2_1 = Word8# -> Word#
word8ToWord# (ByteArray# -> Int# -> Word8#
indexWord8Array# ByteArray#
a2 Int#
off2)
#endif
               in case Word#
b1_1 of
                  Word#
0xC0## -> case Word#
b2_1 of
                     Word#
0xC0## -> Int# -> Int# -> Ordering
go (Int#
off1 Int# -> Int# -> Int#
+# Int#
1#) (Int#
off2 Int# -> Int# -> Int#
+# Int#
1#)
#if !MIN_VERSION_ghc_prim(0,10,0)
                     _      -> case indexWord8Array# a1 (off1 +# 1#) of
#else
                     Word#
_      -> case Word8# -> Word#
word8ToWord# (ByteArray# -> Int# -> Word8#
indexWord8Array# ByteArray#
a1 (Int#
off1 Int# -> Int# -> Int#
+# Int#
1#)) of
#endif
                        Word#
0x80## -> Ordering
LT
                        Word#
_      -> Int# -> Int# -> Ordering
go (Int#
off1 Int# -> Int# -> Int#
+# Int#
1#) (Int#
off2 Int# -> Int# -> Int#
+# Int#
1#)
                  Word#
_      -> case Word#
b2_1 of
#if !MIN_VERSION_ghc_prim(0,10,0)
                     0xC0## -> case indexWord8Array# a2 (off2 +# 1#) of
#else
                     Word#
0xC0## -> case Word8# -> Word#
word8ToWord# (ByteArray# -> Int# -> Word8#
indexWord8Array# ByteArray#
a2 (Int#
off2 Int# -> Int# -> Int#
+# Int#
1#)) of
#endif
                        Word#
0x80## -> Ordering
GT
                        Word#
_      -> Int# -> Int# -> Ordering
go (Int#
off1 Int# -> Int# -> Int#
+# Int#
1#) (Int#
off2 Int# -> Int# -> Int#
+# Int#
1#)
                     Word#
_   | Int# -> Bool
isTrue# (Word#
b1_1 Word# -> Word# -> Int#
`gtWord#` Word#
b2_1) -> Ordering
GT
                         | Int# -> Bool
isTrue# (Word#
b1_1 Word# -> Word# -> Int#
`ltWord#` Word#
b2_1) -> Ordering
LT
                         | Bool
otherwise                     -> Int# -> Int# -> Ordering
go (Int#
off1 Int# -> Int# -> Int#
+# Int#
1#) (Int#
off2 Int# -> Int# -> Int#
+# Int#
1#)

utf8CountCharsByteArray# :: ByteArray# -> Int
utf8CountCharsByteArray# :: ByteArray# -> Int
utf8CountCharsByteArray# ByteArray#
ba = Int# -> Int# -> Int
go Int#
0# Int#
0#
  where
    len# :: Int#
len# = ByteArray# -> Int#
sizeofByteArray# ByteArray#
ba
    go :: Int# -> Int# -> Int
go Int#
i# Int#
n#
      | Int# -> Bool
isTrue# (Int#
i# Int# -> Int# -> Int#
>=# Int#
len#) = Int# -> Int
I# Int#
n#
      | Bool
otherwise =
          case ByteArray# -> Int# -> (# Char#, Int# #)
utf8DecodeCharByteArray# ByteArray#
ba Int#
i# of
            (# Char#
_, Int#
nBytes# #) -> Int# -> Int# -> Int
go (Int#
i# Int# -> Int# -> Int#
+# Int#
nBytes#) (Int#
n# Int# -> Int# -> Int#
+# Int#
1#)

{-# INLINE utf8EncodeChar #-}
utf8EncodeChar :: (Int# -> Word8# -> State# s -> State# s)
               -> Char -> ST s Int
utf8EncodeChar :: forall s.
(Int# -> Word8# -> State# s -> State# s) -> Char -> ST s Int
utf8EncodeChar Int# -> Word8# -> State# s -> State# s
write# Char
c =
  let x :: Word
x = Int -> Word
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Char -> Int
ord Char
c) in
  case () of
    ()
_ | Word
x Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
> Word
0 Bool -> Bool -> Bool
&& Word
x Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
<= Word
0x007f -> do
          Int -> Word -> ST s ()
write Int
0 Word
x
          Int -> ST s Int
forall a. a -> ST s a
forall (m :: * -> *) a. Monad m => a -> m a
return Int
1
        -- NB. '\0' is encoded as '\xC0\x80', not '\0'.  This is so that we
        -- can have 0-terminated UTF-8 strings (see GHC.Internal.Base.unpackCStringUtf8).
      | Word
x Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
<= Word
0x07ff -> do
          Int -> Word -> ST s ()
write Int
0 (Word
0xC0 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|. ((Word
x Word -> Int -> Word
forall a. Bits a => a -> Int -> a
`shiftR` Int
6) Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0x1F))
          Int -> Word -> ST s ()
write Int
1 (Word
0x80 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|. (Word
x Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0x3F))
          Int -> ST s Int
forall a. a -> ST s a
forall (m :: * -> *) a. Monad m => a -> m a
return Int
2
      | Word
x Word -> Word -> Bool
forall a. Ord a => a -> a -> Bool
<= Word
0xffff -> do
          Int -> Word -> ST s ()
write Int
0 (Word
0xE0 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|. (Word
x Word -> Int -> Word
forall a. Bits a => a -> Int -> a
`shiftR` Int
12) Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0x0F)
          Int -> Word -> ST s ()
write Int
1 (Word
0x80 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|. (Word
x Word -> Int -> Word
forall a. Bits a => a -> Int -> a
`shiftR` Int
6) Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0x3F)
          Int -> Word -> ST s ()
write Int
2 (Word
0x80 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|. (Word
x Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0x3F))
          Int -> ST s Int
forall a. a -> ST s a
forall (m :: * -> *) a. Monad m => a -> m a
return Int
3
      | Bool
otherwise -> do
          Int -> Word -> ST s ()
write Int
0 (Word
0xF0 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|. (Word
x Word -> Int -> Word
forall a. Bits a => a -> Int -> a
`shiftR` Int
18))
          Int -> Word -> ST s ()
write Int
1 (Word
0x80 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|. ((Word
x Word -> Int -> Word
forall a. Bits a => a -> Int -> a
`shiftR` Int
12) Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0x3F))
          Int -> Word -> ST s ()
write Int
2 (Word
0x80 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|. ((Word
x Word -> Int -> Word
forall a. Bits a => a -> Int -> a
`shiftR` Int
6) Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0x3F))
          Int -> Word -> ST s ()
write Int
3 (Word
0x80 Word -> Word -> Word
forall a. Bits a => a -> a -> a
.|. (Word
x Word -> Word -> Word
forall a. Bits a => a -> a -> a
.&. Word
0x3F))
          Int -> ST s Int
forall a. a -> ST s a
forall (m :: * -> *) a. Monad m => a -> m a
return Int
4
  where
    {-# INLINE write #-}
    write :: Int -> Word -> ST s ()
write (I# Int#
off#) (W# Word#
c#) = STRep s () -> ST s ()
forall s a. STRep s a -> ST s a
ST (STRep s () -> ST s ()) -> STRep s () -> ST s ()
forall a b. (a -> b) -> a -> b
$ \State# s
s ->
#if !MIN_VERSION_ghc_prim(0,10,0)
      case write# off# (narrowWord8# c#) s of
#else
      case Int# -> Word8# -> State# s -> State# s
write# Int#
off# (Word# -> Word8#
wordToWord8# Word#
c#) State# s
s of
#endif
        State# s
s -> (# State# s
s, () #)

utf8EncodePtr :: Ptr Word8 -> String -> IO ()
utf8EncodePtr :: Ptr Word8 -> [Char] -> IO ()
utf8EncodePtr (Ptr Addr#
a#) [Char]
str = Addr# -> [Char] -> IO ()
go Addr#
a# [Char]
str
  where go :: Addr# -> [Char] -> IO ()
go !Addr#
_   []   = () -> IO ()
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return ()
        go Addr#
a# (Char
c:[Char]
cs) = do
#if !MIN_VERSION_ghc_prim(0,10,0)
          -- writeWord8OffAddr# was taking a Word#
          I# off# <- stToIO $ utf8EncodeChar (\i w -> writeWord8OffAddr# a# i (extendWord8# w)) c
#else
          I# off# <- ST RealWorld Int -> IO Int
forall a. ST RealWorld a -> IO a
stToIO (ST RealWorld Int -> IO Int) -> ST RealWorld Int -> IO Int
forall a b. (a -> b) -> a -> b
$ (Int# -> Word8# -> State# RealWorld -> State# RealWorld)
-> Char -> ST RealWorld Int
forall s.
(Int# -> Word8# -> State# s -> State# s) -> Char -> ST s Int
utf8EncodeChar (Addr# -> Int# -> Word8# -> State# RealWorld -> State# RealWorld
forall d. Addr# -> Int# -> Word8# -> State# d -> State# d
writeWord8OffAddr# Addr#
a#) Char
c
#endif
          go (a# `plusAddr#` off#) cs

utf8EncodeByteArray# :: String -> ByteArray#
utf8EncodeByteArray# :: [Char] -> ByteArray#
utf8EncodeByteArray# [Char]
str = (State# RealWorld -> ByteArray#) -> ByteArray#
forall o. (State# RealWorld -> o) -> o
runRW# ((State# RealWorld -> ByteArray#) -> ByteArray#)
-> (State# RealWorld -> ByteArray#) -> ByteArray#
forall a b. (a -> b) -> a -> b
$ \State# RealWorld
s ->
  case [Char] -> Int
utf8EncodedLength [Char]
str         of { I# Int#
len# ->
  case Int#
-> State# RealWorld
-> (# State# RealWorld, MutableByteArray# RealWorld #)
forall d. Int# -> State# d -> (# State# d, MutableByteArray# d #)
newByteArray# Int#
len# State# RealWorld
s          of { (# State# RealWorld
s, MutableByteArray# RealWorld
mba# #) ->
  case MutableByteArray# RealWorld -> Int# -> [Char] -> ST RealWorld ()
forall {s}. MutableByteArray# s -> Int# -> [Char] -> ST s ()
go MutableByteArray# RealWorld
mba# Int#
0# [Char]
str                of { ST STRep RealWorld ()
f_go ->
  case STRep RealWorld ()
f_go State# RealWorld
s                        of { (# State# RealWorld
s, () #) ->
  case MutableByteArray# RealWorld
-> State# RealWorld -> (# State# RealWorld, ByteArray# #)
forall d.
MutableByteArray# d -> State# d -> (# State# d, ByteArray# #)
unsafeFreezeByteArray# MutableByteArray# RealWorld
mba# State# RealWorld
s of { (# State# RealWorld
_, ByteArray#
ba# #) ->
  ByteArray#
ba# }}}}}
  where
    go :: MutableByteArray# s -> Int# -> [Char] -> ST s ()
go MutableByteArray# s
_ Int#
_ [] = () -> ST s ()
forall a. a -> ST s a
forall (m :: * -> *) a. Monad m => a -> m a
return ()
    go MutableByteArray# s
mba# Int#
i# (Char
c:[Char]
cs) = do
#if !MIN_VERSION_ghc_prim(0,10,0)
      -- writeWord8Array# was taking a Word#
      I# off# <- utf8EncodeChar (\j# w -> writeWord8Array# mba# (i# +# j#) (extendWord8# w)) c
#else
      I# off# <- (Int# -> Word8# -> State# s -> State# s) -> Char -> ST s Int
forall s.
(Int# -> Word8# -> State# s -> State# s) -> Char -> ST s Int
utf8EncodeChar (\Int#
j# -> MutableByteArray# s -> Int# -> Word8# -> State# s -> State# s
forall d.
MutableByteArray# d -> Int# -> Word8# -> State# d -> State# d
writeWord8Array# MutableByteArray# s
mba# (Int#
i# Int# -> Int# -> Int#
+# Int#
j#)) Char
c
#endif
      go mba# (i# +# off#) cs

utf8EncodedLength :: String -> Int
utf8EncodedLength :: [Char] -> Int
utf8EncodedLength [Char]
str = Int -> [Char] -> Int
forall {t}. Num t => t -> [Char] -> t
go Int
0 [Char]
str
  where go :: t -> [Char] -> t
go !t
n [] = t
n
        go t
n (Char
c:[Char]
cs)
          | Char -> Int
ord Char
c Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
0 Bool -> Bool -> Bool
&& Char -> Int
ord Char
c Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x007f = t -> [Char] -> t
go (t
nt -> t -> t
forall a. Num a => a -> a -> a
+t
1) [Char]
cs
          | Char -> Int
ord Char
c Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x07ff = t -> [Char] -> t
go (t
nt -> t -> t
forall a. Num a => a -> a -> a
+t
2) [Char]
cs
          | Char -> Int
ord Char
c Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0xffff = t -> [Char] -> t
go (t
nt -> t -> t
forall a. Num a => a -> a -> a
+t
3) [Char]
cs
          | Bool
otherwise       = t -> [Char] -> t
go (t
nt -> t -> t
forall a. Num a => a -> a -> a
+t
4) [Char]
cs