-- Alfred-Margaret: Fast Aho-Corasick string searching
-- Copyright 2022 Channable
--
-- Licensed under the 3-clause BSD license, see the LICENSE file in the
-- repository root.

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}

-- | This module provides functions that allow treating 'Text' values as series of UTF-8 code units
-- instead of characters. Currently, it also contains a stub 'Text' type which treats its internal byte array
-- as UTF-8 encoded. We use this as a placeholder until we can use @text-2@.
module Data.Text.Utf8
    ( CodePoint
    , CodeUnit
    , CodeUnitIndex (..)
    , Text (..)
    , lengthUtf8
    , lowerCodePoint
    , lowerUtf8
    , toLowerAscii
    , unicode2utf8
    , unpackUtf8
      -- * Decoding
      --
      -- $decoding
    , decode2
    , decode3
    , decode4
    , decodeUtf8
    , stringToByteArray
      -- * Indexing
      --
      -- $indexing
    , indexCodeUnit
    , unsafeIndexCodePoint
    , unsafeIndexCodePoint'
    , unsafeIndexCodeUnit
    , unsafeIndexCodeUnit'
      -- * Slicing Functions
      --
      -- $slicingFunctions
    , unsafeCutUtf8
    , unsafeSliceUtf8
      -- * General Functions
      --
      -- $generalFunctions
    , Data.Text.Utf8.concat
    , Data.Text.Utf8.dropWhile
    , Data.Text.Utf8.null
    , Data.Text.Utf8.readFile
    , Data.Text.Utf8.replicate
    , indices
    , isInfixOf
    , pack
    , unpack
    ) where

import Control.DeepSeq (NFData, rnf)
import Data.Bits (Bits (shiftL), shiftR, (.&.), (.|.))
import Data.Char (ord)
import Data.Foldable (for_)
import Data.Hashable (Hashable (hashWithSalt), hashByteArrayWithSalt)
import Data.Primitive.ByteArray (ByteArray (ByteArray), byteArrayFromList, compareByteArrays,
                                 indexByteArray, newByteArray, sizeofByteArray,
                                 unsafeFreezeByteArray, writeByteArray)
import Data.String (IsString (fromString))
import Data.Word (Word8)
import GHC.Generics (Generic)
import Prelude hiding (length)
#if defined(HAS_AESON)
import Data.Aeson (FromJSON, ToJSON, Value (String), parseJSON, toJSON, withText)
#endif

import qualified Data.ByteString as BS
import qualified Data.Char as Char
import qualified Data.Text as T

-- | A UTF-8 code unit is a byte. A Unicode code point can be encoded as up to four code units.
type CodeUnit = Word8

-- | A Unicode code point.
type CodePoint = Char

-- | An index into the raw UTF-8 data of a `Text`. This is not the code point
-- index as conventionally accepted by `Text`, so we wrap it to avoid confusing
-- the two. Incorrect index manipulation can lead to surrogate pairs being
-- sliced, so manipulate indices with care. This type is also used for lengths.
newtype CodeUnitIndex = CodeUnitIndex
    { CodeUnitIndex -> Int
codeUnitIndex :: Int
    }
    deriving stock (CodeUnitIndex -> CodeUnitIndex -> Bool
(CodeUnitIndex -> CodeUnitIndex -> Bool)
-> (CodeUnitIndex -> CodeUnitIndex -> Bool) -> Eq CodeUnitIndex
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: CodeUnitIndex -> CodeUnitIndex -> Bool
$c/= :: CodeUnitIndex -> CodeUnitIndex -> Bool
== :: CodeUnitIndex -> CodeUnitIndex -> Bool
$c== :: CodeUnitIndex -> CodeUnitIndex -> Bool
Eq, Eq CodeUnitIndex
Eq CodeUnitIndex
-> (CodeUnitIndex -> CodeUnitIndex -> Ordering)
-> (CodeUnitIndex -> CodeUnitIndex -> Bool)
-> (CodeUnitIndex -> CodeUnitIndex -> Bool)
-> (CodeUnitIndex -> CodeUnitIndex -> Bool)
-> (CodeUnitIndex -> CodeUnitIndex -> Bool)
-> (CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex)
-> (CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex)
-> Ord CodeUnitIndex
CodeUnitIndex -> CodeUnitIndex -> Bool
CodeUnitIndex -> CodeUnitIndex -> Ordering
CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
$cmin :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
max :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
$cmax :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
>= :: CodeUnitIndex -> CodeUnitIndex -> Bool
$c>= :: CodeUnitIndex -> CodeUnitIndex -> Bool
> :: CodeUnitIndex -> CodeUnitIndex -> Bool
$c> :: CodeUnitIndex -> CodeUnitIndex -> Bool
<= :: CodeUnitIndex -> CodeUnitIndex -> Bool
$c<= :: CodeUnitIndex -> CodeUnitIndex -> Bool
< :: CodeUnitIndex -> CodeUnitIndex -> Bool
$c< :: CodeUnitIndex -> CodeUnitIndex -> Bool
compare :: CodeUnitIndex -> CodeUnitIndex -> Ordering
$ccompare :: CodeUnitIndex -> CodeUnitIndex -> Ordering
$cp1Ord :: Eq CodeUnitIndex
Ord, Int -> CodeUnitIndex -> ShowS
[CodeUnitIndex] -> ShowS
CodeUnitIndex -> String
(Int -> CodeUnitIndex -> ShowS)
-> (CodeUnitIndex -> String)
-> ([CodeUnitIndex] -> ShowS)
-> Show CodeUnitIndex
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [CodeUnitIndex] -> ShowS
$cshowList :: [CodeUnitIndex] -> ShowS
show :: CodeUnitIndex -> String
$cshow :: CodeUnitIndex -> String
showsPrec :: Int -> CodeUnitIndex -> ShowS
$cshowsPrec :: Int -> CodeUnitIndex -> ShowS
Show, (forall x. CodeUnitIndex -> Rep CodeUnitIndex x)
-> (forall x. Rep CodeUnitIndex x -> CodeUnitIndex)
-> Generic CodeUnitIndex
forall x. Rep CodeUnitIndex x -> CodeUnitIndex
forall x. CodeUnitIndex -> Rep CodeUnitIndex x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
$cto :: forall x. Rep CodeUnitIndex x -> CodeUnitIndex
$cfrom :: forall x. CodeUnitIndex -> Rep CodeUnitIndex x
Generic, CodeUnitIndex
CodeUnitIndex -> CodeUnitIndex -> Bounded CodeUnitIndex
forall a. a -> a -> Bounded a
maxBound :: CodeUnitIndex
$cmaxBound :: CodeUnitIndex
minBound :: CodeUnitIndex
$cminBound :: CodeUnitIndex
Bounded)
#if defined(HAS_AESON)
    deriving newtype (Int -> CodeUnitIndex -> Int
CodeUnitIndex -> Int
(Int -> CodeUnitIndex -> Int)
-> (CodeUnitIndex -> Int) -> Hashable CodeUnitIndex
forall a. (Int -> a -> Int) -> (a -> Int) -> Hashable a
hash :: CodeUnitIndex -> Int
$chash :: CodeUnitIndex -> Int
hashWithSalt :: Int -> CodeUnitIndex -> Int
$chashWithSalt :: Int -> CodeUnitIndex -> Int
Hashable, Integer -> CodeUnitIndex
CodeUnitIndex -> CodeUnitIndex
CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
(CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex)
-> (CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex)
-> (CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex)
-> (CodeUnitIndex -> CodeUnitIndex)
-> (CodeUnitIndex -> CodeUnitIndex)
-> (CodeUnitIndex -> CodeUnitIndex)
-> (Integer -> CodeUnitIndex)
-> Num CodeUnitIndex
forall a.
(a -> a -> a)
-> (a -> a -> a)
-> (a -> a -> a)
-> (a -> a)
-> (a -> a)
-> (a -> a)
-> (Integer -> a)
-> Num a
fromInteger :: Integer -> CodeUnitIndex
$cfromInteger :: Integer -> CodeUnitIndex
signum :: CodeUnitIndex -> CodeUnitIndex
$csignum :: CodeUnitIndex -> CodeUnitIndex
abs :: CodeUnitIndex -> CodeUnitIndex
$cabs :: CodeUnitIndex -> CodeUnitIndex
negate :: CodeUnitIndex -> CodeUnitIndex
$cnegate :: CodeUnitIndex -> CodeUnitIndex
* :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
$c* :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
- :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
$c- :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
+ :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
$c+ :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
Num, CodeUnitIndex -> ()
(CodeUnitIndex -> ()) -> NFData CodeUnitIndex
forall a. (a -> ()) -> NFData a
rnf :: CodeUnitIndex -> ()
$crnf :: CodeUnitIndex -> ()
NFData, Value -> Parser [CodeUnitIndex]
Value -> Parser CodeUnitIndex
(Value -> Parser CodeUnitIndex)
-> (Value -> Parser [CodeUnitIndex]) -> FromJSON CodeUnitIndex
forall a.
(Value -> Parser a) -> (Value -> Parser [a]) -> FromJSON a
parseJSONList :: Value -> Parser [CodeUnitIndex]
$cparseJSONList :: Value -> Parser [CodeUnitIndex]
parseJSON :: Value -> Parser CodeUnitIndex
$cparseJSON :: Value -> Parser CodeUnitIndex
FromJSON, [CodeUnitIndex] -> Encoding
[CodeUnitIndex] -> Value
CodeUnitIndex -> Encoding
CodeUnitIndex -> Value
(CodeUnitIndex -> Value)
-> (CodeUnitIndex -> Encoding)
-> ([CodeUnitIndex] -> Value)
-> ([CodeUnitIndex] -> Encoding)
-> ToJSON CodeUnitIndex
forall a.
(a -> Value)
-> (a -> Encoding)
-> ([a] -> Value)
-> ([a] -> Encoding)
-> ToJSON a
toEncodingList :: [CodeUnitIndex] -> Encoding
$ctoEncodingList :: [CodeUnitIndex] -> Encoding
toJSONList :: [CodeUnitIndex] -> Value
$ctoJSONList :: [CodeUnitIndex] -> Value
toEncoding :: CodeUnitIndex -> Encoding
$ctoEncoding :: CodeUnitIndex -> Encoding
toJSON :: CodeUnitIndex -> Value
$ctoJSON :: CodeUnitIndex -> Value
ToJSON)
#else
    deriving newtype (Hashable, Num, NFData)
#endif

data Text
  -- | A placeholder data type for UTF-8 encoded text until we can use text-2.0.
  = Text
      !ByteArray -- ^ Underlying array encoded using UTF-8.
      !Int -- ^ Starting position of the UTF-8 sequence in bytes.
      !Int -- ^ Length of the UTF-8 sequence in bytes.

-- This instance, as well as the Show instance above, is necessary for the test suite.
instance Eq Text where
  (Text ByteArray
u8data Int
offset Int
length) == :: Text -> Text -> Bool
== (Text ByteArray
u8data' Int
offset' Int
length') =
    Int
length Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
length' Bool -> Bool -> Bool
&& ByteArray -> Int -> ByteArray -> Int -> Int -> Ordering
compareByteArrays ByteArray
u8data Int
offset ByteArray
u8data' Int
offset' Int
length Ordering -> Ordering -> Bool
forall a. Eq a => a -> a -> Bool
== Ordering
EQ

instance Ord Text where
  compare :: Text -> Text -> Ordering
compare (Text ByteArray
u8data Int
offset Int
length) (Text ByteArray
u8data' Int
offset' Int
length') =
    ByteArray -> Int -> ByteArray -> Int -> Int -> Ordering
compareByteArrays ByteArray
u8data Int
offset ByteArray
u8data' Int
offset' (Int -> Int -> Int
forall a. Ord a => a -> a -> a
min Int
length Int
length') Ordering -> Ordering -> Ordering
forall a. Semigroup a => a -> a -> a
<> Int -> Int -> Ordering
forall a. Ord a => a -> a -> Ordering
compare Int
length Int
length'

instance Show Text where
  show :: Text -> String
show = ShowS
forall a. Show a => a -> String
show ShowS -> (Text -> String) -> Text -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> String
unpack

-- Instances required for the Searcher modules etc.

#if defined(HAS_AESON)
-- NOTE: This is ugly and slow but will be removed once we move to text-2.0.
instance ToJSON Text where
  toJSON :: Text -> Value
toJSON = Text -> Value
String (Text -> Value) -> (Text -> Text) -> Text -> Value
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> Text
T.pack (String -> Text) -> (Text -> String) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> String
unpack

instance FromJSON Text where
  parseJSON :: Value -> Parser Text
parseJSON = String -> (Text -> Parser Text) -> Value -> Parser Text
forall a. String -> (Text -> Parser a) -> Value -> Parser a
withText String
"Data.Text.Utf8.Text" (Text -> Parser Text
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Text -> Parser Text) -> (Text -> Text) -> Text -> Parser Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> Text
pack (String -> Text) -> (Text -> String) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> String
T.unpack)
#endif

-- Copied from https://hackage.haskell.org/package/hashable-1.4.0.2/docs/src/Data.Hashable.Class.html#line-746
instance Hashable Text where
  hashWithSalt :: Int -> Text -> Int
hashWithSalt Int
salt (Text (ByteArray ByteArray#
arr) Int
off Int
len) =
    ByteArray# -> Int -> Int -> Int -> Int
hashByteArrayWithSalt ByteArray#
arr (Int
off Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
1) (Int
len Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
1) (Int -> Int -> Int
forall a. Hashable a => Int -> a -> Int
hashWithSalt Int
salt Int
len)

instance NFData Text where
  rnf :: Text -> ()
rnf (Text (ByteArray !ByteArray#
_) !Int
_ !Int
_) = ()

instance IsString Text where
  fromString :: String -> Text
fromString = String -> Text
pack

{-# INLINABLE unpackUtf8 #-}
unpackUtf8 :: Text -> [CodeUnit]
unpackUtf8 :: Text -> [CodeUnit]
unpackUtf8 (Text ByteArray
u8data Int
offset Int
length) =
  let
    go :: Int -> t -> [CodeUnit]
go Int
_ t
0 = []
    go Int
i t
n = ByteArray -> CodeUnitIndex -> CodeUnit
unsafeIndexCodeUnit' ByteArray
u8data (Int -> CodeUnitIndex
CodeUnitIndex Int
i) CodeUnit -> [CodeUnit] -> [CodeUnit]
forall a. a -> [a] -> [a]
: Int -> t -> [CodeUnit]
go (Int
i Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
1) (t
n t -> t -> t
forall a. Num a => a -> a -> a
- t
1)
  in
    Int -> Int -> [CodeUnit]
forall t. (Eq t, Num t) => Int -> t -> [CodeUnit]
go Int
offset Int
length

-- | The return value of this function is not really an index.
-- However the signature is supposed to make it clear that the length is returned in terms of code units, not code points.
lengthUtf8 :: Text -> CodeUnitIndex
lengthUtf8 :: Text -> CodeUnitIndex
lengthUtf8 (Text ByteArray
_ Int
_ !Int
length) = Int -> CodeUnitIndex
CodeUnitIndex Int
length

-- | Convert a 'Text' value into a 'T.Text' value.
toUtf16Text :: Text -> T.Text
toUtf16Text :: Text -> Text
toUtf16Text (Text ByteArray
u8data Int
off Int
len) =
  (CodeUnitIndex -> Maybe (Char, CodeUnitIndex))
-> CodeUnitIndex -> Text
forall a. (a -> Maybe (Char, a)) -> a -> Text
T.unfoldr CodeUnitIndex -> Maybe (Char, CodeUnitIndex)
go CodeUnitIndex
0
  where
    go :: CodeUnitIndex -> Maybe (Char, CodeUnitIndex)
    go :: CodeUnitIndex -> Maybe (Char, CodeUnitIndex)
go CodeUnitIndex
i
      | CodeUnitIndex
i CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Ord a => a -> a -> Bool
>= Int -> CodeUnitIndex
CodeUnitIndex Int
len = Maybe (Char, CodeUnitIndex)
forall a. Maybe a
Nothing
      | Bool
otherwise =
        let
          (CodeUnitIndex
codeUnits, Char
codePoint) = ByteArray -> CodeUnitIndex -> (CodeUnitIndex, Char)
unsafeIndexCodePoint' ByteArray
u8data (CodeUnitIndex -> (CodeUnitIndex, Char))
-> CodeUnitIndex -> (CodeUnitIndex, Char)
forall a b. (a -> b) -> a -> b
$ Int -> CodeUnitIndex
CodeUnitIndex Int
off CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
i
        in
          (Char, CodeUnitIndex) -> Maybe (Char, CodeUnitIndex)
forall a. a -> Maybe a
Just (Char
codePoint, CodeUnitIndex
i CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
codeUnits)

-- | Lower-case the ASCII code points A-Z and leave the rest of ASCII intact.
{-# INLINE toLowerAscii #-}
toLowerAscii :: Char -> Char
toLowerAscii :: Char -> Char
toLowerAscii Char
cp
  | Char -> Bool
Char.isAsciiUpper Char
cp = Int -> Char
Char.chr (Char -> Int
Char.ord Char
cp Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
0x20)
  | Bool
otherwise = Char
cp

-- TODO: Slow placeholder implementation until we can use text-2.0
{-# INLINE lowerUtf8 #-}
lowerUtf8 :: Text -> Text
lowerUtf8 :: Text -> Text
lowerUtf8 = String -> Text
pack (String -> Text) -> (Text -> String) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Char) -> ShowS
forall a b. (a -> b) -> [a] -> [b]
map Char -> Char
lowerCodePoint ShowS -> (Text -> String) -> Text -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> String
unpack

asciiCount :: Int
asciiCount :: Int
asciiCount = Int
128

{-# INLINE lowerCodePoint #-}
-- | Lower-Case a UTF-8 codepoint.
-- Uses 'toLowerAscii' for ASCII and 'Char.toLower' otherwise.
lowerCodePoint :: Char -> Char
lowerCodePoint :: Char -> Char
lowerCodePoint Char
cp
  | Char -> Int
Char.ord Char
cp Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
asciiCount = Char -> Char
toLowerAscii Char
cp
  | Bool
otherwise = Char -> Char
Char.toLower Char
cp

-- | Convert a Unicode Code Point 'c' into a list of UTF-8 code units (bytes).
unicode2utf8 :: (Ord a, Num a, Bits a) => a -> [a]
unicode2utf8 :: a -> [a]
unicode2utf8 a
c
    | a
c a -> a -> Bool
forall a. Ord a => a -> a -> Bool
< a
0x80    = [a
c]
    | a
c a -> a -> Bool
forall a. Ord a => a -> a -> Bool
< a
0x800   = [a
0xc0 a -> a -> a
forall a. Bits a => a -> a -> a
.|. (a
c a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftR` Int
6), a
0x80 a -> a -> a
forall a. Bits a => a -> a -> a
.|. (a
0x3f a -> a -> a
forall a. Bits a => a -> a -> a
.&. a
c)]
    | a
c a -> a -> Bool
forall a. Ord a => a -> a -> Bool
< a
0x10000 = [a
0xe0 a -> a -> a
forall a. Bits a => a -> a -> a
.|. (a
c a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftR` Int
12), a
0x80 a -> a -> a
forall a. Bits a => a -> a -> a
.|. (a
0x3f a -> a -> a
forall a. Bits a => a -> a -> a
.&. (a
c a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftR` Int
6)), a
0x80 a -> a -> a
forall a. Bits a => a -> a -> a
.|. (a
0x3f a -> a -> a
forall a. Bits a => a -> a -> a
.&. a
c)]
    | Bool
otherwise   = [a
0xf0 a -> a -> a
forall a. Bits a => a -> a -> a
.|. (a
c a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftR` Int
18), a
0x80 a -> a -> a
forall a. Bits a => a -> a -> a
.|. (a
0x3f a -> a -> a
forall a. Bits a => a -> a -> a
.&. (a
c a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftR` Int
12)), a
0x80 a -> a -> a
forall a. Bits a => a -> a -> a
.|. (a
0x3f a -> a -> a
forall a. Bits a => a -> a -> a
.&. (a
c a -> Int -> a
forall a. Bits a => a -> Int -> a
`shiftR` Int
6)), a
0x80 a -> a -> a
forall a. Bits a => a -> a -> a
.|. (a
0x3f a -> a -> a
forall a. Bits a => a -> a -> a
.&. a
c)]

-- $decoding
--
-- Functions that turns code unit sequences into code point sequences.

-- | Decode 2 UTF-8 code units into their code point.
-- The given code units should have the following format:
--
-- > ┌───────────────┬───────────────┐
-- > │1 1 0 x x x x x│1 0 x x x x x x│
-- > └───────────────┴───────────────┘
{-# INLINE decode2 #-}
decode2 :: CodeUnit -> CodeUnit -> CodePoint
decode2 :: CodeUnit -> CodeUnit -> Char
decode2 CodeUnit
cu0 CodeUnit
cu1 =
  Int -> Char
Char.chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ (CodeUnit -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu0 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x1f) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
6 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. CodeUnit -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu1 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3f

-- | Decode 3 UTF-8 code units into their code point.
-- The given code units should have the following format:
--
-- > ┌───────────────┬───────────────┬───────────────┐
-- > │1 1 1 0 x x x x│1 0 x x x x x x│1 0 x x x x x x│
-- > └───────────────┴───────────────┴───────────────┘
{-# INLINE decode3 #-}
decode3 :: CodeUnit -> CodeUnit -> CodeUnit -> CodePoint
decode3 :: CodeUnit -> CodeUnit -> CodeUnit -> Char
decode3 CodeUnit
cu0 CodeUnit
cu1 CodeUnit
cu2 =
  Int -> Char
Char.chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ (CodeUnit -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu0 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0xf) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
12 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (CodeUnit -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu1 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3f) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
6 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (CodeUnit -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu2 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3f)

-- | Decode 4 UTF-8 code units into their code point.
-- The given code units should have the following format:
--
-- > ┌───────────────┬───────────────┬───────────────┬───────────────┐
-- > │1 1 1 1 0 x x x│1 0 x x x x x x│1 0 x x x x x x│1 0 x x x x x x│
-- > └───────────────┴───────────────┴───────────────┴───────────────┘
{-# INLINE decode4 #-}
decode4 :: CodeUnit -> CodeUnit -> CodeUnit -> CodeUnit -> CodePoint
decode4 :: CodeUnit -> CodeUnit -> CodeUnit -> CodeUnit -> Char
decode4 CodeUnit
cu0 CodeUnit
cu1 CodeUnit
cu2 CodeUnit
cu3 =
  Int -> Char
Char.chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ (CodeUnit -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu0 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x7) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
18 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (CodeUnit -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu1 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3f) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
12 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (CodeUnit -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu2 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3f) Int -> Int -> Int
forall a. Bits a => a -> Int -> a
`shiftL` Int
6 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.|. (CodeUnit -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu3 Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. Int
0x3f)

-- | Decode a list of UTF-8 code units into a list of code points.
decodeUtf8 :: [CodeUnit] -> [CodePoint]
decodeUtf8 :: [CodeUnit] -> String
decodeUtf8 [] = []
decodeUtf8 (CodeUnit
cu0 : [CodeUnit]
cus) | CodeUnit
cu0 CodeUnit -> CodeUnit -> Bool
forall a. Ord a => a -> a -> Bool
< CodeUnit
0xc0 = Int -> Char
Char.chr (CodeUnit -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu0) Char -> ShowS
forall a. a -> [a] -> [a]
: [CodeUnit] -> String
decodeUtf8 [CodeUnit]
cus
decodeUtf8 (CodeUnit
cu0 : CodeUnit
cu1 : [CodeUnit]
cus) | CodeUnit
cu0 CodeUnit -> CodeUnit -> Bool
forall a. Ord a => a -> a -> Bool
< CodeUnit
0xe0 = CodeUnit -> CodeUnit -> Char
decode2 CodeUnit
cu0 CodeUnit
cu1 Char -> ShowS
forall a. a -> [a] -> [a]
: [CodeUnit] -> String
decodeUtf8 [CodeUnit]
cus
decodeUtf8 (CodeUnit
cu0 : CodeUnit
cu1 : CodeUnit
cu2 : [CodeUnit]
cus) | CodeUnit
cu0 CodeUnit -> CodeUnit -> Bool
forall a. Ord a => a -> a -> Bool
< CodeUnit
0xf0 = CodeUnit -> CodeUnit -> CodeUnit -> Char
decode3 CodeUnit
cu0 CodeUnit
cu1 CodeUnit
cu2 Char -> ShowS
forall a. a -> [a] -> [a]
: [CodeUnit] -> String
decodeUtf8 [CodeUnit]
cus
decodeUtf8 (CodeUnit
cu0 : CodeUnit
cu1 : CodeUnit
cu2 : CodeUnit
cu3 : [CodeUnit]
cus) | CodeUnit
cu0 CodeUnit -> CodeUnit -> Bool
forall a. Ord a => a -> a -> Bool
< CodeUnit
0xf8 = CodeUnit -> CodeUnit -> CodeUnit -> CodeUnit -> Char
decode4 CodeUnit
cu0 CodeUnit
cu1 CodeUnit
cu2 CodeUnit
cu3 Char -> ShowS
forall a. a -> [a] -> [a]
: [CodeUnit] -> String
decodeUtf8 [CodeUnit]
cus
decodeUtf8 [CodeUnit]
cus = ShowS
forall a. HasCallStack => String -> a
error ShowS -> ShowS
forall a b. (a -> b) -> a -> b
$ String
"Invalid UTF-8 input sequence at " String -> ShowS
forall a. [a] -> [a] -> [a]
++ [CodeUnit] -> String
forall a. Show a => a -> String
show (Int -> [CodeUnit] -> [CodeUnit]
forall a. Int -> [a] -> [a]
take Int
4 [CodeUnit]
cus)

stringToByteArray :: String -> ByteArray
stringToByteArray :: String -> ByteArray
stringToByteArray = [CodeUnit] -> ByteArray
forall a. Prim a => [a] -> ByteArray
byteArrayFromList ([CodeUnit] -> ByteArray)
-> (String -> [CodeUnit]) -> String -> ByteArray
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> [CodeUnit]) -> String -> [CodeUnit]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap Char -> [CodeUnit]
char2utf8
        -- See https://en.wikipedia.org/wiki/UTF-8
        where
            char2utf8 :: Char -> [Word8]
            char2utf8 :: Char -> [CodeUnit]
char2utf8 = (Int -> CodeUnit) -> [Int] -> [CodeUnit]
forall a b. (a -> b) -> [a] -> [b]
map Int -> CodeUnit
forall a b. (Integral a, Num b) => a -> b
fromIntegral ([Int] -> [CodeUnit]) -> (Char -> [Int]) -> Char -> [CodeUnit]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> [Int]
forall a. (Ord a, Num a, Bits a) => a -> [a]
unicode2utf8 (Int -> [Int]) -> (Char -> Int) -> Char -> [Int]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Int
ord

-- $indexing
--
-- 'Text' can be indexed by code units or code points.
-- A 'CodePoint' is a 21-bit Unicode code point and can consist of up to four code units.
-- A 'CodeUnit' is a single byte.

-- | Decode a code point at the given 'CodeUnitIndex'.
-- Returns garbage if there is no valid code point at that position.
-- Does not perform bounds checking.
-- See 'decode2', 'decode3' and 'decode4' for the expected format of multi-byte code points.
{-# INLINE unsafeIndexCodePoint' #-}
unsafeIndexCodePoint' :: ByteArray -> CodeUnitIndex -> (CodeUnitIndex, CodePoint)
unsafeIndexCodePoint' :: ByteArray -> CodeUnitIndex -> (CodeUnitIndex, Char)
unsafeIndexCodePoint' !ByteArray
u8data (CodeUnitIndex !Int
idx)
  | CodeUnit
cu0 CodeUnit -> CodeUnit -> Bool
forall a. Ord a => a -> a -> Bool
< CodeUnit
0xc0 = (CodeUnitIndex
1, Int -> Char
Char.chr (Int -> Char) -> Int -> Char
forall a b. (a -> b) -> a -> b
$ CodeUnit -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral CodeUnit
cu0)
  | CodeUnit
cu0 CodeUnit -> CodeUnit -> Bool
forall a. Ord a => a -> a -> Bool
< CodeUnit
0xe0 = (CodeUnitIndex
2, CodeUnit -> CodeUnit -> Char
decode2 CodeUnit
cu0 (Int -> CodeUnit
cuAt Int
1))
  | CodeUnit
cu0 CodeUnit -> CodeUnit -> Bool
forall a. Ord a => a -> a -> Bool
< CodeUnit
0xf0 = (CodeUnitIndex
3, CodeUnit -> CodeUnit -> CodeUnit -> Char
decode3 CodeUnit
cu0 (Int -> CodeUnit
cuAt Int
1) (Int -> CodeUnit
cuAt Int
2))
  | Bool
otherwise = (CodeUnitIndex
4, CodeUnit -> CodeUnit -> CodeUnit -> CodeUnit -> Char
decode4 CodeUnit
cu0 (Int -> CodeUnit
cuAt Int
1) (Int -> CodeUnit
cuAt Int
2) (Int -> CodeUnit
cuAt Int
3))
  where
    cuAt :: Int -> CodeUnit
cuAt !Int
i = ByteArray -> CodeUnitIndex -> CodeUnit
unsafeIndexCodeUnit' ByteArray
u8data (CodeUnitIndex -> CodeUnit) -> CodeUnitIndex -> CodeUnit
forall a b. (a -> b) -> a -> b
$ Int -> CodeUnitIndex
CodeUnitIndex (Int -> CodeUnitIndex) -> Int -> CodeUnitIndex
forall a b. (a -> b) -> a -> b
$ Int
idx Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
i
    !cu0 :: CodeUnit
cu0 = Int -> CodeUnit
cuAt Int
0

-- | Does exactly the same thing as 'unsafeIndexCodePoint'', but on 'Text' values.
{-# INLINE unsafeIndexCodePoint #-}
unsafeIndexCodePoint :: Text -> CodeUnitIndex -> (CodeUnitIndex, CodePoint)
unsafeIndexCodePoint :: Text -> CodeUnitIndex -> (CodeUnitIndex, Char)
unsafeIndexCodePoint (Text !ByteArray
u8data !Int
off !Int
_len) (CodeUnitIndex !Int
index) =
  ByteArray -> CodeUnitIndex -> (CodeUnitIndex, Char)
unsafeIndexCodePoint' ByteArray
u8data (CodeUnitIndex -> (CodeUnitIndex, Char))
-> CodeUnitIndex -> (CodeUnitIndex, Char)
forall a b. (a -> b) -> a -> b
$ Int -> CodeUnitIndex
CodeUnitIndex (Int -> CodeUnitIndex) -> Int -> CodeUnitIndex
forall a b. (a -> b) -> a -> b
$ Int
off Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
index

-- | Get the code unit at the given 'CodeUnitIndex'.
-- Performs bounds checking.
{-# INLINE indexCodeUnit #-}
indexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit
indexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit
indexCodeUnit !Text
text (CodeUnitIndex !Int
index)
  | Int
index Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
0 Bool -> Bool -> Bool
|| Int
index Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= CodeUnitIndex -> Int
codeUnitIndex (Text -> CodeUnitIndex
lengthUtf8 Text
text) = String -> CodeUnit
forall a. HasCallStack => String -> a
error (String -> CodeUnit) -> String -> CodeUnit
forall a b. (a -> b) -> a -> b
$ String
"Index out of bounds " String -> ShowS
forall a. [a] -> [a] -> [a]
++ Int -> String
forall a. Show a => a -> String
show Int
index
  | Bool
otherwise = Text -> CodeUnitIndex -> CodeUnit
unsafeIndexCodeUnit Text
text (CodeUnitIndex -> CodeUnit) -> CodeUnitIndex -> CodeUnit
forall a b. (a -> b) -> a -> b
$ Int -> CodeUnitIndex
CodeUnitIndex Int
index

{-# INLINE unsafeIndexCodeUnit' #-}
unsafeIndexCodeUnit' :: ByteArray -> CodeUnitIndex -> CodeUnit
unsafeIndexCodeUnit' :: ByteArray -> CodeUnitIndex -> CodeUnit
unsafeIndexCodeUnit' !ByteArray
u8data (CodeUnitIndex !Int
idx) = ByteArray -> Int -> CodeUnit
forall a. Prim a => ByteArray -> Int -> a
indexByteArray ByteArray
u8data Int
idx

{-# INLINE unsafeIndexCodeUnit #-}
unsafeIndexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit
unsafeIndexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit
unsafeIndexCodeUnit (Text !ByteArray
u8data !Int
off !Int
_len) (CodeUnitIndex !Int
index) =
  ByteArray -> CodeUnitIndex -> CodeUnit
unsafeIndexCodeUnit' ByteArray
u8data (CodeUnitIndex -> CodeUnit) -> CodeUnitIndex -> CodeUnit
forall a b. (a -> b) -> a -> b
$ Int -> CodeUnitIndex
CodeUnitIndex (Int -> CodeUnitIndex) -> Int -> CodeUnitIndex
forall a b. (a -> b) -> a -> b
$ Int
off Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
index

-- $slicingFunctions
--
-- 'unsafeCutUtf8' and 'unsafeSliceUtf8' are used to retrieve slices of 'Text' values.
-- @unsafeSliceUtf8 begin length@ returns a substring of length @length@ starting at @begin@.
-- @unsafeSliceUtf8 begin length@ returns a tuple of the "surrounding" substrings.
--
-- They satisfy the following property:
--
-- > let (prefix, suffix) = unsafeCutUtf8 begin length t
-- > in concat [prefix, unsafeSliceUtf8 begin length t, suffix] == t
--
-- The following diagram visualizes the relevant offsets for @begin = CodeUnitIndex 2@, @length = CodeUnitIndex 6@ and @t = \"BCDEFGHIJKL\"@.
--
-- >  off                 off+len
-- >   │                     │
-- >   ▼                     ▼
-- > ──┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬──
-- >  A│B│C│D│E│F│G│H│I│J│K│L│M│N
-- > ──┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴──
-- >       ▲           ▲
-- >       │           │
-- >  off+begin   off+begin+length
-- >
-- > unsafeSliceUtf8 begin length t == "DEFGHI"
-- > unsafeCutUtf8 begin length t == ("BC", "JKL")
--
-- The shown array is open at each end because in general, @t@ may be a slice as well.
--
-- __WARNING__: As their name implies, these functions are not (necessarily) bounds-checked. Use at your own risk.

-- TODO: Make this more readable once we have text-2.0.
unsafeCutUtf8 :: CodeUnitIndex -- ^ Starting position of substring.
  -> CodeUnitIndex -- ^ Length of substring.
  -> Text -- ^ Initial string.
  -> (Text, Text)
unsafeCutUtf8 :: CodeUnitIndex -> CodeUnitIndex -> Text -> (Text, Text)
unsafeCutUtf8 (CodeUnitIndex !Int
begin) (CodeUnitIndex !Int
length) (Text !ByteArray
u8data !Int
off !Int
len) =
  ( ByteArray -> Int -> Int -> Text
Text ByteArray
u8data Int
off Int
begin
  , ByteArray -> Int -> Int -> Text
Text ByteArray
u8data (Int
off Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
begin Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
length) (Int
len Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
begin Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
length)
  )

-- TODO: Make this more readable once we have text-2.0.
unsafeSliceUtf8 :: CodeUnitIndex -> CodeUnitIndex -> Text -> Text
unsafeSliceUtf8 :: CodeUnitIndex -> CodeUnitIndex -> Text -> Text
unsafeSliceUtf8 (CodeUnitIndex !Int
begin) (CodeUnitIndex !Int
length) (Text !ByteArray
u8data !Int
off !Int
_len) =
  ByteArray -> Int -> Int -> Text
Text ByteArray
u8data (Int
off Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
begin) Int
length

-- $generalFunctions
--
-- These functions are available in @text@ as well and should be removed once this library moves to @text-2@.
-- You should be able to use these by doing @import qualified Data.Text.Utf8 as Text@ just like you would with @text@.
--
-- NOTE: The 'Text' instances for @Show@, @Eq@, @Ord@, @IsString@, @FromJSON@, @ToJSON@ and @Hashable@ in this file also fall in this category.

-- | TODO: Inefficient placeholder implementation.
concat :: [Text] -> Text
concat :: [Text] -> Text
concat = String -> Text
pack (String -> Text) -> ([Text] -> String) -> [Text] -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Text -> String) -> [Text] -> String
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap Text -> String
unpack

-- | See 'Data.Text.dropWhile'.
dropWhile :: (Char -> Bool) -> Text -> Text
dropWhile :: (Char -> Bool) -> Text -> Text
dropWhile Char -> Bool
predicate Text
text =
  let
    len :: Int
len = CodeUnitIndex -> Int
codeUnitIndex (Text -> CodeUnitIndex
lengthUtf8 Text
text)
    go :: CodeUnitIndex -> CodeUnitIndex
go CodeUnitIndex
i
      | CodeUnitIndex
i CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Ord a => a -> a -> Bool
>= Int -> CodeUnitIndex
CodeUnitIndex Int
len = CodeUnitIndex
i
      | Bool
otherwise =
        let
          (CodeUnitIndex
codeUnits, Char
codePoint) = Text -> CodeUnitIndex -> (CodeUnitIndex, Char)
unsafeIndexCodePoint Text
text CodeUnitIndex
i
        in
          if Char -> Bool
predicate Char
codePoint then
            CodeUnitIndex -> CodeUnitIndex
go (CodeUnitIndex -> CodeUnitIndex) -> CodeUnitIndex -> CodeUnitIndex
forall a b. (a -> b) -> a -> b
$ CodeUnitIndex
i CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
codeUnits
          else
            CodeUnitIndex
i

    prefixEnd :: CodeUnitIndex
prefixEnd = CodeUnitIndex -> CodeUnitIndex
go CodeUnitIndex
0
  in
    CodeUnitIndex -> CodeUnitIndex -> Text -> Text
unsafeSliceUtf8 CodeUnitIndex
prefixEnd (Int -> CodeUnitIndex
CodeUnitIndex Int
len CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
- CodeUnitIndex
prefixEnd) Text
text

-- | Checks whether a text is the empty string.
null :: Text -> Bool
null :: Text -> Bool
null (Text ByteArray
_ Int
_ Int
len) = Int
len Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0

-- | TODO: Inefficient placeholder implementation.
pack :: String -> Text
pack :: String -> Text
pack = ByteArray -> Text
go (ByteArray -> Text) -> (String -> ByteArray) -> String -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> ByteArray
stringToByteArray
  where
    go :: ByteArray -> Text
go !ByteArray
arr = ByteArray -> Int -> Int -> Text
Text ByteArray
arr Int
0 (Int -> Text) -> Int -> Text
forall a b. (a -> b) -> a -> b
$ ByteArray -> Int
sizeofByteArray ByteArray
arr

-- | TODO: Inefficient placeholder implementation.
-- See 'Data.Text.replicate'
replicate :: Int -> Text -> Text
replicate :: Int -> Text -> Text
replicate Int
n = String -> Text
pack (String -> Text) -> (Text -> String) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [String] -> String
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
Prelude.concat ([String] -> String) -> (Text -> [String]) -> Text -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> String -> [String]
forall a. Int -> a -> [a]
Prelude.replicate Int
n (String -> [String]) -> (Text -> String) -> Text -> [String]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> String
unpack

-- | TODO: Inefficient placeholder implementation.
-- This function implements very basic string search. It's @text@ counterpart is 'Data.Text.Internal.Search.indices', which implements the Boyer-Moore algorithm.
-- Since we have this function only to check whether our own Boyer-Moore implementation works, it would not make much sense to implement it using the same algorithm.
-- Once we can use @text-2@, we can compare our implementation to the official @text@ one which presumably works.
indices :: Text -> Text -> [Int]
indices :: Text -> Text -> [Int]
indices Text
needle Text
haystack
  | CodeUnitIndex
needleLen CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Eq a => a -> a -> Bool
== CodeUnitIndex
0 = []
  | Bool
otherwise = CodeUnitIndex -> CodeUnitIndex -> [Int]
go CodeUnitIndex
0 CodeUnitIndex
0
  where
    needleLen :: CodeUnitIndex
needleLen = Text -> CodeUnitIndex
lengthUtf8 Text
needle
    haystackLen :: CodeUnitIndex
haystackLen = Text -> CodeUnitIndex
lengthUtf8 Text
haystack

    go :: CodeUnitIndex -> CodeUnitIndex -> [Int]
go CodeUnitIndex
startIdx CodeUnitIndex
needleIdx
      -- needle is longer than remaining haystack
      | CodeUnitIndex
startIdx CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
needleLen CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Ord a => a -> a -> Bool
> CodeUnitIndex
haystackLen = []
      -- whole needle matched
      | CodeUnitIndex
needleIdx CodeUnitIndex -> CodeUnitIndex -> Bool
forall a. Ord a => a -> a -> Bool
>= CodeUnitIndex
needleLen = CodeUnitIndex -> Int
codeUnitIndex CodeUnitIndex
startIdx Int -> [Int] -> [Int]
forall a. a -> [a] -> [a]
: CodeUnitIndex -> CodeUnitIndex -> [Int]
go (CodeUnitIndex
startIdx CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
needleLen) CodeUnitIndex
0
      -- charachter mismatch
      | Char
needleCp Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/= Char
haystackCp = CodeUnitIndex -> CodeUnitIndex -> [Int]
go (CodeUnitIndex
startIdx CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
1) CodeUnitIndex
0
      -- advance
      | Bool
otherwise = CodeUnitIndex -> CodeUnitIndex -> [Int]
go CodeUnitIndex
startIdx (CodeUnitIndex -> [Int]) -> CodeUnitIndex -> [Int]
forall a b. (a -> b) -> a -> b
$ CodeUnitIndex
needleIdx CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
codeUnits
      where
        (CodeUnitIndex
codeUnits, Char
needleCp) = Text -> CodeUnitIndex -> (CodeUnitIndex, Char)
unsafeIndexCodePoint Text
needle CodeUnitIndex
needleIdx
        (CodeUnitIndex
_, Char
haystackCp) = Text -> CodeUnitIndex -> (CodeUnitIndex, Char)
unsafeIndexCodePoint Text
haystack (CodeUnitIndex -> (CodeUnitIndex, Char))
-> CodeUnitIndex -> (CodeUnitIndex, Char)
forall a b. (a -> b) -> a -> b
$ CodeUnitIndex
startIdx CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex
forall a. Num a => a -> a -> a
+ CodeUnitIndex
needleIdx

-- | TODO: Inefficient placeholder implementation.
isInfixOf :: Text -> Text -> Bool
isInfixOf :: Text -> Text -> Bool
isInfixOf Text
needle Text
haystack = Text -> Text -> Bool
T.isInfixOf (Text -> Text
toUtf16Text Text
needle) (Text -> Text
toUtf16Text Text
haystack)

-- | See 'Data.Text.IO.readFile'.
-- TODO: Uses 'Data.ByteString.readFile' and loops through each byte individually.
-- Use 'Data.Primitive.Ptr.copyPtrToMutableByteArray' here if possible.
readFile :: FilePath -> IO Text
readFile :: String -> IO Text
readFile String
path = do
  ByteString
contents <- String -> IO ByteString
BS.readFile String
path
  MutableByteArray RealWorld
array <- Int -> IO (MutableByteArray (PrimState IO))
forall (m :: * -> *).
PrimMonad m =>
Int -> m (MutableByteArray (PrimState m))
newByteArray (Int -> IO (MutableByteArray (PrimState IO)))
-> Int -> IO (MutableByteArray (PrimState IO))
forall a b. (a -> b) -> a -> b
$ ByteString -> Int
BS.length ByteString
contents
  [Int] -> (Int -> IO ()) -> IO ()
forall (t :: * -> *) (f :: * -> *) a b.
(Foldable t, Applicative f) =>
t a -> (a -> f b) -> f ()
for_ [Int
0..ByteString -> Int
BS.length ByteString
contents Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
1] ((Int -> IO ()) -> IO ()) -> (Int -> IO ()) -> IO ()
forall a b. (a -> b) -> a -> b
$ \Int
i -> do
    MutableByteArray (PrimState IO) -> Int -> CodeUnit -> IO ()
forall a (m :: * -> *).
(Prim a, PrimMonad m) =>
MutableByteArray (PrimState m) -> Int -> a -> m ()
writeByteArray MutableByteArray RealWorld
MutableByteArray (PrimState IO)
array Int
i (CodeUnit -> IO ()) -> CodeUnit -> IO ()
forall a b. (a -> b) -> a -> b
$ HasCallStack => ByteString -> Int -> CodeUnit
ByteString -> Int -> CodeUnit
BS.index ByteString
contents Int
i
  ByteArray
array' <- MutableByteArray (PrimState IO) -> IO ByteArray
forall (m :: * -> *).
PrimMonad m =>
MutableByteArray (PrimState m) -> m ByteArray
unsafeFreezeByteArray MutableByteArray RealWorld
MutableByteArray (PrimState IO)
array
  Text -> IO Text
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Text -> IO Text) -> Text -> IO Text
forall a b. (a -> b) -> a -> b
$ ByteArray -> Int -> Int -> Text
Text ByteArray
array' Int
0 (Int -> Text) -> Int -> Text
forall a b. (a -> b) -> a -> b
$ ByteString -> Int
BS.length ByteString
contents

-- | TODO: Inefficient placeholder implementation.
unpack :: Text -> String
unpack :: Text -> String
unpack = [CodeUnit] -> String
decodeUtf8 ([CodeUnit] -> String) -> (Text -> [CodeUnit]) -> Text -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> [CodeUnit]
unpackUtf8