{-# LANGUAGE MagicHash, UnboxedTuples #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE UnliftedFFITypes #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE PatternSynonyms #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE CPP #-}
module Std.Data.Text.Base (
Text(..)
, validate
, validateMaybe
, replicate
, cycleN
, indexMaybe, charByteIndex, indexMaybeR, charByteIndexR
, empty, singleton, copy
, pack, packN, packR, packRN
, unpack, unpackR
, fromVector
, toVector
, null
, length
, append
, map', imap'
, foldl', ifoldl'
, foldr', ifoldr'
, concat, concatMap
, count, all, any
, NormalizationResult(..), NormalizeMode(..)
, isNormalized, isNormalizedTo, normalize, normalizeTo
, Locale, localeDefault, localeLithuanian, localeTurkishAndAzeriLatin
, caseFold, caseFoldWith, toLower, toLowerWith, toUpper, toUpperWith, toTitle, toTitleWith
, isCategory, spanCategory
, Category
, categoryLetterUppercase
, categoryLetterLowercase
, categoryLetterTitlecase
, categoryLetterOther
, categoryLetter
, categoryCaseMapped
, categoryMarkNonSpacing
, categoryMarkSpacing
, categoryMarkEnclosing
, categoryMark
, categoryNumberDecimal
, categoryNumberLetter
, categoryNumberOther
, categoryNumber
, categoryPunctuationConnector
, categoryPunctuationDash
, categoryPunctuationOpen
, categoryPunctuationClose
, categoryPunctuationInitial
, categoryPunctuationFinal
, categoryPunctuationOther
, categoryPunctuation
, categorySymbolMath
, categorySymbolCurrency
, categorySymbolModifier
, categorySymbolOther
, categorySymbol
, categorySeparatorSpace
, categorySeparatorLine
, categorySeparatorParagraph
, categorySeparator
, categoryControl
, categoryFormat
, categorySurrogate
, categoryPrivateUse
, categoryUnassigned
, categoryCompatibility
, categoryIgnoreGraphemeCluste
, categoryIscntrl
, categoryIsprint
, categoryIsspace
, categoryIsblank
, categoryIsgraph
, categoryIspunct
, categoryIsalnum
, categoryIsalpha
, categoryIsupper
, categoryIslower
, categoryIsdigit
, categoryIsxdigit
, c_utf8_validate_ba
, c_utf8_validate_addr
) where
import Control.DeepSeq
import Control.Monad.ST
import Control.Monad
import Data.Bits
import Data.Char hiding (toLower, toUpper, toTitle)
import Data.Foldable (foldlM)
import Data.Hashable (Hashable(..))
import qualified Data.List as List
import Data.Monoid (Monoid (..))
import Data.Primitive.PrimArray
import Data.Semigroup (Semigroup ((<>)))
import Data.Typeable
import Data.String (IsString(..))
import Data.Word
import Foreign.C.Types (CSize(..))
import GHC.Exts (build)
import GHC.Ptr
import GHC.Types
import GHC.Stack
import GHC.CString (unpackCString#, unpackCStringUtf8#)
import Std.Data.Array
import Std.Data.Text.UTF8Codec
import Std.Data.Text.UTF8Rewind
import Std.Data.Vector.Base (Bytes, PrimVector(..), c_strlen)
import qualified Std.Data.Vector.Base as V
import qualified Std.Data.Vector.Extra as V
import qualified Std.Data.Vector.Search as V
import Std.Foreign.PrimArray
import System.IO.Unsafe (unsafeDupablePerformIO)
import Prelude hiding (concat, concatMap,
elem, notElem, null, length, map,
foldl, foldl1, foldr, foldr1,
maximum, minimum, product, sum,
all, any, replicate, traverse)
import Test.QuickCheck.Arbitrary (Arbitrary(..), CoArbitrary(..))
newtype Text = Text
{ getUTF8Bytes :: Bytes
} deriving (Semigroup, Monoid)
instance Eq Text where
Text b1 == Text b2 = b1 == b2
{-# INLINE (==) #-}
instance Ord Text where
Text b1 `compare` Text b2 = b1 `compare` b2
{-# INLINE compare #-}
instance Show Text where
showsPrec p t = showsPrec p (unpack t)
instance Read Text where
readsPrec p str = [ (pack x, y) | (x, y) <- readsPrec p str ]
instance NFData Text where
rnf (Text bs) = rnf bs
instance Arbitrary Text where
arbitrary = pack <$> arbitrary
shrink a = pack <$> shrink (unpack a)
instance CoArbitrary Text where
coarbitrary = coarbitrary . unpack
instance Hashable Text where
{-# INLINE hashWithSalt #-}
hashWithSalt salt (Text bs) = hashWithSalt salt bs
instance IsString Text where
{-# INLINE fromString #-}
fromString = pack
packASCIIAddr :: Addr# -> Text
packASCIIAddr addr# = copy addr#
where
len = fromIntegral . unsafeDupablePerformIO $ c_strlen addr#
copy addr# = runST $ do
marr <- newPrimArray len
copyPtrToMutablePrimArray marr 0 (Ptr addr#) len
arr <- unsafeFreezePrimArray marr
return $ Text (PrimVector arr 0 len)
packUTF8Addr :: Addr# -> Text
packUTF8Addr addr# = validateAndCopy addr#
where
len = fromIntegral . unsafeDupablePerformIO $ c_strlen addr#
valid = unsafeDupablePerformIO $ c_utf8_validate_addr addr# len
validateAndCopy addr#
| valid == 0 = packN len (unpackCStringUtf8# addr#)
| otherwise = runST $ do
marr <- newPrimArray len
copyPtrToMutablePrimArray marr 0 (Ptr addr#) len
arr <- unsafeFreezePrimArray marr
return $ Text (PrimVector arr 0 len)
indexMaybe :: Text -> Int -> Maybe Char
{-# INLINABLE indexMaybe #-}
indexMaybe (Text (V.PrimVector ba s l)) n
| n < 0 = Nothing
| otherwise = go s 0
where
!end = s + l
go !i !j
| i >= end = Nothing
| j >= n = let !c = decodeChar_ ba i in Just c
| otherwise =
let l = decodeCharLen ba i in go (i+l) (j+1)
charByteIndex :: Text -> Int -> Int
{-# INLINABLE charByteIndex #-}
charByteIndex (Text (V.PrimVector ba s l)) n
| n < 0 = s
| otherwise = go s 0
where
!end = s + l
go !i !j
| i >= end = i
| j >= n = i
| otherwise =
let l = decodeCharLen ba i in go (i+l) (j+1)
indexMaybeR :: Text -> Int -> Maybe Char
{-# INLINABLE indexMaybeR #-}
indexMaybeR (Text (V.PrimVector ba s l)) n
| n < 0 = Nothing
| otherwise = go (s+l-1) 0
where
go !i !j
| i < s = Nothing
| j >= n = let !c = decodeCharReverse_ ba i in Just c
| otherwise =
let l = decodeCharLenReverse ba i in go (i-l) (j+1)
charByteIndexR :: Text -> Int -> Int
{-# INLINABLE charByteIndexR #-}
charByteIndexR (Text (V.PrimVector ba s l)) n
| n < 0 = s+l
| otherwise = go (s+l-1) 0
where
go !i !j
| i < s = i
| j >= n = i
| otherwise =
let l = decodeCharLenReverse ba i in go (i-l) (j+1)
validate :: HasCallStack => Bytes -> Text
{-# INLINE validate #-}
validate bs@(V.PrimVector (PrimArray ba#) (I# s#) l@(I# l#))
| l == 0 = Text bs
| c_utf8_validate_ba ba# s# l# > 0 = Text bs
| otherwise = error "invalid UTF8 bytes"
validateMaybe :: Bytes -> Maybe Text
{-# INLINE validateMaybe #-}
validateMaybe bs@(V.PrimVector (PrimArray ba#) (I# s#) l@(I# l#))
| l == 0 = Just (Text bs)
| c_utf8_validate_ba ba# s# l# > 0 = Just (Text bs)
| otherwise = Nothing
foreign import ccall unsafe "text.h utf8_validate"
c_utf8_validate_ba :: BA# Word8 -> Int# -> Int# -> Int
foreign import ccall unsafe "text.h utf8_validate_addr"
c_utf8_validate_addr :: Addr# -> Int -> IO Int
pack :: String -> Text
pack = packN V.defaultInitSize
{-# INLINE CONLIKE [0] pack #-}
{-# RULES "pack/packASCIIAddr" forall addr . pack (unpackCString# addr) = packASCIIAddr addr #-}
{-# RULES "pack/packUTF8Addr" forall addr . pack (unpackCStringUtf8# addr) = packUTF8Addr addr #-}
packN :: Int -> String -> Text
{-# INLINE packN #-}
packN n0 = \ ws0 ->
Text (V.create' (max 4 n0) (\ marr -> foldlM go (V.IPair 0 marr) ws0))
where
go :: V.IPair (MutablePrimArray s Word8) -> Char -> ST s (V.IPair (MutablePrimArray s Word8))
go (V.IPair i marr) !c = do
siz <- getSizeofMutablePrimArray marr
if i < siz - 3
then do
i' <- encodeChar marr i c
return (V.IPair i' marr)
else do
let !siz' = siz `shiftL` 1
!marr' <- resizeMutablePrimArray marr siz'
i' <- encodeChar marr' i c
return (V.IPair i' marr')
packR :: String -> Text
{-# INLINE packR #-}
packR = packRN V.defaultInitSize
packRN :: Int -> String -> Text
{-# INLINE packRN #-}
packRN n0 = \ ws0 -> runST (do let n = max 4 n0
marr <- newArr n
(V.IPair i marr') <- foldM go (V.IPair n marr) ws0
ba <- unsafeFreezeArr marr'
return $! Text (V.fromArr ba i (sizeofArr ba-i))
)
where
go :: V.IPair (MutablePrimArray s Word8) -> Char -> ST s (V.IPair (MutablePrimArray s Word8))
go (V.IPair i marr) !c = do
n <- sizeofMutableArr marr
let l = encodeCharLength c
if i >= l
then do encodeChar marr (i-l) c
return (V.IPair (i-l) marr)
else do let !n' = n `shiftL` 1
!marr' <- newArr n'
copyMutableArr marr' (n+i) marr i (n-i)
let i' = n+i-l
encodeChar marr' i' c
return (V.IPair i' marr')
unpack :: Text -> String
{-# INLINE [1] unpack #-}
unpack (Text (V.PrimVector ba s l)) = go s
where
!end = s + l
go !idx
| idx >= end = []
| otherwise = let (# c, i #) = decodeChar ba idx in c : go (idx + i)
unpackFB :: Text -> (Char -> a -> a) -> a -> a
{-# INLINE [0] unpackFB #-}
unpackFB (Text (V.PrimVector ba s l)) k z = go s
where
!end = s + l
go !idx
| idx >= end = z
| otherwise = let (# c, i #) = decodeChar ba idx in c `k` go (idx + i)
{-# RULES
"unpack" [~1] forall t . unpack t = build (\ k z -> unpackFB t k z)
"unpackFB" [1] forall t . unpackFB t (:) [] = unpack t
#-}
unpackR :: Text -> String
{-# INLINE [1] unpackR #-}
unpackR (Text (V.PrimVector ba s l)) = go (s+l-1)
where
go !idx
| idx < s = []
| otherwise = let (# c, i #) = decodeCharReverse ba idx in c : go (idx - i)
unpackRFB :: Text -> (Char -> a -> a) -> a -> a
{-# INLINE [0] unpackRFB #-}
unpackRFB (Text (V.PrimVector ba s l)) k z = go (s+l-1)
where
go !idx
| idx < s = z
| otherwise = let (# c, i #) = decodeCharReverse ba idx in c `k` go (idx - i)
{-# RULES
"unpackR" [~1] forall t . unpackR t = build (\ k z -> unpackRFB t k z)
"unpackRFB" [1] forall t . unpackRFB t (:) [] = unpackR t
#-}
singleton :: Char -> Text
{-# INLINABLE singleton #-}
singleton c = Text $ V.createN 4 $ \ marr -> encodeChar marr 0 c
empty :: Text
{-# INLINABLE empty #-}
empty = Text V.empty
copy :: Text -> Text
{-# INLINE copy #-}
copy (Text bs) = Text (V.copy bs)
append :: Text -> Text -> Text
append ta tb = Text ( getUTF8Bytes ta `V.append` getUTF8Bytes tb )
{-# INLINE append #-}
null :: Text -> Bool
{-# INLINABLE null #-}
null (Text bs) = V.null bs
length :: Text -> Int
{-# INLINABLE length #-}
length (Text (V.PrimVector ba s l)) = go s 0
where
!end = s + l
go !i !acc | i >= end = acc
| otherwise = let j = decodeCharLen ba i in go (i+j) (1+acc)
map' :: (Char -> Char) -> Text -> Text
{-# INLINE map' #-}
map' f (Text (V.PrimVector arr s l)) | l == 0 = empty
| otherwise = Text (V.create' (l+3) (go s 0))
where
end = s + l
go :: Int -> Int -> MutablePrimArray s Word8 -> ST s (V.IPair (MutablePrimArray s Word8))
go !i !j !marr
| i >= end = return (V.IPair j marr)
| otherwise = do
let (# c, d #) = decodeChar arr i
j' <- encodeChar marr j (f c)
let !i' = i + d
siz <- sizeofMutableArr marr
if j' < siz - 3
then go i' j' marr
else do
let !siz' = siz `shiftL` 1
!marr' <- resizeMutablePrimArray marr siz'
go i' j' marr'
imap' :: (Int -> Char -> Char) -> Text -> Text
{-# INLINE imap' #-}
imap' f (Text (V.PrimVector arr s l)) | l == 0 = empty
| otherwise = Text (V.create' (l+3) (go s 0 0))
where
end = s + l
go :: Int -> Int -> Int -> MutablePrimArray s Word8 -> ST s (V.IPair (MutablePrimArray s Word8))
go !i !j !k !marr
| i >= end = return (V.IPair j marr)
| otherwise = do
let (# c, d #) = decodeChar arr i
j' <- encodeChar marr j (f k c)
let !i' = i + d
!k' = k + 1
siz <- sizeofMutableArr marr
if j' < siz - 3
then go i' j' k' marr
else do
let !siz' = siz `shiftL` 1
!marr' <- resizeMutablePrimArray marr siz'
go i' j' k' marr'
foldl' :: (b -> Char -> b) -> b -> Text -> b
{-# INLINE foldl' #-}
foldl' f z (Text (V.PrimVector arr s l)) = go z s
where
!end = s + l
go !acc !i | i < end = case decodeChar arr i of
(# x, d #) -> go (f acc x) (i + d)
| otherwise = acc
ifoldl' :: (b -> Int -> Char -> b) -> b -> Text -> b
{-# INLINE ifoldl' #-}
ifoldl' f z (Text (V.PrimVector arr s l)) = go z s 0
where
!end = s + l
go !acc !i !k | i < end = case decodeChar arr i of
(# x, d #) -> go (f acc k x) (i + d) (k + 1)
| otherwise = acc
foldr' :: (Char -> b -> b) -> b -> Text -> b
{-# INLINE foldr' #-}
foldr' f z (Text (V.PrimVector arr s l)) = go z (s+l-1)
where
go !acc !i | i >= s = case decodeCharReverse arr i of
(# x, d #) -> go (f x acc) (i - d)
| otherwise = acc
ifoldr' :: (Int -> Char -> b -> b) -> b -> Text -> b
{-# INLINE ifoldr' #-}
ifoldr' f z (Text (V.PrimVector arr s l)) = go z (s+l-1) 0
where
go !acc !i !k | i >= s = case decodeCharReverse arr i of
(# x, d #) -> go (f k x acc) (i - d) (k + 1)
| otherwise = acc
concat :: [Text] -> Text
concat = Text . V.concat . coerce
{-# INLINE concat #-}
concatMap :: (Char -> Text) -> Text -> Text
{-# INLINE concatMap #-}
concatMap f = concat . foldr' ((:) . f) []
count :: Char -> Text -> Int
{-# INLINE count #-}
count c (Text v)
| encodeCharLength c == 1 = let w = V.c2w c in V.count w v
| otherwise = let (Text pat) = singleton c
in List.length $ V.indices pat v False
any :: (Char -> Bool) -> Text -> Bool
{-# INLINE any #-}
any f (Text (V.PrimVector arr s l))
| l <= 0 = False
| otherwise = case decodeChar arr s of
(# x0, d #) -> go (f x0) (s+d)
where
!end = s+l
go !acc !i | acc = True
| i >= end = acc
| otherwise = case decodeChar arr i of
(# x, d #) -> go (acc || f x) (i+d)
all :: (Char -> Bool) -> Text -> Bool
{-# INLINE all #-}
all f (Text (V.PrimVector arr s l))
| l <= 0 = True
| otherwise = case decodeChar arr s of
(# x0, d #) -> go (f x0) (s+d)
where
!end = s+l
go !acc !i | not acc = False
| i >= end = acc
| otherwise = case decodeChar arr i of
(# x, d #) -> go (acc && f x) (i+d)
replicate :: Int -> Char -> Text
{-# INLINE replicate #-}
replicate 0 _ = empty
replicate n c = Text (V.create siz (go 0))
where
!csiz = encodeCharLength c
!siz = n * csiz
go :: Int -> MutablePrimArray s Word8 -> ST s ()
go 0 marr = encodeChar marr 0 c >> go csiz marr
go i marr | i >= siz = return ()
| otherwise = do copyChar' csiz marr i marr (i-csiz)
go (i+csiz) marr
cycleN :: Int -> Text -> Text
{-# INLINE cycleN #-}
cycleN 0 _ = empty
cycleN n (Text v) = Text (V.cycleN n v)
fromVector :: V.PrimVector Char -> Text
{-# INLINE fromVector #-}
fromVector (V.PrimVector arr s l) = Text (V.createN l (go s 0))
where
end = s+l
go !i !j !marr
| i >= l = return j
| otherwise = do
let c = indexPrimArray arr i
j' <- encodeChar marr j c
go (i+1) j' marr
toVector :: Text -> V.PrimVector Char
{-# INLINE toVector #-}
toVector (Text (V.PrimVector arr s l)) = V.createN (l*4) (go s 0)
where
end = s+l
go !i !j !marr
| i >= l = return j
| otherwise = do
let (# c, n #) = decodeChar arr i
writePrimArray marr j c
go (i+n) (j+1) marr
isNormalized :: Text -> NormalizationResult
{-# INLINE isNormalized #-}
isNormalized = isNormalizedTo NFC
isNormalizedTo :: NormalizeMode -> Text -> NormalizationResult
isNormalizedTo nmode (Text (V.PrimVector (PrimArray arr#) (I# s#) l@(I# l#)))
| l == 0 = NormalizedYes
| otherwise =
let nflag = normalizeModeToFlag nmode
in toNormalizationResult (utf8_isnormalized arr# s# l# nflag)
normalize :: Text -> Text
{-# INLINE normalize #-}
normalize = normalizeTo NFC
normalizeTo :: NormalizeMode -> Text -> Text
normalizeTo nmode (Text (V.PrimVector (PrimArray arr#) (I# s#) l@(I# l#)))
| l == 0 = empty
| otherwise = unsafeDupablePerformIO $ do
let nflag = normalizeModeToFlag nmode
l'@(I# l'#) = utf8_normalize_length arr# s# l# nflag
when (l' < 0) (error "impossible happened!")
pa@(MutablePrimArray marr#) <- newArr l'
utf8_normalize arr# s# l# marr# l'# nflag
arr' <- unsafeFreezeArr pa
let !v = V.fromArr arr' 0 l'
return (Text v)
foreign import ccall unsafe utf8_isnormalized ::
BA# Word8 -> Int# -> Int# -> CSize -> Int
foreign import ccall unsafe utf8_normalize ::
BA# Word8 -> Int# -> Int# -> MBA# Word8 -> Int# -> CSize -> IO ()
foreign import ccall unsafe utf8_normalize_length ::
BA# Word8 -> Int# -> Int# -> CSize -> Int
caseFold :: Text -> Text
caseFold = caseFoldWith localeDefault
caseFoldWith :: Locale -> Text -> Text
caseFoldWith locale (Text (V.PrimVector (PrimArray arr#) (I# s#) l@(I# l#)))
| l == 0 = empty
| otherwise = unsafeDupablePerformIO $ do
let l'@(I# l'#) = utf8_casefold_length arr# s# l# locale
when (l' < 0) (error "impossible happened!")
pa@(MutablePrimArray marr#) <- newArr l'
utf8_casefold arr# s# l# marr# l'# locale
arr' <- unsafeFreezeArr pa
let !v = V.fromArr arr' 0 l'
return (Text v)
toLower :: Text -> Text
toLower = toLowerWith localeDefault
toLowerWith :: Locale -> Text -> Text
toLowerWith locale (Text (V.PrimVector (PrimArray arr#) (I# s#) l@(I# l#)))
| l == 0 = empty
| otherwise = unsafeDupablePerformIO $ do
let l'@(I# l'#) = utf8_tolower_length arr# s# l# locale
when (l' < 0) (error "impossible happened!")
pa@(MutablePrimArray marr#) <- newArr l'
utf8_tolower arr# s# l# marr# l'# locale
arr' <- unsafeFreezeArr pa
let !v = V.fromArr arr' 0 l'
return (Text v)
toUpper :: Text -> Text
toUpper = toUpperWith localeDefault
toUpperWith :: Locale -> Text -> Text
toUpperWith locale (Text (V.PrimVector (PrimArray arr#) (I# s#) l@(I# l#)))
| l == 0 = empty
| otherwise = unsafeDupablePerformIO $ do
let l'@(I# l'#) = utf8_toupper_length arr# s# l# locale
when (l' < 0) (error "impossible happened!")
pa@(MutablePrimArray marr#) <- newArr l'
utf8_toupper arr# s# l# marr# l'# locale
arr' <- unsafeFreezeArr pa
let !v = V.fromArr arr' 0 l'
return (Text v)
toTitle :: Text -> Text
toTitle = toTitleWith localeDefault
toTitleWith :: Locale -> Text -> Text
toTitleWith locale (Text (V.PrimVector (PrimArray arr#) (I# s#) l@(I# l#)))
| l == 0 = empty
| otherwise = unsafeDupablePerformIO $ do
let l'@(I# l'#) = utf8_totitle_length arr# s# l# locale
when (l' < 0) (error "impossible happened!")
pa@(MutablePrimArray marr#) <- newArr l'
utf8_totitle arr# s# l# marr# l'# locale
arr' <- unsafeFreezeArr pa
let !v = V.fromArr arr' 0 l'
return (Text v)
foreign import ccall unsafe utf8_casefold ::
BA# Word8 -> Int# -> Int# -> MBA# Word8 -> Int# -> Locale -> IO ()
foreign import ccall unsafe utf8_casefold_length ::
BA# Word8 -> Int# -> Int# -> Locale -> Int
foreign import ccall unsafe utf8_tolower ::
BA# Word8 -> Int# -> Int# -> MBA# Word8 -> Int# -> Locale -> IO ()
foreign import ccall unsafe utf8_tolower_length ::
BA# Word8 -> Int# -> Int# -> Locale -> Int
foreign import ccall unsafe utf8_toupper ::
BA# Word8 -> Int# -> Int# -> MBA# Word8 -> Int# -> Locale -> IO ()
foreign import ccall unsafe utf8_toupper_length ::
BA# Word8 -> Int# -> Int# -> Locale -> Int
foreign import ccall unsafe utf8_totitle ::
BA# Word8 -> Int# -> Int# -> MBA# Word8 -> Int# -> Locale -> IO ()
foreign import ccall unsafe utf8_totitle_length ::
BA# Word8 -> Int# -> Int# -> Locale -> Int
isCategory :: Category -> Text -> Bool
isCategory c (Text (V.PrimVector arr@(PrimArray arr#) s@(I# s#) l@(I# l#)))
| l == 0 = True
| otherwise = utf8_iscategory arr# s# l# c == l
spanCategory :: Category -> Text -> (Text, Text)
spanCategory c (Text (V.PrimVector arr@(PrimArray arr#) s@(I# s#) l@(I# l#)))
| l == 0 = (empty, empty)
| otherwise =
let i = utf8_iscategory arr# s# l# c
in (Text (V.PrimVector arr s i), Text (V.PrimVector arr (s+i) (l-i)))
foreign import ccall utf8_iscategory :: BA# Word8 -> Int# -> Int# -> Category -> Int