{-|
Module      : Z.Data.CBytes
Description : Null-ternimated byte string.
Copyright   : (c) Dong Han, 2017-2018
License     : BSD
Maintainer  : winterland1989@gmail.com
Stability   : experimental
Portability : non-portable

-- This module provide 'CBytes' with some useful instances \/ tools for retrieving, storing or processing
-- short byte sequences, such as file path, environment variables, etc.

-}

module Z.Data.CBytes
  ( CBytes(CB)
  , toPrimArray
  , pack
  , unpack
  , null , length
  , empty, append, concat, intercalate, intercalateElem
  , toBytes, fromBytes, toText, toTextMaybe, fromText, toBuilder, buildCBytes
  , fromCString, fromCStringN
  , withCBytesUnsafe, withCBytes, allocCBytesUnsafe, allocCBytes
  -- re-export
  , CString
  , V.c2w, V.w2c
  ) where

import           Control.DeepSeq
import           Control.Monad
import           Control.Monad.Primitive
import           Control.Monad.ST
import           Data.Bits
import           Data.Foldable           (foldlM)
import           Data.Hashable           (Hashable(..))
import qualified Data.List               as List
import           Data.Primitive.PrimArray
import           Data.Word
import           Foreign.C.String
import           GHC.Exts
import           GHC.CString
import           GHC.Ptr
import           GHC.Stack
import           Prelude                 hiding (all, any, appendFile, break,
                                          concat, concatMap, drop, dropWhile,
                                          elem, filter, foldl, foldl1, foldr,
                                          foldr1, getContents, getLine, head,
                                          init, interact, last, length, lines,
                                          map, maximum, minimum, notElem, null,
                                          putStr, putStrLn, readFile, replicate,
                                          reverse, scanl, scanl1, scanr, scanr1,
                                          span, splitAt, tail, take, takeWhile,
                                          unlines, unzip, writeFile, zip,
                                          zipWith)
import           Z.Data.Array
import           Z.Data.Array.Unaligned
import qualified Z.Data.Builder        as B
import qualified Z.Data.Text           as T
import qualified Z.Data.Text.ShowT     as T
import           Z.Data.Text.UTF8Codec (encodeCharModifiedUTF8, decodeChar)
import qualified Z.Data.Vector.Base    as V
import           Z.Foreign
import           System.IO.Unsafe        (unsafeDupablePerformIO)
import           Test.QuickCheck.Arbitrary (Arbitrary(..), CoArbitrary(..))

-- | A efficient wrapper for short immutable null-terminated byte sequences which can be
-- automatically freed by ghc garbage collector.
--
-- The main use case of this type is to ease the bridging of C FFI APIs, since most
-- of the unix APIs use null-terminated string. On windows you're encouraged to use a
-- compatibility layer like 'WideCharToMultiByte/MultiByteToWideChar' and keep the same
-- interface, e.g. libuv do this when deal with file paths.
--
-- 'CBytes' don't support O(1) slicing, it's not suitable to use it to store large byte
-- chunk, If you need advance editing, convert 'CBytes' to \/ from 'PrimArray' with 'CB',
-- or 'V.Bytes' with 'toBytes\/fromBytes' if you need O(1) slicing, then use vector combinators.
--
-- When textual represatation is needed(conver to 'String', 'T.Text', 'Show' instance, etc.),
-- we assume 'CBytes' using UTF-8 encodings, 'CBytes' can be used with @OverloadedString@,
-- literal encoding is UTF-8 with some modifications: @\NUL@ is encoded to 'C0 80',
-- and '\xD800' ~ '\xDFFF' is encoded as a three bytes normal utf-8 codepoint.
--
-- Note most of the unix API is not unicode awared though, you may find a `scandir` call
-- return a filename which is not proper encoded in any unicode encoding at all.
-- But still, UTF-8 is recommanded to be used when text represatation is needed.
-- --
data CBytes = CBytes
    {
        -- | Convert to 'PrimArray',
        --
        -- there's an invariance that this array never contains @\NUL@
        toPrimArray :: {-# UNPACK #-} !(PrimArray Word8)
    }

-- | Use this pattern to match or construct 'CBytes', result will be trimmed down to first byte before @\NUL@ byte if there's any.
pattern CB :: PrimArray Word8 -> CBytes
pattern CB arr <- CBytes arr where
    CB arr = fromPrimArray arr

fromPrimArray :: PrimArray Word8 -> CBytes
{-# INLINE fromPrimArray #-}
fromPrimArray arr = runST (
    case V.elemIndex 0 arr of
        Just i -> do
            mpa <- newPrimArray i
            copyPrimArray mpa 0 arr 0 i
            pa <- unsafeFreezePrimArray mpa
            return (CBytes pa)
        _ -> return (CBytes arr))

instance Show CBytes where
    showsPrec p t = showsPrec p (unpack t)

instance Read CBytes where
    readsPrec p s = [(pack x, r) | (x, r) <- readsPrec p s]

instance NFData CBytes where
    {-# INLINE rnf #-}
    rnf (CBytes _) = ()

instance Eq CBytes where
    {-# INLINE (==) #-}
    CBytes ba == CBytes bb = ba == bb

instance Ord CBytes where
    {-# INLINE compare #-}
    CBytes ba `compare` CBytes bb = ba `compare` bb

instance Semigroup CBytes where
    (<>) = append

instance Monoid CBytes where
    {-# INLINE mempty #-}
    mempty  = empty
    {-# INLINE mappend #-}
    mappend = append
    {-# INLINE mconcat #-}
    mconcat = concat

instance Hashable CBytes where
    hashWithSalt salt (CBytes pa@(PrimArray ba#)) = unsafeDupablePerformIO $ do
        V.c_fnv_hash_ba ba# 0 (sizeofPrimArray pa) salt

instance Arbitrary CBytes where
    arbitrary = pack <$> arbitrary
    shrink a = pack <$> shrink (unpack a)

instance CoArbitrary CBytes where
    coarbitrary = coarbitrary . unpack

-- | This instance peek bytes until @\NUL@(or input chunk ends), poke bytes with an extra \NUL terminator.
instance Unaligned CBytes where
    {-# INLINE unalignedSize #-}
    unalignedSize (CBytes arr) = sizeofPrimArray arr + 1
    {-# INLINE peekMBA #-}
    peekMBA mba# i = do
        b <- getSizeofMutableByteArray (MutableByteArray mba#)
        let rest = b-i
        l <- c_memchr mba# i 0 rest
        let l' = if l == -1 then rest else l
        mpa <- newPrimArray l'
        copyMutablePrimArray mpa 0 (MutablePrimArray mba#) i l'
        pa <- unsafeFreezePrimArray mpa
        return (CBytes pa)

    {-# INLINE pokeMBA #-}
    pokeMBA mba# i (CBytes pa) = do
        let l = sizeofPrimArray pa
        copyPrimArray (MutablePrimArray mba# :: MutablePrimArray RealWorld Word8) i pa 0 l
        writePrimArray (MutablePrimArray mba# :: MutablePrimArray RealWorld Word8) (i+l) 0

    {-# INLINE indexBA #-}
    indexBA ba# i = runST (do
        let b = sizeofByteArray (ByteArray ba#)
            rest = b-i
            l = V.c_memchr ba# i 0 rest
            l' = if l == -1 then rest else l
        mpa <- newPrimArray l'
        copyPrimArray mpa 0 (PrimArray ba#) i l'
        pa <- unsafeFreezePrimArray mpa
        return (CBytes pa))

-- | This instance provide UTF8 guarantee, illegal codepoints will be written as 'T.replacementChar's.
instance T.ShowT CBytes where
    {-# INLINE toTextBuilder #-}
    toTextBuilder _ = T.stringUTF8 . show . unpack

append :: CBytes -> CBytes -> CBytes
{-# INLINABLE append #-}
append strA@(CBytes pa) strB@(CBytes pb)
    | lenA == 0 = strB
    | lenB == 0 = strA
    | otherwise = unsafeDupablePerformIO $ do
        mpa <- newPrimArray (lenA+lenB)
        copyPrimArray mpa 0    pa 0 lenA
        copyPrimArray mpa lenA pb 0 lenB
        pa' <- unsafeFreezePrimArray mpa
        return (CBytes pa')
  where
    lenA = length strA
    lenB = length strB

-- | An empty 'CBytes'
empty :: CBytes
{-# NOINLINE empty #-}
empty = CBytes (V.empty)

concat :: [CBytes] -> CBytes
{-# INLINABLE concat #-}
concat bss = case pre 0 0 bss of
    (0, _) -> empty
    (1, _) -> let Just b = List.find (not . null) bss in b -- there must be a not empty CBytes
    (_, l) -> runST $ do
        buf <- newPrimArray l
        copy bss 0 buf
        CBytes <$> unsafeFreezePrimArray buf
  where
    -- pre scan to decide if we really need to copy and calculate total length
    -- we don't accumulate another result list, since it's rare to got empty
    pre :: Int -> Int -> [CBytes] -> (Int, Int)
    pre !nacc !lacc [] = (nacc, lacc)
    pre !nacc !lacc (b:bs)
        | l <= 0 = pre nacc lacc bs
        | otherwise     = pre (nacc+1) (l + lacc) bs
      where !l = length b

    copy :: [CBytes] -> Int -> MutablePrimArray s Word8 -> ST s ()
    copy [] !_ !_       = return ()
    copy (b@(CBytes ba):bs) !i !mba = do
        let l = length b
        when (l /= 0) (copyPrimArray mba i ba 0 l)
        copy bs (i+l) mba

-- | /O(n)/ The 'intercalate' function takes a 'CBytes' and a list of
-- 'CBytes' s and concatenates the list after interspersing the first
-- argument between each element of the list.
--
-- Note: 'intercalate' will force the entire 'CBytes' list.
--
intercalate :: CBytes -> [CBytes] -> CBytes
{-# INLINE intercalate #-}
intercalate s = concat . List.intersperse s

-- | /O(n)/ An efficient way to join 'CByte' s with a byte.
--
-- Intercalate bytes list with @\NUL@ will effectively leave the first bytes in the list.
intercalateElem :: Word8 -> [CBytes] -> CBytes
{-# INLINABLE intercalateElem #-}
intercalateElem 0 [] = empty
intercalateElem 0 (bs:_) = bs
intercalateElem w8 bss = case len bss 0 of
    0 -> empty
    l -> runST $ do
        buf <- newPrimArray l
        copy bss 0 buf
        CBytes <$> unsafeFreezePrimArray buf
  where
    len []     !acc = acc
    len [b]    !acc = length b + acc
    len (b:bs) !acc = len bs (acc + length b + 1)
    copy :: [CBytes] -> Int -> MutablePrimArray s Word8 -> ST s ()
    -- bss must not be empty, which is checked by len above
    copy (b@(CBytes ba):bs) !i !mba = do
        let l = length b
        when (l /= 0) (copyPrimArray mba i ba 0 l)
        case bs of
            [] -> return () -- last one
            _  -> do
                let i' = i + l
                writePrimArray mba i' w8
                copy bs (i'+1) mba

instance IsString CBytes where
    {-# INLINE fromString #-}
    fromString = pack

{-# RULES
    "CBytes pack/unpackCString#" forall addr# .
        pack (unpackCString# addr#) = packAddr addr#
 #-}
{-# RULES
    "CBytes pack/unpackCStringUtf8#" forall addr# .
        pack (unpackCStringUtf8# addr#) = packAddr addr#
 #-}

packAddr :: Addr# -> CBytes
packAddr addr0# = go addr0#
  where
    len = (fromIntegral . unsafeDupablePerformIO $ V.c_strlen addr0#)
    go addr# = runST $ do
        marr <- newPrimArray len
        copyPtrToMutablePrimArray marr 0 (Ptr addr#) len
        arr <- unsafeFreezePrimArray marr
        return (CBytes arr)

-- | Pack a 'String' into 'CBytes'.
--
-- @\NUL@ is encoded as two bytes @C0 80@ , '\xD800' ~ '\xDFFF' is encoded as a three bytes normal UTF-8 codepoint.
pack :: String -> CBytes
{-# INLINE CONLIKE [1] pack #-}
pack s = runST $ do
    mba <- newPrimArray V.defaultInitSize
    (SP2 i mba') <- foldlM go (SP2 0 mba) s
    shrinkMutablePrimArray mba' i
    ba <- unsafeFreezePrimArray mba'
    return (CBytes ba)
  where
    -- It's critical that this function get specialized and unboxed
    -- Keep an eye on its core!
    go :: SP2 s -> Char -> ST s (SP2 s)
    go (SP2 i mba) !c     = do
        siz <- getSizeofMutablePrimArray mba
        if i < siz - 3  -- we need at least 4 bytes for safety
        then do
            i' <- encodeCharModifiedUTF8 mba i c
            return (SP2 i' mba)
        else do
            let !siz' = siz `shiftL` 1
            !mba' <- resizeMutablePrimArray mba siz'
            i' <- encodeCharModifiedUTF8 mba' i c
            return (SP2 i' mba')


data SP2 s = SP2 {-# UNPACK #-}!Int {-# UNPACK #-}!(MutablePrimArray s Word8)

-- | /O(n)/ Convert cbytes to a char list using UTF8 encoding assumption.
--
-- This function is much tolerant than 'toText', it simply decoding codepoints using UTF8 'decodeChar'
-- without checking errors such as overlong or invalid range.
--
-- Unpacking is done lazily. i.e. we will retain reference to the array until all element are consumed.
--
-- This function is a /good producer/ in the sense of build/foldr fusion.
unpack :: CBytes -> String
{-# INLINE [1] unpack #-}
unpack (CBytes arr) = go 0
  where
    !end = sizeofPrimArray arr
    go !idx
        | idx >= end = []
        | otherwise = let (# c, i #) = decodeChar arr idx in c : go (idx + i)

unpackFB :: CBytes -> (Char -> a -> a) -> a -> a
{-# INLINE [0] unpackFB #-}
unpackFB (CBytes arr) k z = go 0
  where
    !end = sizeofPrimArray arr
    go !idx
        | idx >= end = z
        | otherwise = let (# c, i #) = decodeChar arr idx in c `k` go (idx + i)

{-# RULES
"unpack" [~1] forall t . unpack t = build (\ k z -> unpackFB t k z)
"unpackFB" [1] forall t . unpackFB t (:) [] = unpack t
 #-}

--------------------------------------------------------------------------------

-- | Return 'True' if 'CBytes' is empty.
--
null :: CBytes -> Bool
{-# INLINE null #-}
null (CBytes pa) = sizeofPrimArray pa == 0

-- | Return the BTYE length of 'CBytes'.
--
length :: CBytes -> Int
{-# INLINE length #-}
length (CBytes pa) = sizeofPrimArray pa

-- | /O(1)/, convert to 'V.Bytes', which can be processed by vector combinators.
toBytes :: CBytes -> V.Bytes
{-# INLINABLE toBytes #-}
toBytes (CBytes arr) = V.PrimVector arr 0 (sizeofPrimArray arr)

-- | /O(n)/, convert from 'V.Bytes'
--
-- Result will be trimmed down to first byte before @\NUL@ byte if there's any.
fromBytes :: V.Bytes -> CBytes
{-# INLINABLE fromBytes #-}
fromBytes v@(V.PrimVector arr s l) = runST (do
    case V.elemIndex 0 v of
        Just i -> do
            mpa <- newPrimArray i
            copyPrimArray mpa 0 arr s i
            pa <- unsafeFreezePrimArray mpa
            return (CBytes pa)
        _ | s == 0 && sizeofPrimArray arr == l -> return (CBytes arr)
          | otherwise -> do
                mpa <- newPrimArray l
                copyPrimArray mpa 0 arr s l
                pa <- unsafeFreezePrimArray mpa
                return (CBytes pa))

-- | /O(n)/, convert to 'T.Text' using UTF8 encoding assumption.
--
-- Throw 'T.InvalidUTF8Exception' in case of invalid codepoint.
toText :: CBytes -> T.Text
{-# INLINABLE toText #-}
toText = T.validate . toBytes

-- | /O(n)/, convert to 'T.Text' using UTF8 encoding assumption.
--
-- Return 'Nothing' in case of invalid codepoint.
toTextMaybe :: CBytes -> Maybe T.Text
{-# INLINABLE toTextMaybe #-}
toTextMaybe = T.validateMaybe . toBytes

-- | /O(n)/, convert from 'T.Text',
--
-- Result will be trimmed down to first byte before @\NUL@ byte if there's any.
fromText :: T.Text -> CBytes
{-# INLINABLE fromText #-}
fromText = fromBytes . T.getUTF8Bytes


-- | Write 'CBytes' \'s byte sequence to buffer.
--
-- This function is different from 'ShowT' instance in that it directly write byte sequence without
-- checking if it's UTF8 encoded.
toBuilder :: CBytes -> B.Builder ()
toBuilder = B.bytes . toBytes

-- | Build a 'CBytes' with builder, result will be trimmed down to first byte before @\NUL@ byte if there's any.
buildCBytes :: B.Builder a -> CBytes
buildCBytes = fromBytes . B.buildBytes

--------------------------------------------------------------------------------

-- | Copy a 'CString' type into a 'CBytes', return 'empty' if the pointer is NULL.
--
--  After copying you're free to free the 'CString' 's memory.
fromCString :: CString -> IO CBytes
{-# INLINABLE fromCString #-}
fromCString cstring = do
    if cstring == nullPtr
    then return empty
    else do
        len <- fromIntegral <$> c_strlen_ptr cstring
        mpa <- newPrimArray len
        copyPtrToMutablePrimArray mpa 0 (castPtr cstring) len
        pa <- unsafeFreezePrimArray mpa
        return (CBytes pa)

-- | Same with 'fromCString', but only take at most N bytes.
--
-- Result will be trimmed down to first byte before @\NUL@ byte if there's any.
fromCStringN :: CString -> Int -> IO CBytes
{-# INLINABLE fromCStringN #-}
fromCStringN cstring len0 = do
    if cstring == nullPtr || len0 == 0
    then return empty
    else do
        len1 <- fromIntegral <$> c_strlen_ptr cstring
        let len = min len0 len1
        mpa <- newPrimArray len
        copyPtrToMutablePrimArray mpa 0 (castPtr cstring) len
        pa <- unsafeFreezePrimArray mpa
        return (CBytes pa)

-- | Pass 'CBytes' to foreign function as a @const char*@.
--
-- USE THIS FUNCTION WITH UNSAFE FFI CALL ONLY.
withCBytesUnsafe :: CBytes -> (BA# Word8 -> IO a) -> IO a
{-# INLINABLE withCBytesUnsafe #-}
withCBytesUnsafe (CBytes pa) f = do
    let l = sizeofPrimArray pa
    mpa <- newPrimArray (l+1)
    copyPrimArray mpa 0 pa 0 l
    writePrimArray mpa l 0
    pa' <- unsafeFreezePrimArray mpa
    withPrimArrayUnsafe pa' (\ p _ -> f p)

-- | Pass 'CBytes' to foreign function as a @const char*@.
--
-- Don't pass a forever loop to this function, see <https://ghc.haskell.org/trac/ghc/ticket/14346 #14346>.
withCBytes :: CBytes -> (Ptr Word8 -> IO a) -> IO a
{-# INLINABLE withCBytes #-}
withCBytes (CBytes pa) f = do
    let l = sizeofPrimArray pa
    mpa <- newPinnedPrimArray (l+1)
    copyPrimArray mpa 0 pa 0 l
    writePrimArray mpa l 0
    pa' <- unsafeFreezePrimArray mpa
    withPrimArraySafe pa' (\ p _ -> f p)

-- | Create a 'CBytes' with IO action.
--
-- If (<=0) capacity is provided, a pointer pointing to @\NUL@ is passed to initialize function
-- and 'empty' will be returned. This behavior is different from 'allocCBytes', which may cause
-- trouble for some FFI functions.
--
-- USE THIS FUNCTION WITH UNSAFE FFI CALL ONLY.
allocCBytesUnsafe :: HasCallStack
                  => Int                   -- ^ capacity n(include the @\NUL@ terminator)
                  -> (MBA# Word8 -> IO a)  -- ^ initialization function,
                  -> IO (CBytes, a)
{-# INLINABLE allocCBytesUnsafe #-}
allocCBytesUnsafe n fill | n <= 0 = withPrimUnsafe (0::Word8) fill >>=
                                        \ (_, b) -> return (empty, b)
                         | otherwise = do
    mba@(MutablePrimArray mba#) <- newPrimArray n :: IO (MutablePrimArray RealWorld Word8)
    a <- fill mba#
    l <- fromIntegral <$> (c_memchr mba# 0 0 n)
    shrinkMutablePrimArray mba (if l == -1 then n else l)
    bs <- unsafeFreezePrimArray mba
    return (CBytes bs, a)


-- | Create a 'CBytes' with IO action.
--
-- If (<=0) capacity is provided, a 'nullPtr' is passed to initialize function and
-- 'empty' will be returned. Other than that, User have to make sure a @\NUL@ ternimated
-- string will be written.
allocCBytes :: HasCallStack
            => Int                -- ^ capacity n(include the @\NUL@ terminator)
            -> (CString -> IO a)  -- ^ initialization function,
            -> IO (CBytes, a)
{-# INLINABLE allocCBytes #-}
allocCBytes n fill | n <= 0 = fill nullPtr >>= \ a -> return (empty, a)
                   | otherwise = do
    mba@(MutablePrimArray mba#) <- newPinnedPrimArray n :: IO (MutablePrimArray RealWorld Word8)
    a <- withMutablePrimArrayContents mba (fill . castPtr)
    l <- fromIntegral <$> (c_memchr mba# 0 0 n)
    shrinkMutablePrimArray mba (if l == -1 then n else l)
    bs <- unsafeFreezePrimArray mba
    return (CBytes bs, a)

--------------------------------------------------------------------------------

c_strlen_ptr :: CString -> IO CSize
{-# INLINE c_strlen_ptr #-}
c_strlen_ptr (Ptr a#) = V.c_strlen a#

-- HsInt hs_memchr(uint8_t *a, HsInt aoff, uint8_t b, HsInt n);
foreign import ccall unsafe "hs_memchr" c_memchr :: MBA# Word8 -> Int -> Word8 -> Int -> IO Int