{-# LANGUAGE MagicHash #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE CPP #-} {-# LANGUAGE UnliftedFFITypes #-} {-# LANGUAGE DeriveDataTypeable #-} {-# LANGUAGE BangPatterns #-} {-| Module : Z.Data.CBytes Description : Null-ternimated byte string. Copyright : (c) Dong Han, 2017-2018 License : BSD Maintainer : winterland1989@gmail.com Stability : experimental Portability : non-portable This module provide 'CBytes' with some useful instances \/ functions, A 'CBytes' is a wrapper for immutable null-terminated string. The main design target of this type is to ease the bridging of C FFI APIs, since most of the unix APIs use null-terminated string. On windows you're encouraged to use a compatibility layer like 'WideCharToMultiByte/MultiByteToWideChar' and keep the same interface, e.g. libuv do this when deal with file paths. We neither guarantee to store length info, nor support O(1) slice for 'CBytes': This will defeat the purpose of null-terminated string which is to save memory, We do save the length if it's created on GHC heap though. If you need advance editing, convert a 'CBytes' to 'V.Bytes' with 'toBytes' and use vector combinators. Use 'fromBytes' to convert it back. It can be used with @OverloadedString@, literal encoding is UTF-8 with some modifications: @\NUL@ char is encoded to 'C0 80', and '\xD800' ~ '\xDFFF' is encoded as a three bytes normal utf-8 codepoint. This is also how ghc compile string literal into binaries, thus we can use rewrite-rules to construct 'CBytes' value in O(1) without wasting runtime heap. Note most of the unix API is not unicode awared though, you may find a `scandir` call return a filename which is not proper encoded in any unicode encoding at all. But still, UTF-8 is recommanded to be used everywhere, and we use UTF-8 assumption in various places, such as displaying 'CBytes' and literals encoding above. -} module Z.Data.CBytes ( CBytes , create , pack , unpack , null , length , empty, append, concat, intercalate, intercalateElem , toBytes, fromBytes, toText, toTextMaybe, fromText , fromCStringMaybe, fromCString, fromCStringN , withCBytes -- helpers re-export , V.w2c, V.c2w -- * exception , NullPointerException(..) ) where import Control.DeepSeq import Control.Exception (Exception, throwIO) import Control.Monad import Control.Monad.Primitive import Control.Monad.ST import Data.Bits import Data.Foldable (foldlM) import Data.Hashable (Hashable(..)) import qualified Data.List as List import Data.String (IsString (..)) import Data.Typeable import Data.Primitive.PrimArray import Data.Word import Foreign.C import Foreign.Storable (peekElemOff) import GHC.CString import GHC.Ptr import GHC.Stack import Prelude hiding (all, any, appendFile, break, concat, concatMap, drop, dropWhile, elem, filter, foldl, foldl1, foldr, foldr1, getContents, getLine, head, init, interact, last, length, lines, map, maximum, minimum, notElem, null, putStr, putStrLn, readFile, replicate, reverse, scanl, scanl1, scanr, scanr1, span, splitAt, tail, take, takeWhile, unlines, unzip, writeFile, zip, zipWith) import Z.Data.Array import qualified Z.Data.Text as T import Z.Data.Text.UTF8Codec (encodeCharModifiedUTF8) import qualified Z.Data.Vector.Base as V import System.IO.Unsafe (unsafeDupablePerformIO) -- | A efficient wrapper for immutable null-terminated string which can be -- automatically freed by ghc garbage collector. -- data CBytes = CBytesOnHeap {-# UNPACK #-} !(PrimArray Word8) -- ^ On heap pinned 'PrimArray' -- there's an invariance that this array's -- length is always shrinked to contain content -- and \NUL terminator | CBytesLiteral {-# UNPACK #-} !CString -- ^ String literals with static address -- | Create a 'CBytes' with IO action. -- -- User only have to do content initialization and return the content length, -- 'create' takes the responsibility to add the '\NUL' ternimator. create :: HasCallStack => Int -- ^ capacity n, including the '\NUL' terminator -> (CString -> IO Int) -- ^ initialization function, -- write the pointer, return the length (<= n-1) -> IO CBytes {-# INLINE create #-} create n fill = do mba <- newPinnedPrimArray n :: IO (MutablePrimArray RealWorld Word8) l <- withMutablePrimArrayContents mba (fill . castPtr) writePrimArray mba l 0 -- the '\NUL' ternimator shrinkMutablePrimArray mba (l+1) CBytesOnHeap <$> unsafeFreezePrimArray mba instance Show CBytes where show = unpack instance Read CBytes where readsPrec p s = [(pack x, r) | (x, r) <- readsPrec p s] instance NFData CBytes where {-# INLINE rnf #-} rnf (CBytesOnHeap _) = () rnf (CBytesLiteral _) = () instance Eq CBytes where {-# INLINE (==) #-} cbyteA == cbyteB = unsafeDupablePerformIO $ withCBytes cbyteA $ \ pA -> withCBytes cbyteB $ \ pB -> if pA == pB then return True else do r <- c_strcmp pA pB return (r == 0) instance Ord CBytes where {-# INLINE compare #-} cbyteA `compare` cbyteB = unsafeDupablePerformIO $ withCBytes cbyteA $ \ pA -> withCBytes cbyteB $ \ pB -> if pA == pB then return EQ else do r <- c_strcmp pA pB return (r `compare` 0) instance Semigroup CBytes where (<>) = append instance Monoid CBytes where {-# INLINE mempty #-} mempty = empty {-# INLINE mappend #-} mappend = append {-# INLINE mconcat #-} mconcat = concat instance Hashable CBytes where hashWithSalt salt (CBytesOnHeap pa@(PrimArray ba#)) = unsafeDupablePerformIO $ do V.c_fnv_hash_ba ba# 0 (sizeofPrimArray pa - 1) salt hashWithSalt salt (CBytesLiteral p@(Ptr addr#)) = unsafeDupablePerformIO $ do len <- c_strlen p V.c_fnv_hash_addr addr# (fromIntegral len) salt append :: CBytes -> CBytes -> CBytes {-# INLINABLE append #-} append strA strB | lenA == 0 = strB | lenB == 0 = strA | otherwise = unsafeDupablePerformIO $ do mpa <- newPinnedPrimArray (lenA+lenB+1) withCBytes strA $ \ pa -> withCBytes strB $ \ pb -> do copyPtrToMutablePrimArray mpa 0 (castPtr pa) lenA copyPtrToMutablePrimArray mpa lenA (castPtr pb) lenB writePrimArray mpa (lenA + lenB) 0 -- the \NUL terminator pa' <- unsafeFreezePrimArray mpa return (CBytesOnHeap pa') where lenA = length strA lenB = length strB empty :: CBytes {-# NOINLINE empty #-} empty = CBytesLiteral (Ptr "\0"#) concat :: [CBytes] -> CBytes {-# INLINABLE concat #-} concat bss = case pre 0 0 bss of (0, _) -> empty (1, _) -> let Just b = List.find (not . null) bss in b -- there must be a not empty CBytes (_, l) -> runST $ do buf <- newPinnedPrimArray (l+1) copy bss 0 buf writePrimArray buf l 0 -- the \NUL terminator CBytesOnHeap <$> unsafeFreezePrimArray buf where -- pre scan to decide if we really need to copy and calculate total length -- we don't accumulate another result list, since it's rare to got empty pre :: Int -> Int -> [CBytes] -> (Int, Int) pre !nacc !lacc [] = (nacc, lacc) pre !nacc !lacc (b:bs) | l <= 0 = pre nacc lacc bs | otherwise = pre (nacc+1) (l + lacc) bs where !l = length b copy :: [CBytes] -> Int -> MutablePrimArray s Word8 -> ST s () copy [] !_ !_ = return () copy (b:bs) !i !mba = do let l = length b when (l /= 0) (case b of CBytesOnHeap ba -> copyPrimArray mba i ba 0 l CBytesLiteral p -> copyPtrToMutablePrimArray mba i (castPtr p) l) copy bs (i+l) mba -- | /O(n)/ The 'intercalate' function takes a 'CBytes' and a list of -- 'CBytes' s and concatenates the list after interspersing the first -- argument between each element of the list. -- -- Note: 'intercalate' will force the entire 'CBytes' list. -- intercalate :: CBytes -> [CBytes] -> CBytes {-# INLINE intercalate #-} intercalate s = concat . List.intersperse s -- | /O(n)/ An efficient way to join 'CByte' s with a byte. -- intercalateElem :: Word8 -> [CBytes] -> CBytes {-# INLINABLE intercalateElem #-} intercalateElem w8 bss = case len bss 0 of 0 -> empty l -> runST $ do buf <- newPinnedPrimArray (l+1) copy bss 0 buf writePrimArray buf l 0 -- the \NUL terminator CBytesOnHeap <$> unsafeFreezePrimArray buf where len [] !acc = acc len [b] !acc = length b + acc len (b:bs) !acc = len bs (acc + length b + 1) copy :: [CBytes] -> Int -> MutablePrimArray s Word8 -> ST s () -- bss must not be empty, which is checked by len above copy (b:bs) !i !mba = do let l = length b when (l /= 0) (case b of CBytesOnHeap ba -> copyPrimArray mba i ba 0 l CBytesLiteral p -> copyPtrToMutablePrimArray mba i (castPtr p) l) case bs of [] -> return () -- last one _ -> do let i' = i + l writePrimArray mba i' w8 copy bs (i'+1) mba instance IsString CBytes where {-# INLINE fromString #-} fromString = pack {-# RULES "CBytes pack/unpackCString#" forall addr# . pack (unpackCString# addr#) = CBytesLiteral (Ptr addr#) #-} {-# RULES "CBytes pack/unpackCStringUtf8#" forall addr# . pack (unpackCStringUtf8# addr#) = CBytesLiteral (Ptr addr#) #-} -- | Pack a 'String' into null-terminated 'CBytes'. -- -- '\NUL' is encoded as two bytes @C0 80@ , '\xD800' ~ '\xDFFF' is encoded as a three bytes normal UTF-8 codepoint. pack :: String -> CBytes {-# INLINE CONLIKE [1] pack #-} pack s = runST $ do mba <- newPinnedPrimArray V.defaultInitSize (SP2 i mba') <- foldlM go (SP2 0 mba) s writePrimArray mba' i 0 -- the \NUL terminator shrinkMutablePrimArray mba' (i+1) ba <- unsafeFreezePrimArray mba' return (CBytesOnHeap ba) where -- It's critical that this function get specialized and unboxed -- Keep an eye on its core! go :: SP2 s -> Char -> ST s (SP2 s) go (SP2 i mba) !c = do siz <- getSizeofMutablePrimArray mba if i < siz - 4 -- we need at least 5 bytes for safety due to extra '\0' byte then do i' <- encodeCharModifiedUTF8 mba i c return (SP2 i' mba) else do let !siz' = siz `shiftL` 1 !mba' <- resizeMutablePrimArray mba siz' i' <- encodeCharModifiedUTF8 mba' i c return (SP2 i' mba') data SP2 s = SP2 {-# UNPACK #-}!Int {-# UNPACK #-}!(MutablePrimArray s Word8) unpack :: CBytes -> String {-# INLINABLE unpack #-} -- TODO: rewrite with our own decoder unpack cbytes = unsafeDupablePerformIO . withCBytes cbytes $ \ (Ptr addr#) -> return (unpackCStringUtf8# addr#) -------------------------------------------------------------------------------- null :: CBytes -> Bool {-# INLINE null #-} null (CBytesOnHeap pa) = indexPrimArray pa 0 == 0 null (CBytesLiteral p) = unsafeDupablePerformIO (peekElemOff p 0) == 0 length :: CBytes -> Int {-# INLINE length #-} length (CBytesOnHeap pa) = sizeofPrimArray pa - 1 length (CBytesLiteral p) = fromIntegral $ unsafeDupablePerformIO (c_strlen p) -- | /O(1)/, (/O(n)/ in case of literal), convert to 'V.Bytes', which can be -- processed by vector combinators. -- -- NOTE: the '\NUL' ternimator is not included. toBytes :: CBytes -> V.Bytes {-# INLINABLE toBytes #-} toBytes cbytes@(CBytesOnHeap pa) = V.PrimVector pa 0 l where l = length cbytes toBytes cbytes@(CBytesLiteral p) = V.create (l+1) (\ mpa -> do copyPtrToMutablePrimArray mpa 0 (castPtr p) l writePrimArray mpa l 0) -- the \NUL terminator where l = length cbytes -- | /O(n)/, convert from 'V.Bytes', allocate pinned memory and -- add the '\NUL' ternimator fromBytes :: V.Bytes -> CBytes {-# INLINABLE fromBytes #-} fromBytes (V.Vec arr s l) = runST (do mpa <- newPinnedPrimArray (l+1) copyPrimArray mpa 0 arr s l writePrimArray mpa l 0 -- the \NUL terminator pa <- unsafeFreezePrimArray mpa return (CBytesOnHeap pa)) -- | /O(n)/, convert to 'T.Text' using UTF8 encoding assumption. -- -- Throw 'T.InvalidUTF8Exception' in case of invalid codepoint. toText :: CBytes -> T.Text {-# INLINABLE toText #-} toText = T.validate . toBytes -- | /O(n)/, convert to 'T.Text' using UTF8 encoding assumption. -- -- Return 'Nothing' in case of invalid codepoint. toTextMaybe :: CBytes -> Maybe T.Text {-# INLINABLE toTextMaybe #-} toTextMaybe = T.validateMaybe . toBytes -- | /O(n)/, convert from 'T.Text', allocate pinned memory and -- add the '\NUL' ternimator fromText :: T.Text -> CBytes {-# INLINABLE fromText #-} fromText = fromBytes . T.getUTF8Bytes -------------------------------------------------------------------------------- -- | Copy a 'CString' type into a 'CBytes', return Nothing if the pointer is NULL. -- -- After copying you're free to free the 'CString' 's memory. -- fromCStringMaybe :: HasCallStack => CString -> IO (Maybe CBytes) {-# INLINABLE fromCStringMaybe #-} fromCStringMaybe cstring = if cstring == nullPtr then return Nothing else do len <- fromIntegral <$> c_strlen cstring mpa <- newPinnedPrimArray (len+1) copyPtrToMutablePrimArray mpa 0 (castPtr cstring) len writePrimArray mpa len 0 -- the \NUL terminator pa <- unsafeFreezePrimArray mpa return (Just (CBytesOnHeap pa)) -- | Same with 'fromCStringMaybe', but throw 'NullPointerException' when meet a null pointer. -- fromCString :: HasCallStack => CString -> IO CBytes {-# INLINABLE fromCString #-} fromCString cstring = do if cstring == nullPtr then throwIO (NullPointerException callStack) else do len <- fromIntegral <$> c_strlen cstring mpa <- newPinnedPrimArray (len+1) copyPtrToMutablePrimArray mpa 0 (castPtr cstring) len writePrimArray mpa len 0 -- the \NUL terminator pa <- unsafeFreezePrimArray mpa return (CBytesOnHeap pa) -- | Same with 'fromCString', but only take N bytes (and append a null byte as terminator). -- fromCStringN :: HasCallStack => CString -> Int -> IO CBytes {-# INLINABLE fromCStringN #-} fromCStringN cstring len = do if cstring == nullPtr then throwIO (NullPointerException callStack) else do mpa <- newPinnedPrimArray (len+1) copyPtrToMutablePrimArray mpa 0 (castPtr cstring) len writePrimArray mpa len 0 -- the \NUL terminator pa <- unsafeFreezePrimArray mpa return (CBytesOnHeap pa) data NullPointerException = NullPointerException CallStack deriving (Show, Typeable) instance Exception NullPointerException -- | Pass 'CBytes' to foreign function as a @const char*@. -- -- Don't pass a forever loop to this function, see . withCBytes :: CBytes -> (CString -> IO a) -> IO a {-# INLINABLE withCBytes #-} withCBytes (CBytesOnHeap pa) f = withPrimArrayContents pa (f . castPtr) withCBytes (CBytesLiteral ptr) f = f ptr -------------------------------------------------------------------------------- c_strcmp :: CString -> CString -> IO CInt {-# INLINE c_strcmp #-} c_strcmp (Ptr a#) (Ptr b#) = V.c_strcmp a# b# c_strlen :: CString -> IO CSize {-# INLINE c_strlen #-} c_strlen (Ptr a#) = V.c_strlen a#