{-# LANGUAGE CPP, GeneralizedNewtypeDeriving, MagicHash, UnliftedFFITypes, Trustworthy #-}

-- |
-- Module      : Data.Text.Short
-- Copyright   : © Herbert Valerio Riedel 2017
-- License     : BSD3
--
-- Maintainer  : hvr@gnu.org
-- Stability   : stable
--
-- Memory-efficient representation of Unicode text strings.
module Data.Text.Short
    ( -- * The 'ShortText' type
      ShortText

      -- * Basic operations
    , Data.Text.Short.null
    , Data.Text.Short.length
    , Data.Text.Short.isAscii

      -- * Conversions
      -- ** 'String'
    , Data.Text.Short.fromString
    , toString

      -- ** 'T.Text'
    , fromText
    , toText

      -- ** 'BS.ByteString'
    , fromShortByteString
    , toShortByteString

    , fromByteString
    , toByteString

    , toBuilder

    ) where

import           Control.DeepSeq (NFData)
-- import           Control.Exception as E
import qualified Data.ByteString as BS
import           Data.ByteString.Short (ShortByteString)
import qualified Data.ByteString.Short as BSS
import qualified Data.ByteString.Short.Internal as BSSI
import           Data.Char
import           Data.Hashable (Hashable)
import           Data.Semigroup
import qualified Data.String as S
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import           Foreign.C
import           GHC.Exts (ByteArray#)
import qualified GHC.Foreign as GHC
import           GHC.IO.Encoding
import           System.IO.Unsafe
import Data.Binary
import qualified Data.ByteString.Builder as BB

-- | A compact representation of Unicode strings.
--
-- This type relates to 'T.Text' as 'ShortByteString' relates to 'BS.ByteString' by providing a more compact type. Please consult the documentation of "Data.ByteString.Short" for more information.
--
-- Currently, a boxed unshared 'T.Text' has a memory footprint of 6 words (i.e. 48 bytes on 64-bit systems) plus 2 or 4 bytes per code-point (due to the internal UTF-16 representation). Each 'T.Text' value which can share its payload with another 'T.Text' requires only 4 words additionally. Unlike 'BS.ByteString', 'T.Text' use unpinned memory.
--
-- In comparison, the footprint of a boxed 'ShortText' is only 4 words (i.e. 32 bytes on 64-bit systems) plus 1/2/3/4 bytes per code-point (due to the internal UTF-8 representation).
-- It can be shown that for realistic data <http://utf8everywhere.org/#asian UTF-16 has a space overhead of 50% over UTF-8>.
--
newtype ShortText = ShortText ShortByteString
                  deriving (Eq,Ord,Monoid,Semigroup,Hashable,NFData)

instance Show ShortText where
    showsPrec p (ShortText b) = showsPrec p (decodeStringShort' utf8 b)
    show (ShortText b)        = show        (decodeStringShort' utf8 b)

instance Read ShortText where
    readsPrec p = map (\(x,s) -> (ShortText $ encodeStringShort utf8 x,s)) . readsPrec p

-- | Behaviour for @[U+D800 .. U+DFFF]@ matches the 'IsString' instance for 'T.Text'
instance S.IsString ShortText where
    fromString = fromString

-- | The 'Binary' encoding matches the one for 'T.Text'
#if MIN_VERSION_binary(0,8,1)
instance Binary ShortText where
    put = put . toShortByteString
    get = do
        sbs <- get
        case fromShortByteString sbs of
          Nothing -> fail "Binary.get(ShortText): Invalid UTF-8 stream"
          Just st -> return st
#else
-- fallback via 'ByteString' instance
instance Binary ShortText where
    put = put . toByteString
    get = do
        bs <- get
        case fromByteString bs of
          Nothing -> fail "Binary.get(ShortText): Invalid UTF-8 stream"
          Just st -> return st
#endif

-- | /O(1)/ Test whether a 'ShortText' is empty.
null :: ShortText -> Bool
null = BSS.null . toShortByteString

-- | /O(n)/ Count the number of Unicode code-points in a 'ShortText'.
length :: ShortText -> Int
length st = fromIntegral $ unsafePerformIO (c_text_short_length (toByteArray# st) (toCSize st))

foreign import ccall unsafe "hs_text_short_length" c_text_short_length :: ByteArray# -> CSize -> IO CSize

-- | /O(n)/ Test whether 'ShortText' contains only ASCII code-points (i.e. only U+0000 through U+007F).
isAscii :: ShortText -> Bool
isAscii st = (== sz) $ unsafePerformIO (c_text_short_is_ascii (toByteArray# st) sz)
  where
    sz = toCSize st

foreign import ccall unsafe "hs_text_short_is_ascii" c_text_short_is_ascii :: ByteArray# -> CSize -> IO CSize

----------------------------------------------------------------------------

toCSize :: ShortText -> CSize
toCSize = fromIntegral . BSS.length . toShortByteString

toByteArray# :: ShortText -> ByteArray#
toByteArray# (ShortText (BSSI.SBS ba#)) = ba#

-- | /O(0)/ Converts to UTF-8 encoded 'ShortByteString'
--
-- This operation has effectively no overhead, as it's currently merely a @newtype@-cast.
toShortByteString :: ShortText -> ShortByteString
toShortByteString (ShortText b) = b

-- | /O(n)/ Converts to UTF-8 encoded 'BS.ByteString'
toByteString :: ShortText -> BS.ByteString
toByteString = BSS.fromShort . toShortByteString

-- | Construct a 'BB.Builder' that encodes 'ShortText' as UTF-8.
toBuilder :: ShortText -> BB.Builder
toBuilder = BB.shortByteString . toShortByteString

-- | /O(n)/ Convert to 'String'
toString :: ShortText -> String
toString = decodeStringShort' utf8 . toShortByteString

-- | /O(n)/ Convert to 'T.Text'
--
-- This is currently not /O(1)/ because currently 'T.Text' uses UTF-16 as its internal representation.
-- In the event that 'T.Text' will change its internal representation to UTF-8 this operation will become /O(1)/.
toText :: ShortText -> T.Text
toText = T.decodeUtf8 . toByteString

----

-- | /O(n)/ Construct/pack from 'String'
--
-- Note: This function is total because it replaces the (invalid) code-points U+D800 through U+DFFF with the replacement character U+FFFD.
fromString :: String -> ShortText
fromString = ShortText . encodeStringShort utf8 . map r
  where
    r c | 0xd800 <= x && x < 0xe000 = '\xFFFD'
        | otherwise                 = c
      where
        x = ord c

-- | /O(n)/ Construct 'ShortText' from 'T.Text'
--
-- This is currently not /O(1)/ because currently 'T.Text' uses UTF-16 as its internal representation.
-- In the event that 'T.Text' will change its internal representation to UTF-8 this operation will become /O(1)/.
fromText :: T.Text -> ShortText
fromText = fromByteString' . T.encodeUtf8

-- | /O(n)/ Construct 'ShortText' from UTF-8 encoded 'ShortByteString'
--
-- This operation doesn't copy the input 'ShortByteString' but it
-- cannot be /O(1)/ because we need to validate the UTF-8 encoding.
--
-- Returns 'Nothing' in case of invalid UTF-8 encoding.
fromShortByteString :: ShortByteString -> Maybe ShortText
fromShortByteString sbs
  | isValidUtf8 st  = Just st
  | otherwise       = Nothing
  where
    st = ShortText sbs

-- | /O(n)/ Construct 'ShortText' from UTF-8 encoded 'BS.ByteString'
--
-- Returns 'Nothing' in case of invalid UTF-8 encoding.
fromByteString :: BS.ByteString -> Maybe ShortText
fromByteString = fromShortByteString . BSS.toShort

----------------------------------------------------------------------------

-- non-validating
fromByteString' :: BS.ByteString -> ShortText
fromByteString' = ShortText . BSS.toShort



----------------------------------------------------------------------------

encodeString :: TextEncoding -> String -> BS.ByteString
encodeString te str = unsafePerformIO $ GHC.withCStringLen te str BS.packCStringLen

-- decodeString :: TextEncoding -> BS.ByteString -> Maybe String
-- decodeString te bs = cvtEx $ unsafePerformIO $ try $ BS.useAsCStringLen bs (GHC.peekCStringLen te)
--   where
--     cvtEx :: Either IOException a -> Maybe a
--     cvtEx = either (const Nothing) Just

decodeString' :: TextEncoding -> BS.ByteString -> String
decodeString' te bs = unsafePerformIO $ BS.useAsCStringLen bs (GHC.peekCStringLen te)

decodeStringShort' :: TextEncoding -> ShortByteString -> String
decodeStringShort' te = decodeString' te . BSS.fromShort

encodeStringShort :: TextEncoding -> String -> BSS.ShortByteString
encodeStringShort te = BSS.toShort . encodeString te


isValidUtf8 :: ShortText -> Bool
isValidUtf8 st = (==0) $ unsafePerformIO (c_text_short_is_valid_utf8 (toByteArray# st) (toCSize st))

foreign import ccall unsafe "hs_text_short_is_valid_utf8" c_text_short_is_valid_utf8 :: ByteArray# -> CSize -> IO CInt

{- TODO:
{-# RULES "ShortText strlit" forall s . fromString (unpackCString# s) = fromAddr# #-}
...
-}