{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE LambdaCase #-}
{-# OPTIONS_HADDOCK show-extensions #-}





-- | Internals, exposed mostly for potential use by testsuites and benchmarks.
--
-- __Not recommended to be used from within other independent libraries.__
module Data.Binary.Typed.Internal (

      -- * 'Typed'
        Typed(..)
      , Typed'(..)
      , TypeInformation(..)
      , Hash5(..)
      , mkHash5
      , Hash32(..)
      , Hash64(..)
      , typed
      , makeTypeInformation
      , TypeFormat(..)
      , getFormat
      , typecheck
      , typecheck'
      , erase
      , preserialize

      -- * 'TypeRep'
      , TypeRep(..)
      , stripTypeRep
      , unStripTypeRep
      , hashType5
      , hashed5Split
      , hashType32
      , hashType64

      -- * 'TyCon'
      , TyCon(..)
      , stripTyCon
      , unStripTyCon

) where



import           GHC.Generics
import           Text.Printf
import           Data.Bits ((.&.), (.|.))
import           Control.Applicative
-- import qualified Data.ByteString      as BS
import qualified Data.ByteString.Lazy as BSL

import           Data.Typeable (Typeable, typeOf)
import qualified Data.Typeable as Ty

import           Data.Binary

-- Crypto stuff for hashing
import qualified Data.Digest.Murmur32 as H32
import qualified Data.Digest.Murmur64 as H64



-- ^ Type information stored alongside a value to be serialized, so that the
-- recipient can do consistency checks. See 'TypeFormat' for more detailed
-- information on the fields.
data TypeInformation = Untyped'
                     | Hashed5'  Hash5
                     | Hashed32' Hash32
                     | Hashed64' Hash64
                     | Shown'    Hash32 String
                     | Full'     TypeRep
                     | Cached'   BSL.ByteString -- ^ Pre-serialized representation
                                                -- of one of the other fields.
                     deriving (Eq, Ord, Show, Generic)

instance Binary TypeInformation where
      put Untyped'             = putWord8 0
      put (Hashed5' (Hash5 x)) = putWord8 (x .|. 1) -- See 'Hash5' for info
      put (Hashed32' x)        = putWord8 2 *> put x
      put (Hashed64' x)        = putWord8 3 *> put x
      put (Shown'    x y)      = putWord8 4 *> put x *> put y
      put (Full'     x)        = putWord8 5 *> put x
      put (Cached'   x)        = putWord8 6 *> put x

      get = getWord8 >>= \case
            0 -> return Untyped'
            -- "1" case handled at the end
            2 -> fmap   Hashed32' get
            3 -> fmap   Hashed64' get
            4 -> liftA2 Shown'    get get
            5 -> fmap   Full'     get
            6 -> fmap   Cached'   get
            n -> case hashed5Split n of
                  (1, hash) -> return (Hashed5' hash)
                  _ -> fail ("Invalid TypeInformation (tag: " ++ show (hashed5Split n) ++ ")")



-- | Split a 'Word8' into the last 3 bit (used to tag the constructor) and
-- the first 5 (data payload). Used by the 'Binary' instance of
-- 'TypeInformation'.
hashed5Split :: Word8 -> (Word8, Hash5)
hashed5Split x = let hash = mkHash5 x
                     tag  = getHashed5Tag x
                 in  (tag, hash)



getHashed5Tag :: Word8 -> Word8
getHashed5Tag = (.&. 0x7) -- = 00000111



-- | Extract which 'TypeFormat' was used to create a certain 'TypeInformation'.
--
-- If the type is 'Cached'', then the contained information is assumed
-- well-formed. In the public API, this is safe to do, since only well-typed
-- 'Typed' values can be created in the first place.
getFormat :: TypeInformation -> TypeFormat
getFormat (Untyped'  {}) = Untyped
getFormat (Hashed5'  {}) = Hashed5
getFormat (Hashed32' {}) = Hashed32
getFormat (Hashed64' {}) = Hashed64
getFormat (Shown'    {}) = Shown
getFormat (Full'     {}) = Full
getFormat (Cached'   bs) = getFormat (decode bs)



-- | A 5-bit hash value.
--
-- Since 'TypeInformation' needs 3 bit to store the sort of the
-- 'TypeInformation', the remaining 5 bit per 'Word8' can be used to store a
-- hash value at no additional space cost. For this reason, it is important that
-- the three rightmost bits of any 'Hashed5' are set to zero, i.e. @('.&.' 7)@
-- is 'id' on the contained 'Word8'.
--
-- This type intentionally doesn't have a 'Binary' instance, since its
-- serialization is part of the 'TypeInformation' 'Binary' instance exclusively.
newtype Hash5 = Hash5 Word8
      deriving (Eq, Ord, Show)

-- | Smart constructor for 'Hash5' values. Makes sure the rightmost three bits
-- are not set by applying a bit mask to the input.
mkHash5 :: Integral a => a -> Hash5
mkHash5 x = Hash5 (fromIntegral x .&. 0xF8)
                                    -- = 11111000



-- | A 32-bit hash value.
newtype Hash32 = Hash32 Word32
      deriving (Eq, Ord, Show, Generic)
instance Binary Hash32



-- | A 64-bit hash value.
newtype Hash64 = Hash64 Word64
      deriving (Eq, Ord, Show, Generic)
instance Binary Hash64



-- | A value suitable to be typechecked using the contained extra type
-- information.
data Typed a = Typed TypeInformation a
      -- ^ Using this data constructor directly is unsafe, as it allows
      -- construction of ill-typed 'Typed' data. Use the 'typed' smart
      -- constructor unless you really need 'Typed'.

-- | "typed \<format\> \<value\>"
instance Show a => Show (Typed a) where
      show (Typed ty x) = printf "typed %s (%s)"
                                 (show (getFormat ty))
                                 (show x)

-- | Ensures data is decoded as the appropriate type with high or total
-- confidence (depending on with what 'TypeFormat' the 'Typed' was
-- constructed).
instance (Binary a, Typeable a) => Binary (Typed a) where
      get = do -- Explicitly get both values instead of a (ty,value) tuple
               -- in case Binary changes in the future. This ensures caching
               -- in 'decodeTyped' can rely on the two values coming in
               -- in this particular way.
               ty    <- get
               value <- get
               case typecheck (Typed ty value) of
                     Left err -> fail err -- NB: 'fail' is safe in Get Monad
                     Right wellTyped -> return wellTyped

      put (Typed ty value) = put ty *> put value



-- | Like 'Typed', but the type information is not checked. Useful to read type
-- and value, and do the typechecking externally, as required by the caching
-- of 'Data.Binary.Typed.decodeTyped'. Using 'typecheck'', this can be promoted
-- to a proper 'Typed' value.
data Typed' a = Typed' TypeInformation a

-- | "Typed' \<format\> \<value\>"
instance Show a => Show (Typed' a) where
      show (Typed' ty x) = printf "Typed' %s (%s)"
                                 (show (getFormat ty))
                                 (show x)

instance (Binary a) => Binary (Typed' a) where
      get = liftA2 Typed' get get
      put (Typed' ty value) = put ty *> put value



-- | Sometimes it can be beneficial to serialize the type information in
-- advance, so that the maybe costly serialization step does not have to be
-- repeated on every invocation of 'encode'. Preserialization comes at a price
-- though, as the directly contained 'BSL.ByteString' requires its length to
-- be included in the final serialization, yielding a 8-byte overhead for the
-- required 'Data.Int.Int64', and one for the tag of what was serialized
-- (\"shown or full?\").
--
-- This function calculates the serialized version of 'TypeInformation' in
-- cases where the required 9 bytes are negligible (determined by an
-- arbitrary threshold, currently 10*9 bytes).
--
-- Used to make 'Data.Binary.Typed.encodeTyped' more efficient; the source
-- there also makes a good usage example.
preserialize :: TypeInformation -> TypeInformation
preserialize x@(Cached'   {}) = x
preserialize x@(Untyped'  {}) = x
preserialize x@(Hashed5'  {}) = x
preserialize x@(Hashed32' {}) = x
preserialize x@(Hashed64' {}) = x
-- Explicit cases for Shown' and Full' so exhaustiveness can be checked when
-- new constructors are added. (The default pattern of just "x" would do right
-- now as well, but not provide that.)
preserialize x@(Shown'    {}) = preserialize' x
preserialize x@(Full'     {}) = preserialize' x



-- | Preserializes type information if its encoded byte length is larger than
-- an arbitrary threshold. Less efficient than 'preserialize' since it
-- always preserializes and always calculates the encoded version no matter
-- what.
preserialize' :: TypeInformation -> TypeInformation
preserialize' x | BSL.length encoded > 10*9 = Cached' encoded
                | otherwise = x
                where encoded = encode x



-- | Different ways of including/verifying type information of serialized
--   messages.
data TypeFormat =

        -- | Include no type information.
        --
        -- * Requires one byte more compared to using 'Binary' directly
        --   (to tag the data as untyped, required for the decoding step).
        -- * Encoding and decoding require negligible amount of additional
        --   computational cost compared to direct (intrinsically untyped)
        --   'Binary'.
        Untyped

        -- | Like 'Hashed32', but uses a 5-bit hash value.
        --
        -- * Requires the same amount of space as 'Untyped', i.e. the only
        --   overhead compared to it is the computational cost to calculate
        --   the hash, which is almost identical to the one of 'Hashed32'.
        -- * Collisions occur with a probability of 1\/2^5 = 1\/32. For this
        --   reason, this format is only recommended when minimal data size
        --   is top priority.
        --
      | Hashed5

        -- | Compare types by their hash values (using the MurmurHash2
        -- algorithm).
        --
        -- * Requires five bytes more compared to using 'Binary' directly for
        --   the type information (one to tag as 'Hashed32', four for the
        --   hash value)
        -- * Subject to false positive due to hash collisions, although in
        --   practice this should almost never happen.
        -- * Type errors cannot tell the provided type ("Expected X, received
        --   type with hash H")
        -- * Computational cost similar to 'Hashed64'.
      | Hashed32

        -- | Like 'Hashed32', but uses a 64-bit hash value.
        --
        -- * Requires nine bytes more compared to using 'Binary'.
        -- * Hash collisions are even less likely to occur than with
        --   'Hashed32'.
        -- * Computational cost similar to 'Hashed32'.
      | Hashed64

        -- | Compare 'String' representation of types, obtained by calling
        -- 'show' on the 'TypeRep', and also include a hash value
        -- (like 'Hashed32'). The former is mostly for readable error
        -- messages, the latter provides better collision resistance.
        --
        -- * Data size larger than 'Hashed32', but usually smaller than
        --   'Full'.
        -- * Both the hash and the shown type must match to satisfy the
        --   typechecker.
        -- * Useful type errors ("expected X, received Y"). All types are
        --   shown unqualified though, making @Foo.X@ and @Bar.X@ look
        --   identical in error messages. Remember this when you get a
        --   seemingly silly error "expected Foo, but given Foo".
      | Shown

        -- | Compare the full representation of a data type.
        --
        -- * More verbose than 'Shown'. As a rule of thumb, transmitted data is
        --   roughly the same as 'Shown', but all names are fully qualified
        --   (package, module, type name).
        -- * Correct comparison (no false positives). An semi-exception here
        --   is when types change between package versions:
        --   @package-1.0 Foo.X@ and @package-1.1 Foo.X@ count as the same
        --   type.
        -- * Useful type errors ("expected X, received Y"). All types are
        --   shown unqualified though, making @Foo.X@ and @Bar.X@ look
        --   identical in error messages. Remember this when you get a
        --   seemingly silly error "expected Foo, but given Foo".
      | Full

      deriving (Eq, Ord, Show)



-- | Construct a 'Typed' value using the chosen type format.
--
-- Example:
--
-- @
-- value = 'typed' 'Full' ("hello", 1 :: 'Int', 2.34 :: 'Double')
-- encoded = 'encode' value
-- @
--
-- The decode site can now verify whether decoding happens with the right type.
typed :: Typeable a => TypeFormat -> a -> Typed a
typed format x = Typed (makeTypeInformation format (typeOf x)) x



-- | Create the 'TypeInformation' to be stored inside a 'Typed' value from
-- a 'Ty.TypeRep'.
makeTypeInformation :: TypeFormat -> Ty.TypeRep -> TypeInformation
makeTypeInformation Untyped  _  = Untyped'
makeTypeInformation Hashed5  ty = Hashed5'   (hashType5    ty)
makeTypeInformation Hashed32 ty = Hashed32'  (hashType32   ty)
makeTypeInformation Hashed64 ty = Hashed64'  (hashType64   ty)
makeTypeInformation Shown    ty = Shown'     (hashType32   ty) (show ty)
makeTypeInformation Full     ty = Full'      (stripTypeRep ty)



-- | Extract the value of a 'Typed', i.e. strip off the explicit type
-- information.
--
-- This function is safe to use for all 'Typed' values created by the public
-- API, since all construction sites ensure the actual type matches the
-- contained type description.
--
-- @
-- 'erase' ('typed' format x) == x
-- @
erase :: Typed a -> a
erase (Typed _ty value) = value



-- | Typecheck a 'Typed'. Returns the (well-typed) input, or an error message
-- if the types don't work out.
typecheck :: Typeable a => Typed a -> Either String (Typed a)
typecheck ty@(Typed typeInformation x) = case typeInformation of
      Cached' cache -> decode' cache >>= \ty' -> typecheck (Typed ty' x)
      Full'     full     | exFull /= full     -> Left (fullError full)
      Hashed5'  hash5    | exHash5 /= hash5   -> Left (hashError exHash5  hash5)
      Hashed32' hash32   | exHash32 /= hash32 -> Left (hashError exHash32 hash32)
      Hashed64' hash64   | exHash64 /= hash64 -> Left (hashError exHash64 hash64)
      Shown'  hash32 str | (exHash32, exShow) /= (hash32, str)
                                              -> Left (shownError hash32 str)
      _no_type_error -> Right ty


      where

      -- ex = expected
      exType   = typeOf x
      exHash5  = hashType5    exType
      exHash32 = hashType32   exType
      exHash64 = hashType64   exType
      exShow   = show         exType
      exFull   = stripTypeRep exType

      hashError eHash hash = printf pat exShow (show eHash) (show hash)
            where pat = "Type error: expected type %s with hash %s,\
                        \ but received data with hash %s"
      shownError hash str = printf pat exShow (show exHash32) str (show hash)
            where pat = "Type error: expected type %s and hash %s,\
                        \ but received data with type %s and hash %s"
      fullError full = printf pat exShow (show full)
            where pat = "Type error: expected type %s,\
                        \ but received data with type %s"

      decode' bs = case decodeOrFail bs of
            Left  (_,_,err) -> Left  ("Cache error! " ++ err)
            Right (_,_,val) -> Right val



-- | Typecheck a 'Typed\'' value so it can be used as a safe 'Typed' value.
typecheck' :: Typeable a => Typed' a -> Either String (Typed a)
typecheck' (Typed' ty value) = typecheck (Typed ty value)



-- | Hash a 'Ty.TypeRep' to a 5-bit digest.
hashType5 :: Ty.TypeRep -> Hash5
hashType5 = mkHash5 . H32.asWord32 . H32.hash32 . stripTypeRep



-- | Hash a 'Ty.TypeRep' to a 32-bit digest.
hashType32 :: Ty.TypeRep -> Hash32
hashType32 = Hash32 . H32.asWord32 . H32.hash32 . stripTypeRep



-- | Hash a 'Ty.TypeRep' to a 64-bit digest.
hashType64 :: Ty.TypeRep -> Hash64
hashType64 = Hash64 . H64.asWord64 . H64.hash64 . stripTypeRep



-- | 'Ty.TypeRep' without the (internal) fingerprint.
data TypeRep = TypeRep TyCon [TypeRep]
      deriving (Eq, Ord, Generic)
instance Binary TypeRep

instance Show TypeRep where
      show = show . unStripTypeRep

instance H32.Hashable32 TypeRep where
      hash32Add (TypeRep tycon args) = H32.hash32Add (tycon, args)

instance H64.Hashable64 TypeRep where
      hash64Add (TypeRep tycon args) = H64.hash64Add (tycon, args)




-- | 'Ty.TyCon' without the (internal) fingerprint.
data TyCon = TyCon String String String -- ^ Package, module, constructor name
      deriving (Eq, Ord, Generic)
instance Binary TyCon

instance Show TyCon where
      show = show . unStripTyCon

instance H32.Hashable32 TyCon where
      hash32Add (TyCon p m c) = H32.hash32Add (p, m, c)

instance H64.Hashable64 TyCon where
      hash64Add (TyCon p m c) = H64.hash64Add (p, m, c)



-- | Strip a 'Ty.TypeRep' off the fingerprint. Inverse of 'unStripTypeRep'.
stripTypeRep :: Ty.TypeRep -> TypeRep
stripTypeRep typerep = TypeRep (stripTyCon tycon) (map stripTypeRep args)
      where (tycon, args) = Ty.splitTyConApp typerep



-- | Add a fingerprint to a 'TypeRep'. Inverse of 'stripTypeRep'.
unStripTypeRep :: TypeRep -> Ty.TypeRep
unStripTypeRep (TypeRep tyCon args) = Ty.mkTyConApp (unStripTyCon tyCon)
                                                    (map unStripTypeRep args)



-- | Strip a 'Ty.TyCon' off the fingerprint. Inverse of 'unStripTyCon'.
stripTyCon :: Ty.TyCon -> TyCon
stripTyCon tycon = TyCon (Ty.tyConPackage tycon)
                         (Ty.tyConModule  tycon)
                         (Ty.tyConName    tycon)
                         -- The Typeable API doesn't expose the
                         -- TyCon constructor, so pattern matching
                         -- is not possible here (without depending
                         -- on Typeable.Internal).



-- | Add a fingerprint to a 'TyCon'. Inverse of 'stripTyCon'.
unStripTyCon :: TyCon -> Ty.TyCon
unStripTyCon (TyCon p m n) = Ty.mkTyCon3 p m n -- package, module, name