-- | Intended for unqualified import: -- -- > import module Codec.Compression.SnappyC.Internal.FrameFormat module Codec.Compression.SnappyC.Internal.FrameFormat ( -- * The 'Frame' type Frame(..) , FrameIdentifier(..) -- ** Encoding , Encoder(..) , EncodeParams(..) , FrameSize -- Opaque , Threshold(..) , EncodeState(..) , EncodeResult(..) , initializeEncoder , defaultEncodeParams , finalizeEncoder , encodeBuffered , customFrameSize , unFrameSize , defaultFrameSize -- ** Decoding , Decoder(..) , DecodeParams(..) , DecodeState(..) , DecodeResult(..) , DecodeFailure(..) , initializeDecoder , defaultDecodeParams , finalizeDecoder , decodeBuffered ) where import Codec.Compression.SnappyC.Internal.Buffer (Buffer) import Codec.Compression.SnappyC.Internal.Buffer qualified as Buffer import Codec.Compression.SnappyC.Internal.Checksum (Checksum) import Codec.Compression.SnappyC.Internal.Checksum qualified as Checksum import Codec.Compression.SnappyC.Raw qualified as Raw import Control.Exception import Control.Monad.Error.Class import Data.Bits import Data.ByteString qualified as Strict (ByteString) import Data.ByteString qualified as BS.Strict import Data.Default import Data.Word import Text.Printf import Control.Monad -- | Snappy frames consist of a header and a payload. data Frame = Frame { frameHeader :: !FrameHeader , framePayload :: !Strict.ByteString } deriving Show -- | A frame's header contains an identifier corresponding to the frame type and -- the size of the payload. data FrameHeader = FrameHeader { frameHeaderIdentifier :: !FrameIdentifier , frameHeaderPayloadSize :: !Int } deriving (Show, Eq) -- | Snappy frame identifiers. data FrameIdentifier = StreamId | Compressed | Uncompressed | Padding | ReservedUnskippable Word8 | ReservedSkippable Word8 deriving (Show, Eq) -- | The one byte value corresponding to an identifier. encodeFrameIdentifier :: FrameIdentifier -> Word8 encodeFrameIdentifier StreamId = 0xff encodeFrameIdentifier Compressed = 0x00 encodeFrameIdentifier Uncompressed = 0x01 encodeFrameIdentifier Padding = 0xfd encodeFrameIdentifier (ReservedUnskippable fid) = fid encodeFrameIdentifier (ReservedSkippable fid) = fid -- | Encode an 'Int' as a three byte little-endian value. encodeFrameHeader :: FrameHeader -> Strict.ByteString encodeFrameHeader (FrameHeader ident size)= BS.Strict.pack [ encodeFrameIdentifier ident , fromIntegral (w32Size .&. 0x000000ff) , fromIntegral $ (w32Size .&. 0x0000ff00) `shiftR` 8 , fromIntegral $ (w32Size .&. 0x00ff0000) `shiftR` 16 ] where w32Size :: Word32 w32Size = fromIntegral size ------------------------------------------------------------------------------- -- Encoding Snappy frames ------------------------------------------------------------------------------- -- | Buffers uncompressed data for compression. newtype Encoder = Encoder { encoderBuffer :: Buffer } deriving Show -- | Determines how much data is put in each Snappy frame and whether it is -- compressed. data EncodeParams = EncodeParams { -- | Exact amount of uncompressed data included in a single frame. frameSize :: !FrameSize -- | Compression threshold. , threshold :: !Threshold } deriving (Show, Eq) instance Default EncodeParams where def :: EncodeParams def = defaultEncodeParams defaultEncodeParams :: EncodeParams defaultEncodeParams = EncodeParams def def -- | Number of bytes of uncompressed data. newtype FrameSize = FrameSize Int deriving (Show, Eq) instance Default FrameSize where def :: FrameSize def = defaultFrameSize -- | The default frame size is 65536 bytes, which is the maximum allowed by the -- [Snappy framing format -- description](https://github.com/google/snappy/blob/main/framing_format.txt). defaultFrameSize :: FrameSize defaultFrameSize = FrameSize snappySpecMaxChunkBytes -- | See section 4.2 of the [Snappy framing format -- description](https://github.com/google/snappy/blob/main/framing_format.txt). snappySpecMaxChunkBytes :: Int snappySpecMaxChunkBytes = 65536 -- | Create a 'FrameSize'. -- -- Must be within the inclusive range [ 1 .. 65536 ]. customFrameSize :: Int -> FrameSize customFrameSize n | n >= 1 && n <= snappySpecMaxChunkBytes = FrameSize n | otherwise = error "customFrameSize: invalid frame size" -- | Unwrap a 'FrameSize' unFrameSize :: FrameSize -> Int unFrameSize (FrameSize n) = n -- | Compression threshold, with explicit 'AlwaysCompress' and 'NeverCompress' -- settings. data Threshold = -- | Compress everything AlwaysCompress -- | Compress nothing | NeverCompress -- | Uncompressed size divided by compressed size. -- -- Only produce compressed frames if the compression ratio for the data is -- equal to or above this threshold. -- -- A higher threshold may result in less frames holding compressed data, -- and thus faster decompression/decoding. -- -- [According to -- Google](https://github.com/google/snappy?tab=readme-ov-file#performance), -- the typical highest compression ratio that Snappy achieves is about 4, -- so a 'Ratio' of > 4.0 should be similar to 'NeverCompress', while a -- 'Ratio' of < 7/8 should be similar to 'AlwaysCompress'. | Ratio !Double deriving (Show, Eq) instance Default Threshold where def :: Threshold def = defaultThreshold -- | The default threshold is a ratio of 8:7, which was taken from the -- [golang/snappy -- implementation](https://github.com/golang/snappy/blob/43d5d4cd4e0e3390b0b645d5c3ef1187642403d8/encode.go#L231). defaultThreshold :: Threshold defaultThreshold = Ratio (1 / 0.875) -- | Determines how much uncompressed data is stored in each resulting frame. newtype EncodeState = EncodeState { encodeStateMaxChunkBytes :: Int } deriving Show -- | A pair of frame-encoded chunks and an updated 'Encoder'. data EncodeResult = EncodeResult { encodeResultEncoded :: [Strict.ByteString] , encodeResultEncoder :: !Encoder } deriving Show -- | Initialize an 'Encoder' with the given maximum number of bytes of -- uncompressed data to include in frames resulting from the 'Encoder'. If the -- given number of bytes is not in the inclusive range [1 .. 65536], 65536 is -- used. -- -- The 'Strict.ByteString' holds the Snappy stream identifier frame that must be -- included at the start of every Snappy frame encoded stream. initializeEncoder :: (Strict.ByteString, Encoder) initializeEncoder = ( "\xff\x06\x00\00sNaPpY" , Encoder { encoderBuffer = Buffer.empty } ) -- | Call to indicate no more input and flush the remaining data in the -- 'Encoder' into a new frame. -- -- If there is no more data in the 'Encoder', an empty list is returned. -- -- * __Precondition:__ The buffer does not hold more data than the 'frameSize' -- in the 'EncodeParams'. Use the postcondition of 'encodeBuffered' to ensure -- this. finalizeEncoder :: EncodeParams -> Encoder -> [Strict.ByteString] finalizeEncoder ep (Encoder b) | Buffer.null b = [] | otherwise = encodeChunk ep (Buffer.toStrict b) -- | Fill and compress/encode as many frames as possible with the data in the -- 'Encoder'. -- -- /O(1)/ if there are not enough bytes in the buffer to fill a frame. -- -- * __Postcondition:__ The resulting buffer never holds more than the -- 'frameSize' given in the 'EncodeParams'. encodeBuffered :: EncodeParams -> Encoder -> EncodeResult encodeBuffered ep@EncodeParams{..} = \(Encoder b) -> go [] b where go :: [Strict.ByteString] -> Buffer -> EncodeResult go acc b = case Buffer.splitExactly (unFrameSize frameSize) b of Right (chunk, b') -> go (reverse (encodeChunk ep chunk) ++ acc) b' Left _ -> EncodeResult (reverse acc) (Encoder b) -- | Encode the input as a potentially compressed Snappy frame. -- -- This function takes a 'Strict.ByteString' because it must pass all of the -- data to a C function which expects the data to sit in a single buffer. encodeChunk :: EncodeParams -> Strict.ByteString -> [Strict.ByteString] encodeChunk EncodeParams{..} uncompressed = [ encodeFrameHeader (FrameHeader frameId (BS.Strict.length payloadData + 4)) , Checksum.encode maskedChecksum , payloadData ] where maskedChecksum :: Checksum maskedChecksum = Checksum.calculate uncompressed compressed :: Strict.ByteString compressed = Raw.compress uncompressed compressionRatio :: Double compressionRatio = (/) @Double (fromIntegral $ BS.Strict.length uncompressed) (fromIntegral $ BS.Strict.length compressed) (frameId, payloadData) = if doCompress then (Compressed, compressed) else (Uncompressed, uncompressed) doCompress = case threshold of AlwaysCompress -> True NeverCompress -> False Ratio ratio -> compressionRatio >= ratio ------------------------------------------------------------------------------- -- Decoding Snappy frames ------------------------------------------------------------------------------- -- | Buffers compressed data for decompression and holds some useful -- decompression state. data Decoder = Decoder { -- | Accumulated Snappy framed data. -- -- * __Invariant:__ This buffer never holds a fully decodable Snappy -- frame. decoderBuffer :: !Buffer -- | Tracks partial information about the buffer, e.g. whether we have -- decoded a header and how many bytes we need to fully decode a frame. , decoderState :: !DecodeState } deriving Show -- | Have we decoded a header for the current frame yet? -- -- If so, what was that header? data DecodeState = Initial | KnownHeader !FrameHeader deriving (Show, Eq) -- | Pair of decompressed data chunks and an updated 'Decoder'. data DecodeResult = DecodeResult { decodeResultDecoded :: [Strict.ByteString] , decodeResultDecoder :: !Decoder } deriving Show -- | Decode parameters data DecodeParams = DecodeParams { -- | Verify the uncompressed data checksums during decompression -- -- Defaults to 'False'. Even if we don't verify the CRC, if the data -- is not Snappy compressed then decompression will likely still fail -- due to failing to decode the frame headers. -- -- To enable this, use the incremental API -- ('Codec.Compression.SnappyC.Framed.decompressStep'). Note that -- checksum verification adds a significant overhead to decompression. verifyChecksum :: !Bool } deriving (Show, Eq) instance Default DecodeParams where def :: DecodeParams def = defaultDecodeParams -- | Default decode parameters defaultDecodeParams :: DecodeParams defaultDecodeParams = DecodeParams False -- | Possible failure modes for decompression. data DecodeFailure = DecompressionError Strict.ByteString | ReservedUnskippableFrameId Word8 | BadStreamId Strict.ByteString | BadChecksum Strict.ByteString -- ^ Data Checksum -- ^ Received Checksum -- ^ Computed | NotDone deriving Show deriving anyclass Exception -- | The empty 'Decoder', in an initial state. initializeDecoder :: Decoder initializeDecoder = Decoder { decoderBuffer = Buffer.empty , decoderState = Initial } -- | Verify that the 'Decoder' is complete. -- -- If the 'Decoder'\'s buffer still has data in it, 'NotDone' is returned. finalizeDecoder :: Decoder -> Either DecodeFailure () finalizeDecoder Decoder{..} | decoderState /= Initial || not (Buffer.null decoderBuffer) = throwError NotDone | otherwise = return () -- | Decompress/decode as many frames as possible with the data in the -- 'Decoder'. -- -- This is not as lazy as it could be. If we had a version of 'decodeFrame' that -- threw exceptions on failure, we could be a bit more lazy. It's not clear to -- me if this would actually be good for performance. -- -- /O(1)/ if there are insufficient bytes in the buffer. decodeBuffered :: DecodeParams -> Decoder -> Either DecodeFailure DecodeResult decodeBuffered dps = go [] where go :: [Strict.ByteString] -> Decoder -> Either DecodeFailure DecodeResult go acc (Decoder b state@Initial) = case Buffer.splitExactly 4 b of Right (headerBs, rest) -> do header <- decodeHeader headerBs go acc (Decoder rest (KnownHeader header)) Left _ -> return $ DecodeResult (reverse acc) $ Decoder b state go acc (Decoder b state@(KnownHeader header)) = case Buffer.splitExactly (frameHeaderPayloadSize header) b of Right (payloadBs, rest) -> do uncompressed <- decodeFrame dps header payloadBs go (maybe acc (: acc) uncompressed) (Decoder rest Initial) Left _ -> return $ DecodeResult (reverse acc) $ Decoder b state -- | Decode header -- -- __Precondition:__ The given 'Strict.ByteString' must be exactly 4 bytes -- long. decodeHeader :: Strict.ByteString -> Either DecodeFailure FrameHeader decodeHeader bs = case BS.Strict.unpack bs of [bid, b1, b2, b3] -> let payloadLen = lEWord24BytesToInt (b1, b2, b3) in case bid of 0xff -> return $ FrameHeader StreamId payloadLen 0xfd -> return $ FrameHeader Padding payloadLen 0x00 -> return $ FrameHeader Compressed payloadLen 0x01 -> return $ FrameHeader Uncompressed payloadLen fid | fid `elem` [ 0x02 .. 0x7f ] -> return $ FrameHeader (ReservedUnskippable fid) payloadLen | fid `elem` [ 0x80 .. 0xfd ] -> return $ FrameHeader (ReservedSkippable fid) payloadLen | otherwise -> error $ printf ( "FrameFormat.decodeHeader: " ++ "impossible frame identifier 0x%x" ) fid _ -> error "FrameFormat.decodeHeader: precondition violated" where lEWord24BytesToInt :: (Word8, Word8, Word8) -> Int lEWord24BytesToInt (lsb, mid, msb) = fromIntegral msb `shiftL` 16 .|. fromIntegral mid `shiftL` 8 .|. fromIntegral lsb -- | Decode a frame -- -- __Precondition:__ The given 'Strict.ByteString' is the payload associated -- with the given 'FrameHeader'. decodeFrame :: DecodeParams -> FrameHeader -> Strict.ByteString -> Either DecodeFailure (Maybe Strict.ByteString) decodeFrame dps header bs = case frameHeaderIdentifier header of StreamId -> if bs == "sNaPpY" then return Nothing else throwError $ BadStreamId bs Compressed -> do let !(checksumBs, rest) = BS.Strict.splitAt 4 bs uncompressed <- case Raw.decompress rest of Nothing -> throwError $ DecompressionError rest Just decompressed -> return decompressed Just <$> verifyPayload dps checksumBs uncompressed Uncompressed -> do let !(checksumBs, uncompressed) = BS.Strict.splitAt 4 bs Just <$> verifyPayload dps checksumBs uncompressed Padding -> return Nothing ReservedUnskippable fid -> -- An unskippable reserved frame is one that has an important payload, -- but we don't know what it is so we can't decode it. throwError $ ReservedUnskippableFrameId fid ReservedSkippable _ -> return Nothing -- | If checksum verification is enabled, compute the checksum and compare -- against the decoded checksum. verifyPayload :: DecodeParams -> Strict.ByteString -- ^ Encoded little-endian checksum -> Strict.ByteString -- ^ Uncompressed payload -> Either DecodeFailure Strict.ByteString verifyPayload dps checksumBs uncompressed = do when (verifyChecksum dps) $ do let decodedChecksum = Checksum.decode checksumBs computedChecksum = Checksum.calculate uncompressed when (computedChecksum /= decodedChecksum) $ throwError $ BadChecksum uncompressed decodedChecksum computedChecksum return uncompressed