{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE NamedFieldPuns #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TemplateHaskell #-}

-- To read all the plaintext of a CHK share which you have enough shares for:
--
-- (-1). Find and download the shares
-- ( 0). Parse the share bytes into the various fields
--   1.  Check the UEB (URI Extension Block) hash
--   2.  Decode the UEB to find the share root hash
--   3.  Build the block hash tree for all shares you have
--   4.  Build the share hash tree out of those block hash tree roots
--       combined with all of the "needed hashes" you pulled out of the
--       shares you have
--   5.  Check the root of the share hash tree against the value in the UEB
--   6.  ZFEC decode the blocks into ciphertext **
--   7.  Check the "crypttext hash" against the hash of the ciphertext
--       (maybe helps catch a ZFEC implementation bug?)
--   8.  Decrypt the ciphertext **

-- 3 of 4
-- Have 4, 5, 6
-- neededHashes a == [ 5, 6, 7 ]
--
--        1
--    2       3
--  4   5   6   7
--  a   b   c   d
--  ^
-- 5+"5s hash"+6+"6s hash"+7+"7s hash"

{- | A share is a single data object comprising some erasure-encoded data and
 some cryptographic hashes which allow certain determinations to be made
 about that data.

 One or more shares can be interpreted together, typically to recover a
 particular ciphertext object.

 This module exposes a structured representation of the share object along
 with an encoder to, and a decoder from, the canonical serialized
 representation.
-}
module Tahoe.CHK.Share where

import Control.Exception (Exception, throw)
import Control.Lens (makeLenses)
import Crypto.Hash (HashAlgorithm (hashDigestSize), digestFromByteString)
import Data.Bifunctor (Bifunctor (bimap))
import Data.Binary (
    Binary (get, put),
    Word32,
    Word64,
    Word8,
    encode,
 )
import Data.Binary.Get (
    Get,
    bytesRead,
    getLazyByteString,
    isolate,
 )
import Data.Binary.Put (Put, putLazyByteString)
import Data.Bits (shiftL, (.|.))
import qualified Data.ByteString as BS
import qualified Data.ByteString.Builder as BS
import qualified Data.ByteString.Lazy as LBS
import Data.Either (fromRight)
import Data.Int (Int64)
import Data.List.Extra (dropEnd, sumOn')
import Data.Maybe (fromMaybe)
import Data.TreeDiff.Class (ToExpr)
import Debug.Trace ()
import GHC.Generics (Generic)
import Network.ByteOrder (bytestring32, bytestring64)
import Tahoe.CHK.Merkle (MerkleTree)
import Tahoe.CHK.SHA256d (Digest' (Digest'), SHA256d, toBytes)
import Tahoe.CHK.Types (ShareNum)
import Tahoe.CHK.URIExtension (
    URIExtension,
    pURIExtension,
    uriExtensionToBytes,
 )
import Tahoe.Util (chunkedBy, toStrictByteString)
import Text.Megaparsec (parse)

-- | A byte string of encrypted data.
type Crypttext = BS.ByteString

-- | Structured representation of a single CHK share.
data Share = Share
    { -- | The ZFEC block size. Legacy value. Unused.
      _blockSize :: Word64
    , -- | The share data length. Legacy value. Unused.
      _dataSize :: Word64
    , -- | The ZFEC encoded ciphertext blocks.
      _blocks :: [LBS.ByteString]
    , -- | A merkle tree of plaintext segment hashes. Unimplemented.
      _plaintextHashTree :: MerkleTree BS.ByteString SHA256d
    , -- | A merkle tree of ciphertext segment hashes.
      _crypttextHashTree :: MerkleTree Crypttext SHA256d
    , -- | A merkle tree of hashes of `shareBlocks`.
      _blockHashTree :: MerkleTree BS.ByteString SHA256d
    , -- | The information needed to complete a merkle proof for this share.
      _neededHashes :: [(ShareNum, Digest' SHA256d)]
    , -- | Additional metadata about this share.
      _uriExtension :: URIExtension
    }
    deriving (Eq, Ord, Show, Generic, ToExpr)

$(makeLenses ''Share)
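-- A hedged usage sketch of the 'Binary' instance defined below. The value
-- 'someShare' is hypothetical (a well-formed 'Share' whose blocks agree
-- with its block size); real share bytes normally come from storage
-- servers. 'decodeOrFail' is from Data.Binary and is not otherwise
-- imported by this module.
--
-- >>> import Data.Binary (decodeOrFail)
-- >>> let bytes = encode someShare
-- >>> (\(_, _, s) -> s == someShare) <$> decodeOrFail bytes
-- Right True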
getWord32 :: Get Word64
getWord32 = do
    word32 <- get :: Get Word32
    pure $ fromIntegral word32

getWord64 :: Get Word64
getWord64 = get

word64To4Bytes :: Word64 -> Maybe BS.ByteString
word64To4Bytes = (bytestring32 <$>) . word64ToWord32

word64To4Bytes' :: Word64 -> Either String BS.ByteString
word64To4Bytes' w = case word64To4Bytes w of
    Nothing -> Left "Word64 out of bounds in conversion to Word32"
    Just bs -> pure bs

word64To8Bytes :: Word64 -> BS.ByteString
word64To8Bytes = bytestring64
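-- A quick sketch of the width helpers above: 'word64To4Bytes' refuses any
-- value that does not fit in 32 bits, which is what later forces the v2
-- share format for sufficiently large shares.
--
-- >>> word64To4Bytes 1
-- Just "\NUL\NUL\NUL\SOH"
-- >>> word64To4Bytes (2 ^ 32)
-- Nothing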
instance Binary Share where
    -- Serialize a share to its canonical byte representation. This
    -- replaces much of allmydata.immutable.layout.
    put Share{..} =
        let -- shareDataSize is supposedly unused. Avoid making any
            -- calculations based on its value. We'll serialize it into the
            -- output but otherwise we should ignore it. Instead, we'll use
            -- this computed value that's consistent with the rest of our
            -- data.
            --
            -- CRSEncoder.set_params
            realSize = sumOn' LBS.length _blocks

            -- Pick a share format version based on the size of our data,
            -- along with helpers to encode our fields for that format
            -- version.
            --
            -- Okay, we won't completely ignore shareDataSize. We can't
            -- encode sufficiently large values into a v1 format share so
            -- switch to v2 format if shareDataSize needs it.
            --
            -- Tahoe also checks blockSize < 2 ^ 32 but I don't see how it
            -- is possible for blockSize to be greater than dataSize.
            (version, encodeWord, putWord) = chooseVersion $ max _dataSize (int64ToWord64 realSize)

            -- This excludes the version but otherwise has all of the
            -- integer header fields we need to write.
            header =
                [ _blockSize
                , _dataSize
                , (fromIntegral :: Int -> Word64) headerSize
                ]
                    <> trailerFieldOffsets

            -- Compute the header size so we can include it in the offset
            -- calculation. The header is the 4 byte version field and then
            -- some additional number of integer fields. Each subsequent
            -- integer field is either 4 or 8 bytes depending on the share
            -- version.
            headerSize = 4 + fieldSizeForVersion version * length header

            -- Then compute the offset of each piece of the trailer. They
            -- all follow the header and all of the share blocks, so start
            -- there and advance by the size of each trailer piece.
            trailerOffset = (fromIntegral :: Int -> Word64) headerSize + int64ToWord64 realSize

            -- The scanl would also calculate the offset of the field
            -- following the last field - which we don't need or want. So
            -- drop the last size.
            trailerFieldOffsets = scanl (+) trailerOffset (dropEnd 1 trailerFieldSizes)

            -- We need to write offsets to the trailer fields into the
            -- header. Compute the size of each trailer piece so we know
            -- how they'll be laid out.
            trailerFieldSizes = map (int64ToWord64 . LBS.length) trailerFields

            -- Construct all of the trailing metadata here so we know how
            -- big each piece of it is. We need to put offsets pointing at
            -- this data into the header. Keep in mind that nearby code
            -- assumes this list contains one element for each trailer
            -- field which has an offset recorded in the header. That code
            -- will produce an incorrect header if this assumption is
            -- violated.
            ueb = uriExtensionToBytes _uriExtension
            trailerFields =
                [ encode _plaintextHashTree
                , encode _crypttextHashTree
                , encode _blockHashTree
                , LBS.fromStrict $ serializeNeededShares _neededHashes
                , LBS.fromStrict $ encodeWord (intToWord64 $ BS.length ueb) <> ueb
                ]
         in do
                put (fromIntegral version :: Word32)
                mapM_ putWord header
                mapM_ putLazyByteString _blocks
                mapM_ putLazyByteString trailerFields

    get = do
        -- Read the version marker to determine the size of certain
        -- following fields.
        (_version, getWord) <- getVersion -- 0, 1
        _blockSize <- getWord -- 4, 1
        _dataSize <- getWord -- 8, 1

        -- These offsets are all relative to the beginning of the share.
        dataOffset <- getWord -- 12, 36
        plaintextHashTreeOffset <- getWord -- 16, 37
        crypttextHashTreeOffset <- getWord -- 20, 69
        blockHashesOffset <- getWord -- 24, 101
        shareHashesOffset <- getWord -- 28, 133
        uriExtensionLengthOffset <- getWord -- 32, 167

        -- Load the rest of the fields in the typical order. The offsets
        -- might place these fields in a different order but they really
        -- shouldn't. We'll fail with an explicit error in that case thanks
        -- to the position checking done in getLazyByteStringInBoundsFrom.
        -- Then we'll fail to load the share but at least we won't apply an
        -- invalid interpretation to any of the data.
        allShareBlocks <- getLazyByteStringInBoundsFrom "share blocks" dataOffset plaintextHashTreeOffset -- 36, <1 byte>
        _plaintextHashTree <- isolateBetween "plaintext hash tree" plaintextHashTreeOffset crypttextHashTreeOffset get -- 37, <69 - 37 == 32 bytes>
        _crypttextHashTree <- isolateBetween "crypttext hash tree" crypttextHashTreeOffset blockHashesOffset get -- 69, <101 - 69 == 32 bytes>
        _blockHashTree <- isolateBetween "block hash tree" blockHashesOffset shareHashesOffset get -- 101, <133 - 101 == 32 bytes>
        _neededHashes <-
            fromMaybe (fail "Could not parse `needed hashes`")
                . unserializeNeededShares
                . LBS.toStrict
                <$> getLazyByteStringInBoundsFrom "needed shares" shareHashesOffset uriExtensionLengthOffset -- 133, <167 - 133 == 34 bytes>
        uriExtensionLength <- getWord >>= getInt64FromWord64 "URI extension length" -- 167,
        uriExtensionBytes <- getLazyByteString uriExtensionLength
        _uriExtension <-
            either (fail . show) pure $
                parse pURIExtension "URI extension" (LBS.toStrict uriExtensionBytes)
        let _blocks = segmentLazyBytes (fromIntegral _blockSize) allShareBlocks
        pure $ Share{..}

-- | Split a lazy byte string into segments of the given size. The final
-- segment may be shorter if the input does not divide evenly.
segmentLazyBytes :: Int64 -> LBS.ByteString -> [LBS.ByteString]
segmentLazyBytes _segmentSize "" = []
segmentLazyBytes segmentSize bs = nextSegment : segmentLazyBytes segmentSize theRest
  where
    (nextSegment, theRest) = LBS.splitAt segmentSize bs

-- | Run a Get over the bytes between the two given positions, failing if
-- the current position does not match the expected start.
isolateBetween :: String -> Word64 -> Word64 -> Get a -> Get a
isolateBetween name start end g = do
    pos <- bytesRead
    if (fromIntegral :: Int64 -> Word64) pos /= start
        then fail $ "expected to read from " <> show start <> " to get " <> name <> " but position is " <> show pos
        else isolate (fromIntegral (end - start)) g

-- | Read the bytes from the current position up to the given offset,
-- failing if the current position is not the expected one.
getLazyByteStringInBoundsFrom :: String -> Word64 -> Word64 -> Get LBS.ByteString
getLazyByteStringInBoundsFrom name expectedPosition offset = do
    pos <- bytesRead
    if (fromIntegral :: Int64 -> Word64) pos /= expectedPosition
        then fail $ "expected to read from " <> show expectedPosition <> " to get " <> name <> " but position is " <> show pos
        else do
            offsetInt64 <- getInt64FromWord64 name offset
            getLazyByteString (offsetInt64 - pos)

-- | Convert a Word64 to an Int64, failing in Get if it is out of bounds.
getInt64FromWord64 :: String -> Word64 -> Get Int64
getInt64FromWord64 name = maybe (fail $ name <> " out of bounds") pure . word64ToInt64
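-- A quick sketch of 'segmentLazyBytes' above (OverloadedStrings enabled in
-- GHCi for the byte string literal):
--
-- >>> :set -XOverloadedStrings
-- >>> segmentLazyBytes 3 "abcdefgh"
-- ["abc","def","gh"]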
word64ToInt64 :: Word64 -> Maybe Int64
word64ToInt64 w
    | w > maxInt64 = Nothing
    | otherwise = Just (fromIntegral w)
  where
    maxInt64 :: Word64
    maxInt64 = fromIntegral (maxBound :: Int64)

word64ToWord32 :: Word64 -> Maybe Word32
word64ToWord32 w
    | w > maxWord32 = Nothing
    | otherwise = Just (fromIntegral w)

maxWord32 :: Integral i => i
maxWord32 = fromIntegral (maxBound :: Word32)

{- | Serialize the list of (share number, block tree root hash) pairs for
 inclusion in the serialized form of a Share. The inverse of
 unserializeNeededShares.
-}
serializeNeededShares :: HashAlgorithm hash => [(ShareNum, Digest' hash)] -> BS.ByteString
serializeNeededShares = BS.concat . pieces
  where
    pieces [] = []
    pieces ((sharenum, hash) : xs) =
        (toStrictByteString . BS.int16BE . fromIntegral $ sharenum)
            : toBytes hash
            : pieces xs

{- | Unserialize a list of (share number, block tree root hash) pairs from
 their form in a serialized Share. The inverse of serializeNeededShares.
-}
unserializeNeededShares :: forall hash. HashAlgorithm hash => BS.ByteString -> Maybe [(ShareNum, Digest' hash)]
unserializeNeededShares bs = traverse sequenceA result
  where
    chunks = chunkedBy (2 + hashDigestSize (undefined :: hash)) bs
    pairs = map (BS.splitAt 2) chunks
    result = bimap toShareNum (fmap Digest' . digestFromByteString) <$> pairs

-- | Interpret a two byte string as a big-endian share number.
toShareNum :: BS.ByteString -> ShareNum
toShareNum x = fromIntegral $ fromEnum msb `shiftL` 8 .|. fromEnum lsb
  where
    msb = BS.head x
    lsb = BS.last x

intToWord64 :: Int -> Word64
intToWord64 x
    | x < 0 = error "Negative Int cannot be converted to Word64"
    | otherwise = fromIntegral x

int64ToWord64 :: Int64 -> Word64
int64ToWord64 x
    | x < 0 = error "Negative Int64 cannot be converted to Word64"
    | otherwise = fromIntegral x

-- | Read the share version marker and return it along with a Get that
-- reads header fields of the width that version dictates.
getVersion :: Get (Word8, Get Word64)
getVersion = do
    version <- getWord32
    pure
        ( fromIntegral version
        , case version of
            1 -> getWord32
            2 -> getWord64
            _ -> fail $ "unsupported version: " <> show version
        )

chooseVersion :: Word64 -> (Word8, Word64 -> BS.ByteString, Word64 -> Put)
chooseVersion shareDataSize = (version, encodeWord, putWord)
  where
    -- Version 1 can encode sizes up to 2^32 bytes. Version 2 can encode
    -- sizes up to 2^64 bytes. Choose a version based on the actual data
    -- size. We only save a handful of header bytes this way, so the extra
    -- complexity may not be worth it just for that, but it's convenient to
    -- be able to emit either share version for testing.
    version = if shareDataSize <= maxWord32 then 1 else 2

    -- Here's where the version makes a difference to the header size.
    -- Choose an integer encoding that uses the right number of bytes.
    encodeWord
        -- word64To4Bytes can't always succeed but if we're picking version
        -- 1 then we believe it will succeed. If it fails, we'll have to
        -- have a hard error :/ This is not ideal but ... I dunno what to
        -- do.
        | version == 1 = word64To4BytesPartial
        | version == 2 = word64To8Bytes
        | otherwise = error $ "unsupported version: " <> show version

    putWord = putLazyByteString . LBS.fromStrict . encodeWord

fieldSizeForVersion :: Word8 -> Int
fieldSizeForVersion 1 = 4
fieldSizeForVersion 2 = 8
fieldSizeForVersion n = error $ "Unsupported version number: " <> show n

{- | Serialize a Word64 to 4 bytes or throw an exception if the value
 cannot fit.
-}
word64To4BytesPartial :: Word64 -> BS.ByteString
word64To4BytesPartial i = fromRight (throw $ Word64OutOfBounds i) (word64To4Bytes' i)

newtype EncodingError = Word64OutOfBounds Word64
    deriving (Eq, Ord, Show)

instance Exception EncodingError
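-- A quick sketch of 'chooseVersion' above: sizes that fit in 32 bits
-- select the v1 format with 4 byte header fields, and anything larger
-- selects v2 with 8 byte fields.
--
-- >>> (\(v, _, _) -> v) (chooseVersion 1024)
-- 1
-- >>> (\(v, _, _) -> v) (chooseVersion (2 ^ 32))
-- 2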