{-# LANGUAGE DefaultSignatures #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE MagicHash #-} {-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TypeApplications #-} {-# LANGUAGE TypeOperators #-} {-# LANGUAGE TypeSynonymInstances #-} {-# LANGUAGE UnboxedTuples #-} -- | 'ContentHashable' provides a hashing function suitable for use in the -- Funflow content store. -- -- This behaves as does a normal hashing function on Haskell types. However, -- on path types, this instead calculates a hash based on the contents of the -- file or directory referenced. -- -- We also export the 'ExternallyAssuredFile' and 'ExternallyAssuredDirectory' -- types. These instead use the path, file size and modification time to control -- the hash. module Data.CAS.ContentHashable ( ContentHash , toBytes , fromBytes , ContentHashable (..) , contentHashUpdate_binaryFile , contentHashUpdate_byteArray# , contentHashUpdate_fingerprint , contentHashUpdate_primitive , contentHashUpdate_storable , FileContent (..) , DirectoryContent (..) , ExternallyAssuredFile(..) , ExternallyAssuredDirectory(..) , encodeHash , decodeHash , hashToPath , pathToHash , SHA256 , Context , Digest ) where import Control.Exception.Safe (catchJust) import Control.Monad (foldM, (>=>)) import Control.Monad.IO.Class (MonadIO, liftIO) import Crypto.Hash (Context, Digest, SHA256, digestFromByteString, hashFinalize, hashInit, hashUpdate) import qualified Data.Aeson as Aeson import qualified Data.Aeson.Types as Aeson import Data.Bits (shiftL) import Data.ByteArray (Bytes, MemView (MemView), allocAndFreeze, convert) import Data.ByteArray.Encoding (Base (Base16), convertFromBase, convertToBase) import qualified Data.ByteString as BS import Data.ByteString.Builder.Extra (defaultChunkSize) import qualified Data.ByteString.Char8 as C8 import qualified Data.ByteString.Lazy as BSL import Data.CAS.StoreOrphans () import Data.Foldable (foldlM) import Data.Functor.Contravariant import qualified Data.Hashable import qualified Data.HashMap.Lazy as HashMap import qualified Data.HashSet as HashSet import Data.Int import Data.List (sort) import Data.List.NonEmpty (NonEmpty) import Data.Map (Map) import qualified Data.Map as Map import Data.Ratio import Data.Scientific import Data.Store (Store (..), peekException) import qualified Data.Text as T import qualified Data.Text.Array as TA import qualified Data.Text.Encoding as TE import qualified Data.Text.Internal as T import qualified Data.Text.Lazy as TL import Data.Time.Clock (UTCTime) import Data.Time.Clock.POSIX (utcTimeToPOSIXSeconds) import Data.Typeable import qualified Data.Vector as V import Data.Word import Foreign.Marshal.Utils (with) import Foreign.Ptr (castPtr) import Foreign.Storable (Storable, sizeOf) import GHC.Fingerprint import GHC.Generics import GHC.Integer.GMP.Internals (BigNat (..), Integer (..)) import GHC.Natural (Natural (..)) import GHC.Prim (ByteArray#, copyByteArrayToAddr#, sizeofByteArray#) import GHC.Ptr (Ptr (Ptr)) import GHC.Types (IO (IO), Int (I#), Word (W#)) import qualified Path import qualified Path.Internal import qualified Path.IO import System.IO (IOMode (ReadMode), withBinaryFile) import System.IO.Error (isPermissionError) import System.IO.Unsafe (unsafePerformIO) import System.Posix.Files (fileSize, getFileStatus) newtype ContentHash = ContentHash { unContentHash :: Digest SHA256 } deriving (Eq, Ord, Generic) instance Aeson.FromJSON ContentHash where parseJSON (Aeson.String s) | Just h <- decodeHash (TE.encodeUtf8 s) = pure h | otherwise = fail "Invalid hash encoding" parseJSON invalid = Aeson.typeMismatch "ContentHash" invalid instance Aeson.ToJSON ContentHash where toJSON = Aeson.String . TE.decodeUtf8 . encodeHash instance Data.Hashable.Hashable ContentHash where hashWithSalt s = Data.Hashable.hashWithSalt s . encodeHash instance Show ContentHash where showsPrec d h = showParen (d > app_prec) $ showString "ContentHash \"" . (showString $ C8.unpack $ encodeHash h) . showString "\"" where app_prec = 10 instance Store ContentHash where size = contramap toBytes size peek = fromBytes <$> peek >>= \case Nothing -> peekException "Store ContentHash: Illegal digest" Just x -> return x poke = poke . toBytes toBytes :: ContentHash -> BS.ByteString toBytes = convert . unContentHash fromBytes :: BS.ByteString -> Maybe ContentHash fromBytes bs = ContentHash <$> digestFromByteString bs hashEncoding :: Base hashEncoding = Base16 -- | File path appropriate encoding of a hash encodeHash :: ContentHash -> BS.ByteString encodeHash = convertToBase hashEncoding . toBytes -- | Inverse of 'encodeHash' if given a valid input. -- -- prop> decodeHash (encodeHash x) = Just x decodeHash :: BS.ByteString -> Maybe ContentHash decodeHash bs = case convertFromBase hashEncoding bs of Left _ -> Nothing Right x -> fromBytes x -- | File path appropriate encoding of a hash hashToPath :: ContentHash -> Path.Path Path.Rel Path.Dir hashToPath h = case Path.parseRelDir $ C8.unpack $ encodeHash h of Nothing -> error "[ContentHashable.hashToPath] \ \Failed to convert hash to directory name" Just dir -> dir -- | Inverse of 'hashToPath' if given a valid input. -- -- prop> pathToHash (hashToPath x) = Just x pathToHash :: FilePath -> Maybe ContentHash pathToHash = decodeHash . C8.pack class Monad m => ContentHashable m a where -- | Update a hash context based on the given value. -- -- See 'Crypto.Hash.hashUpdate'. -- -- XXX: Consider swapping the arguments. contentHashUpdate :: Context SHA256 -> a -> m (Context SHA256) default contentHashUpdate :: (Generic a, GContentHashable m (Rep a)) => Context SHA256 -> a -> m (Context SHA256) contentHashUpdate ctx a = gContentHashUpdate ctx (from a) -- | Generate hash of the given value. -- -- See 'Crypto.Hash.hash'. contentHash :: a -> m ContentHash contentHash x = ContentHash . hashFinalize <$> contentHashUpdate hashInit x -- | Update hash context based on binary in memory representation due to 'Foreign.Storable.Storable'. -- -- XXX: Do we need to worry about endianness? contentHashUpdate_storable :: (Monad m, Storable a) => Context SHA256 -> a -> m (Context SHA256) contentHashUpdate_storable ctx a = return . unsafePerformIO $ with a (\p -> pure $! hashUpdate ctx (MemView (castPtr p) (sizeOf a))) -- | Update hash context based on a type's 'GHC.Fingerprint.Type.Fingerprint'. -- -- The fingerprint is constructed from the library-name, module-name, and name of the type itself. contentHashUpdate_fingerprint :: (Monad m, Typeable a) => Context SHA256 -> a -> m (Context SHA256) contentHashUpdate_fingerprint ctx = contentHashUpdate ctx . typeRepFingerprint . typeOf -- | Update hash context by combining 'contentHashUpdate_fingerprint' and 'contentHashUpdate_storable'. -- Intended for primitive types like 'Int'. contentHashUpdate_primitive :: (Monad m, Typeable a, Storable a) => Context SHA256 -> a -> m (Context SHA256) contentHashUpdate_primitive ctx a = flip contentHashUpdate_fingerprint a >=> flip contentHashUpdate_storable a $ ctx -- | Update hash context based on binary contents of the given file. contentHashUpdate_binaryFile :: Context SHA256 -> FilePath -> IO (Context SHA256) contentHashUpdate_binaryFile ctx0 fp = withBinaryFile fp ReadMode $ \h -> let go ctx = do chunk <- BS.hGetSome h defaultChunkSize if BS.null chunk then pure ctx else go $! hashUpdate ctx chunk in go ctx0 -- | Update hash context based on 'GHC.Prim.ByteArray#' -- by copying into a newly allocated 'Data.ByteArray.Bytes' -- and updating the hash context from there. -- -- XXX: @'GHC.Prim.byteArrayContents#' :: 'GHC.Prim.ByteArray#' -> 'GHC.Prim.Addr#'@ -- could be used together with 'Data.ByteArray.MemView' instead. -- However, 'GHC.Prim.byteArrayContents#' explicitly says, that it is only safe to use -- on a pinned 'GHC.Prim.ByteArray#'. contentHashUpdate_byteArray# :: ByteArray# -> Int -> Int -> Context SHA256 -> Context SHA256 contentHashUpdate_byteArray# ba (I# off) (I# len) ctx = hashUpdate ctx $ allocAndFreeze @Bytes (I# len) $ \(Ptr addr) -> IO $ \s -> (# copyByteArrayToAddr# ba off addr len s, () #) -- | Update hash context based on the contents of a strict 'Data.Text.Text'. contentHashUpdate_text :: Context SHA256 -> T.Text -> Context SHA256 contentHashUpdate_text ctx (T.Text arr off_ len_) = contentHashUpdate_byteArray# (TA.aBA arr) off len ctx where off = off_ `shiftL` 1 -- convert from 'Word16' to 'Word8' len = len_ `shiftL` 1 -- convert from 'Word16' to 'Word8' instance Monad m => ContentHashable m Fingerprint where contentHashUpdate ctx (Fingerprint a b) = flip contentHashUpdate_storable a >=> flip contentHashUpdate_storable b $ ctx instance Monad m => ContentHashable m Bool where contentHashUpdate = contentHashUpdate_primitive instance Monad m => ContentHashable m Char where contentHashUpdate = contentHashUpdate_primitive instance Monad m => ContentHashable m Int where contentHashUpdate = contentHashUpdate_primitive instance Monad m => ContentHashable m Int8 where contentHashUpdate = contentHashUpdate_primitive instance Monad m => ContentHashable m Int16 where contentHashUpdate = contentHashUpdate_primitive instance Monad m => ContentHashable m Int32 where contentHashUpdate = contentHashUpdate_primitive instance Monad m => ContentHashable m Int64 where contentHashUpdate = contentHashUpdate_primitive instance Monad m => ContentHashable m Word where contentHashUpdate = contentHashUpdate_primitive instance Monad m => ContentHashable m Word8 where contentHashUpdate = contentHashUpdate_primitive instance Monad m => ContentHashable m Word16 where contentHashUpdate = contentHashUpdate_primitive instance Monad m => ContentHashable m Word32 where contentHashUpdate = contentHashUpdate_primitive instance Monad m => ContentHashable m Word64 where contentHashUpdate = contentHashUpdate_primitive instance Monad m => ContentHashable m Float where contentHashUpdate = contentHashUpdate_primitive instance Monad m => ContentHashable m Double where contentHashUpdate = contentHashUpdate_primitive instance (ContentHashable m n, Typeable n) => ContentHashable m (Ratio n) where contentHashUpdate ctx x = flip contentHashUpdate_fingerprint x >=> flip contentHashUpdate (numerator x) >=> flip contentHashUpdate (denominator x) $ ctx instance Monad m => ContentHashable m Scientific where contentHashUpdate ctx x = flip contentHashUpdate_fingerprint x >=> flip contentHashUpdate (toRational x) $ ctx instance Monad m => ContentHashable m Integer where contentHashUpdate ctx n = ($ ctx) $ flip contentHashUpdate_fingerprint n >=> case n of S# i -> pure . flip hashUpdate (C8.pack "S") -- tag constructur >=> flip contentHashUpdate_storable (I# i) -- hash field Jp# (BN# ba) -> pure . flip hashUpdate (C8.pack "L") -- tag constructur >=> pure . contentHashUpdate_byteArray# ba 0 (I# (sizeofByteArray# ba)) -- hash field Jn# (BN# ba) -> pure . flip hashUpdate (C8.pack "N") -- tag constructur >=> pure . contentHashUpdate_byteArray# ba 0 (I# (sizeofByteArray# ba)) -- hash field instance Monad m => ContentHashable m Natural where contentHashUpdate ctx n = ($ ctx) $ flip contentHashUpdate_fingerprint n >=> case n of NatS# w -> pure . flip hashUpdate (C8.pack "S") -- tag constructur >=> flip contentHashUpdate_storable (W# w) -- hash field NatJ# (BN# ba) -> pure . flip hashUpdate (C8.pack "L") -- tag constructur >=> pure . contentHashUpdate_byteArray# ba 0 (I# (sizeofByteArray# ba)) -- hash field instance Monad m => ContentHashable m BS.ByteString where contentHashUpdate ctx s = flip contentHashUpdate_fingerprint s >=> pure . flip hashUpdate s $ ctx instance Monad m => ContentHashable m BSL.ByteString where contentHashUpdate ctx s = flip contentHashUpdate_fingerprint s >=> pure . flip (BSL.foldlChunks hashUpdate) s $ ctx instance Monad m => ContentHashable m T.Text where contentHashUpdate ctx s = flip contentHashUpdate_fingerprint s >=> pure . flip contentHashUpdate_text s $ ctx instance Monad m => ContentHashable m TL.Text where contentHashUpdate ctx s = flip contentHashUpdate_fingerprint s >=> pure . flip (TL.foldlChunks contentHashUpdate_text) s $ ctx instance (Typeable k, Typeable v, ContentHashable m k, ContentHashable m v) => ContentHashable m (Map k v) where contentHashUpdate ctx m = flip contentHashUpdate_fingerprint m >=> flip contentHashUpdate (Map.toList m) $ ctx instance (Typeable k, Typeable v, ContentHashable m k, ContentHashable m v) => ContentHashable m (HashMap.HashMap k v) where contentHashUpdate ctx m = flip contentHashUpdate_fingerprint m -- XXX: The order of the list is unspecified. >=> flip contentHashUpdate (HashMap.toList m) $ ctx instance (Typeable v, ContentHashable m v) => ContentHashable m (HashSet.HashSet v) where contentHashUpdate ctx s = flip contentHashUpdate_fingerprint s -- XXX: The order of the list is unspecified. >=> flip contentHashUpdate (HashSet.toList s) $ ctx instance (Typeable a, ContentHashable m a) => ContentHashable m [a] where contentHashUpdate ctx l = flip contentHashUpdate_fingerprint l >=> flip (foldM contentHashUpdate) l $ ctx instance (Typeable a, ContentHashable m a) => ContentHashable m (NonEmpty a) where contentHashUpdate ctx l = flip contentHashUpdate_fingerprint l >=> flip (foldlM contentHashUpdate) l $ ctx instance (Typeable a, ContentHashable m a) => ContentHashable m (V.Vector a) where contentHashUpdate ctx v = flip contentHashUpdate_fingerprint v >=> flip (V.foldM' contentHashUpdate) v $ ctx instance Monad m => ContentHashable m () instance (ContentHashable m a, ContentHashable m b) => ContentHashable m (a, b) instance (ContentHashable m a, ContentHashable m b, ContentHashable m c) => ContentHashable m (a, b, c) instance (ContentHashable m a, ContentHashable m b, ContentHashable m c, ContentHashable m d) => ContentHashable m (a, b, c, d) instance (ContentHashable m a, ContentHashable m b, ContentHashable m c, ContentHashable m d, ContentHashable m e) => ContentHashable m (a, b, c, d, e) instance (Monad m, ContentHashable m a, ContentHashable m b, ContentHashable m c, ContentHashable m d, ContentHashable m e, ContentHashable m f) => ContentHashable m (a, b, c, d, e, f) instance (Monad m, ContentHashable m a, ContentHashable m b, ContentHashable m c, ContentHashable m d, ContentHashable m e, ContentHashable m f, ContentHashable m g) => ContentHashable m (a, b, c, d, e, f, g) instance ContentHashable m a => ContentHashable m (Maybe a) instance (ContentHashable m a, ContentHashable m b) => ContentHashable m (Either a b) instance Monad m => ContentHashable m Aeson.Value class Monad m => GContentHashable m f where gContentHashUpdate :: Context SHA256 -> f a -> m (Context SHA256) instance Monad m => GContentHashable m V1 where gContentHashUpdate ctx _ = pure ctx instance Monad m => GContentHashable m U1 where gContentHashUpdate ctx U1 = pure ctx instance ContentHashable m c => GContentHashable m (K1 i c) where gContentHashUpdate ctx x = contentHashUpdate ctx (unK1 x) instance (Constructor c, GContentHashable m f) => GContentHashable m (C1 c f) where gContentHashUpdate ctx0 x = gContentHashUpdate nameCtx (unM1 x) where nameCtx = hashUpdate ctx0 $ C8.pack (conName x) instance (Datatype d, GContentHashable m f) => GContentHashable m (D1 d f) where gContentHashUpdate ctx0 x = gContentHashUpdate packageCtx (unM1 x) where datatypeCtx = hashUpdate ctx0 $ C8.pack (datatypeName x) moduleCtx = hashUpdate datatypeCtx $ C8.pack (datatypeName x) packageCtx = hashUpdate moduleCtx $ C8.pack (datatypeName x) instance GContentHashable m f => GContentHashable m (S1 s f) where gContentHashUpdate ctx x = gContentHashUpdate ctx (unM1 x) instance (GContentHashable m a, GContentHashable m b) => GContentHashable m (a :*: b) where gContentHashUpdate ctx (x :*: y) = gContentHashUpdate ctx x >>= flip gContentHashUpdate y instance (GContentHashable m a, GContentHashable m b) => GContentHashable m (a :+: b) where gContentHashUpdate ctx (L1 x) = gContentHashUpdate ctx x gContentHashUpdate ctx (R1 x) = gContentHashUpdate ctx x -- XXX: Do we need this? -- instance GContentHashable (a :.: b) where -- gContentHashUpdate ctx x = _ (unComp1 x) instance (Monad m, Typeable b, Typeable t) => ContentHashable m (Path.Path b t) where contentHashUpdate ctx p@(Path.Internal.Path fp) = flip contentHashUpdate_fingerprint p >=> flip contentHashUpdate fp $ ctx -- | Path to a regular file -- -- Only the file's content and its executable permission is taken into account -- when generating the content hash. The path itself is ignored. newtype FileContent = FileContent (Path.Path Path.Abs Path.File) instance ContentHashable IO FileContent where contentHashUpdate ctx (FileContent fp) = do exec <- Path.IO.executable <$> Path.IO.getPermissions fp ctx' <- if exec then contentHashUpdate ctx () else pure ctx contentHashUpdate_binaryFile ctx' (Path.fromAbsFile fp) -- | Path to a directory -- -- Only the contents of the directory and their path relative to the directory -- are taken into account when generating the content hash. -- The path to the directory is ignored. newtype DirectoryContent = DirectoryContent (Path.Path Path.Abs Path.Dir) instance MonadIO m => ContentHashable m DirectoryContent where contentHashUpdate ctx0 (DirectoryContent dir0) = liftIO $ do (dirs, files) <- Path.IO.listDir dir0 ctx' <- foldM hashFile ctx0 (sort files) foldM hashDir ctx' (sort dirs) where hashFile ctx fp = -- XXX: Do we need to treat symbolic links specially? flip contentHashUpdate (Path.filename fp) >=> flip contentHashUpdate (FileContent fp) $ ctx hashDir ctx dir = flip contentHashUpdate (Path.dirname dir) >=> flip contentHashUpdate (DirectoryContent dir) $ ctx instance Monad m => ContentHashable m UTCTime where contentHashUpdate ctx utcTime = let secondsSinceEpoch = fromEnum . utcTimeToPOSIXSeconds $ utcTime in flip contentHashUpdate_fingerprint utcTime >=> flip contentHashUpdate secondsSinceEpoch $ ctx -- | Path to a file to be treated as _externally assured_. -- -- An externally assured file is handled in a somewhat 'cheating' way by -- funflow. The 'ContentHashable' instance for such assumes that some external -- agent guarantees the integrity of the file being referenced. Thus, rather -- than hashing the file contents, we only consider its (absolute) path, size and -- modification time, which can be rapidly looked up from filesystem metadata. -- -- For a similar approach, see the instance for 'ObjectInBucket' in -- Control.Funflow.AWS.S3, where we exploit the fact that S3 is already -- content hashed to avoid performing any hashing. newtype ExternallyAssuredFile = ExternallyAssuredFile (Path.Path Path.Abs Path.File) deriving (Generic, Show) instance Aeson.FromJSON ExternallyAssuredFile instance Aeson.ToJSON ExternallyAssuredFile instance Store ExternallyAssuredFile instance ContentHashable IO ExternallyAssuredFile where contentHashUpdate ctx (ExternallyAssuredFile fp) = do modTime <- Path.IO.getModificationTime fp fSize <- fileSize <$> getFileStatus (Path.toFilePath fp) flip contentHashUpdate fp >=> flip contentHashUpdate modTime >=> flip contentHashUpdate_storable fSize $ ctx -- | Path to a directory to be treated as _externally assured_. -- -- For an externally assured directory, we _do_ traverse its contents and verify -- those as we would externally assured files, rather than just relying on the -- directory path. Doing this traversal is pretty cheap, and it's quite likely -- for directory contents to be modified without modifying the contents. -- -- If an item in the directory cannot be read due to lacking permissions, -- then it will be ignored and not included in the hash. If the flow does not -- have permissions to access the contents of a subdirectory, then these -- contents cannot influence the outcome of a task and it is okay to exclude -- them from the hash. In that case we only hash the name, as that could -- influence the outcome of a task. newtype ExternallyAssuredDirectory = ExternallyAssuredDirectory (Path.Path Path.Abs Path.Dir) deriving (Generic, Show) instance Aeson.FromJSON ExternallyAssuredDirectory instance Aeson.ToJSON ExternallyAssuredDirectory instance Store ExternallyAssuredDirectory instance ContentHashable IO ExternallyAssuredDirectory where contentHashUpdate ctx0 (ExternallyAssuredDirectory dir0) = do -- Note that we don't bother looking at the relative directory paths and -- including these in the hash. This is because the absolute hash gets -- included every time we hash a file. (dirs, files) <- Path.IO.listDir dir0 ctx' <- foldM hashFile ctx0 (sort files) foldM hashDir ctx' (sort dirs) where hashFile ctx fp = contentHashUpdate ctx (ExternallyAssuredFile fp) `catchPermissionError` \_ -> contentHashUpdate ctx fp hashDir ctx dir = contentHashUpdate ctx (ExternallyAssuredDirectory dir) `catchPermissionError` \_ -> contentHashUpdate ctx dir catchPermissionError = catchJust $ \e -> if isPermissionError e then Just e else Nothing